Pearson and Lee heights: regression to the mean

Source: PearsonLee/heights.Rmd

Karl Pearson and Alice Lee’s family-height data are used in Chapter 6 to illustrate a basic linear regression: adult daughters’ heights are predicted from their mothers’ heights. This Python port keeps the original workflow: load the tabulated heights, fit the line, and draw the sequence of explanatory plots.

Setup and data

Code
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

root = Path("/Users/alal/tmp/ros-python-book/ROS-Examples")
rng = np.random.default_rng(1903)

heights = pd.read_csv(root / "PearsonLee/data/Heights.txt", sep=r"\s+")
heights.head()
daughter_height mother_height
0 52.5 59.5
1 52.5 59.5
2 53.5 59.5
3 53.5 59.5
4 55.5 59.5
Code
heights.describe().round(2)
daughter_height mother_height
count 5524.00 5524.00
mean 63.86 62.50
std 2.62 2.41
min 52.50 52.50
25% 62.50 60.50
50% 63.50 62.50
75% 65.50 64.50
max 73.50 70.50

Linear regression

The R page uses stan_glm(daughter_height ~ mother_height). The core fitted line is the least-squares line; with weak default priors the posterior center from stan_glm is very close to this estimate.

Code
fit = smf.ols("daughter_height ~ mother_height", data=heights).fit()
a_hat = fit.params["Intercept"]
b_hat = fit.params["mother_height"]

pd.DataFrame({
    "coef": fit.params,
    "se": fit.bse,
    "t": fit.tvalues,
}).round(3)
coef se t
Intercept 29.798 0.790 37.703
mother_height 0.545 0.013 43.125
Code
pd.Series({
    "mean mother height": heights["mother_height"].mean(),
    "mean daughter height": heights["daughter_height"].mean(),
    "correlation": heights[["mother_height", "daughter_height"]].corr().iloc[0, 1],
    "slope": b_hat,
    "intercept": a_hat,
    "residual sd": np.sqrt(fit.scale),
}).round(3)
mean mother height      62.499
mean daughter height    63.856
correlation              0.502
slope                    0.545
intercept               29.798
residual sd              2.262
dtype: float64

Scatterplot and fitted line

The raw values are heavily rounded, so a jittered display shows the density of repeated mother-daughter height pairs.

Code
def add_height_grid(ax, ticks=np.arange(48, 85, 6)):
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    for tick in ticks:
        ax.axhline(tick, color="0.85", lw=0.8, ls="--", zorder=0)
        ax.axvline(tick, color="0.85", lw=0.8, ls="--", zorder=0)
    ax.spines[["top", "right"]].set_visible(False)

mother = heights["mother_height"].to_numpy()
daughter = heights["daughter_height"].to_numpy()
rng_heights = (min(mother.min(), daughter.min()), max(mother.max(), daughter.max()))

fig, ax = plt.subplots(figsize=(4.5, 4.5))
ax.scatter(mother, daughter, s=8, color="black", alpha=0.45)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=rng_heights, ylim=rng_heights)
add_height_grid(ax)

Code
mother_jit = mother + rng.uniform(-0.5, 0.5, size=len(mother))
daughter_jit = daughter + rng.uniform(-0.5, 0.5, size=len(daughter))
xs = np.linspace(rng_heights[0], rng_heights[1], 200)

fig, ax = plt.subplots(figsize=(4.5, 4.5))
ax.scatter(mother_jit, daughter_jit, s=5, color="black", alpha=0.25)
ax.plot(xs, a_hat + b_hat * xs, color="black", lw=1.5)
ax.scatter([mother.mean()], [daughter.mean()], s=45, color="white", edgecolor="black", zorder=3)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=rng_heights, ylim=rng_heights)
add_height_grid(ax)

The line through the data average

A least-squares line with an intercept always passes through (mean(x), mean(y)). This is the centered version of the same equation.

Code
centered_equation = f"daughter = {daughter.mean():.1f} + {b_hat:.2f} * (mother - {mother.mean():.1f})"
standard_equation = f"daughter = {a_hat:.1f} + {b_hat:.2f} * mother"

pd.Series({
    "standard equation": standard_equation,
    "centered equation": centered_equation,
})
standard equation             daughter = 29.8 + 0.54 * mother
centered equation    daughter = 63.9 + 0.54 * (mother - 62.5)
dtype: str
Code
fig, ax = plt.subplots(figsize=(4.8, 4.2))
ax.plot(xs, a_hat + b_hat * xs, color="black", lw=1.5)
ax.axvline(mother.mean(), ymax=(daughter.mean() / rng_heights[1]), color="0.4", lw=0.8)
ax.axhline(daughter.mean(), xmax=(mother.mean() / rng_heights[1]), color="0.4", lw=0.8)
ax.scatter([mother.mean()], [daughter.mean()], s=40, color="black")
ax.text(61, 62, centered_equation, fontsize=9)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=(54, 72), ylim=(54, 72))
ax.spines[["top", "right"]].set_visible(False)

The slope is well below one: very tall mothers have daughters who are predicted to be tall, but closer to the population average. This is the regression-to-the-mean pattern highlighted by the original example.