Pearson and Lee heights: the fitted line in context

Source: PearsonLee/PearsonLee.Rmd

This companion page uses the same Pearson-Lee height data but emphasizes how to read the fitted line: first by itself, then in the range where the data actually live. The original R script makes a sequence of simple base-R graphics; this port reproduces the same statistical objects and matplotlib displays.

Setup and fit

Code
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

root = Path("/Users/alal/tmp/ros-python-book/ROS-Examples")
rng = np.random.default_rng(1903)

heights = pd.read_csv(root / "PearsonLee/data/Heights.txt", sep=r"\s+")
mother_height = heights["mother_height"].to_numpy()
daughter_height = heights["daughter_height"].to_numpy()
n = len(heights)

fit = smf.ols("daughter_height ~ mother_height", data=heights).fit()
ab_hat = fit.params
ab_hat.round(3)
Intercept        29.798
mother_height     0.545
dtype: float64
Code
pd.Series({
    "n": n,
    "mother mean": mother_height.mean(),
    "daughter mean": daughter_height.mean(),
    "mother sd": mother_height.std(ddof=1),
    "daughter sd": daughter_height.std(ddof=1),
    "r": np.corrcoef(mother_height, daughter_height)[0, 1],
    "R-squared": fit.rsquared,
}).round(3)
n                5524.000
mother mean        62.499
daughter mean      63.856
mother sd           2.409
daughter sd         2.615
r                   0.502
R-squared           0.252
dtype: float64

Raw and jittered data

Code
def clean_axes(ax):
    ax.spines[["top", "right"]].set_visible(False)

def height_grid(ax, ticks=np.arange(48, 85, 6)):
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    for tick in ticks:
        ax.axhline(tick, color="0.86", lw=0.8, ls="--", zorder=0)
        ax.axvline(tick, color="0.86", lw=0.8, ls="--", zorder=0)
    clean_axes(ax)

height_range = (min(mother_height.min(), daughter_height.min()), max(mother_height.max(), daughter_height.max()))

fig, ax = plt.subplots(figsize=(4.5, 4.5))
ax.scatter(mother_height, daughter_height, s=8, color="black", alpha=0.45)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=height_range, ylim=height_range)
height_grid(ax)

Code
mother_jitt = mother_height + rng.uniform(-0.5, 0.5, size=n)
daughter_jitt = daughter_height + rng.uniform(-0.5, 0.5, size=n)

fig, ax = plt.subplots(figsize=(4.5, 4.5))
ax.scatter(mother_jitt, daughter_jitt, s=5, color="black", alpha=0.25)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=height_range, ylim=height_range)
height_grid(ax)

Data, average point, and regression line

Code
a_hat = ab_hat["Intercept"]
b_hat = ab_hat["mother_height"]
xs = np.linspace(height_range[0], height_range[1], 200)

fig, ax = plt.subplots(figsize=(4.5, 4.5))
ax.scatter(mother_jitt, daughter_jitt, s=5, color="black", alpha=0.22)
ax.plot(xs, a_hat + b_hat * xs, color="white", lw=3)
ax.plot(xs, a_hat + b_hat * xs, color="black", lw=1.5)
ax.scatter([mother_height.mean()], [daughter_height.mean()], s=45, color="white", edgecolor="black", zorder=3)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=height_range, ylim=height_range)
height_grid(ax)

Code
fig, ax = plt.subplots(figsize=(4.7, 4.2))
ax.plot(xs, a_hat + b_hat * xs, color="black", lw=1.5)
ax.axvline(mother_height.mean(), color="0.4", lw=0.8)
ax.axhline(daughter_height.mean(), color="0.4", lw=0.8)
ax.scatter([mother_height.mean()], [daughter_height.mean()], s=40, color="black")
ax.text(66.5, 64.5, f"y = {a_hat:.0f} + {b_hat:.2f}x", fontsize=9)
ax.text(58.5, 62.1, f"equivalently: y = {daughter_height.mean():.1f} + {b_hat:.2f}(x - {mother_height.mean():.1f})", fontsize=9)
ax.set(xlabel="Mother's height (inches)", ylabel="Adult daughter's height (inches)", xlim=(54, 72), ylim=(54, 72))
clean_axes(ax)

The line alone can be misleading

The source page draws the fitted line on a 0--100 inch canvas and then redraws it with the data. The intercept is mathematically part of the line, but it is far outside the empirical range of mother heights, so it should not be given a literal interpretation.

Code
fig, axes = plt.subplots(1, 2, figsize=(9, 4), sharex=True, sharey=True)
for ax in axes:
    ax.set(xlim=(0, 100), ylim=(0, 100), xlabel="mother height", ylabel="daughter height")
    ax.axhline(0, color="black", lw=0.8)
    ax.axvline(0, color="black", lw=0.8)
    ax.plot([0, 100], [a_hat, a_hat + b_hat * 100], color="black", lw=1.5)
    ax.text(36, 42, f"slope {b_hat:.2f}")
    clean_axes(ax)

axes[0].set_title(f"The line: y = {a_hat:.0f} + {b_hat:.2f}x")
axes[1].scatter(mother_jitt, daughter_jitt, s=5, color="black", alpha=0.25)
axes[1].axvline(mother_height.mean(), ymax=daughter_height.mean() / 100, color="0.4", lw=0.8)
axes[1].axhline(daughter_height.mean(), xmax=mother_height.mean() / 100, color="0.4", lw=0.8)
axes[1].set_title("The same line in the context of the data")
Text(0.5, 1.0, 'The same line in the context of the data')

The fitted slope is a relationship within the observed height range. Extrapolating the line down to zero inches only demonstrates the algebraic intercept; it is not a plausible biological prediction.