CrossValidation / crossvalidation

Source: CrossValidation/crossvalidation.Rmd

This port keeps the structure of the ROS example: first a tiny simulation showing how leaving out an influential point changes prediction at that point, then the KidIQ comparison where in-sample fit is contrasted with leave-one-out predictive performance. statsmodels gives transparent linear-model calculations; ArviZ/CmdStanPy code is included for the Stan-backed PSIS-LOO workflow used by the R loo package.

Setup

Code
from pathlib import Path
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.stats import norm

sys.path.append(str(Path("..").resolve() / "python"))
from data import ros_path

SEED = 1507
rng = np.random.default_rng(SEED)

Simulated data

Code
x = np.arange(1, 21)
n = len(x)
a = 0.2
b = 0.3
sigma = 1.0
rng_fake = np.random.default_rng(2141)
y = a + b * x + rng_fake.normal(0, sigma, size=n)
fake = pd.DataFrame({"x": x, "y": y})

fit_all = smf.ols("y ~ x", data=fake).fit()
fit_minus_18 = smf.ols("y ~ x", data=fake.drop(index=17)).fit()
fit_all.params, fit_minus_18.params
(Intercept    0.302614
 x            0.299985
 dtype: float64,
 Intercept    0.319898
 x            0.297136
 dtype: float64)

Posterior-predictive-style density at the left-out point

For a Gaussian least-squares model, a quick analogue to the posterior predictive distribution uses the fitted mean and residual standard deviation. The R version averages over draws from stan_glm; here the deterministic curve is enough to show how the fit changes when observation 18 is omitted.

Code
y_grid = np.linspace(0, 9, 200)
mu_all_18 = fit_all.predict(pd.DataFrame({"x": [18]}))[0]
mu_loo_18 = fit_minus_18.predict(pd.DataFrame({"x": [18]}))[0]
sd_all = np.sqrt(fit_all.scale)
sd_loo = np.sqrt(fit_minus_18.scale)
post_curve = pd.DataFrame({
    "y": y_grid,
    "all_data_density": norm.pdf(y_grid, mu_all_18, sd_all),
    "loo_density": norm.pdf(y_grid, mu_loo_18, sd_loo),
})
post_curve.head()
y all_data_density loo_density
0 0.000000 2.908607e-07 6.989600e-07
1 0.045226 3.632040e-07 8.614981e-07
2 0.090452 4.527393e-07 1.060056e-06
3 0.135678 5.633491e-07 1.302195e-06
4 0.180905 6.997435e-07 1.596967e-06
Code
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(fake["x"], fake["y"], color="black", zorder=3)
ax.scatter(fake.loc[17, "x"], fake.loc[17, "y"], facecolors="none", edgecolors="0.4", s=120, zorder=4)
x_grid = np.linspace(fake["x"].min(), fake["x"].max(), 100)
ax.plot(x_grid, fit_all.params["Intercept"] + fit_all.params["x"] * x_grid, color="black", label="fit to all data")
ax.plot(x_grid, fit_minus_18.params["Intercept"] + fit_minus_18.params["x"] * x_grid, color="0.45", ls="--", label="fit without x=18")
# Scale and rotate the predictive density curves, as in the R graphic.
scale = 6.0
ax.plot(18 + scale * post_curve["all_data_density"], post_curve["y"], color="black")
ax.plot(18 + scale * post_curve["loo_density"], post_curve["y"], color="0.45", ls="--")
ax.axvline(18, color="0.75", ls=":")
ax.set(xlabel="x", ylabel="y")
ax.legend(frameon=False)
ax.spines[["top", "right"]].set_visible(False)
fig.tight_layout()

Ordinary residuals and exact leave-one-out residuals

For linear regression, exact leave-one-out fitted values can be computed from the hat diagonal without refitting the model n times:

[ y_{i,-i} = y_i - . ]

Code
influence = fit_all.get_influence()
fake["residual"] = fit_all.resid
fake["hat"] = influence.hat_matrix_diag
fake["looresidual"] = fake["residual"] / (1 - fake["hat"])
fake["loo_fitted"] = fake["y"] - fake["looresidual"]

round(fake["residual"].std(ddof=1), 2), round(fake["looresidual"].std(ddof=1), 2)
(np.float64(1.05), np.float64(1.16))
Code
fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(fake["x"], fake["residual"], color="black", label="ordinary residual")
ax.scatter(fake["x"], fake["looresidual"], facecolors="none", edgecolors="0.45", label="LOO residual")
for row in fake.itertuples(index=False):
    ax.plot([row.x, row.x], [row.residual, row.looresidual], color="0.6", lw=0.8)
ax.axhline(0, color="black", ls="--", lw=1)
ax.set(xlabel="x", ylabel="residual")
ax.legend(frameon=False)
ax.spines[["top", "right"]].set_visible(False)
fig.tight_layout()

Code
r2_in_sample = 1 - fake["residual"].var(ddof=1) / fake["y"].var(ddof=1)
r2_loo = 1 - fake["looresidual"].var(ddof=1) / fake["y"].var(ddof=1)
round(r2_in_sample, 2), round(r2_loo, 2)
(np.float64(0.74), np.float64(0.68))

Log predictive densities

Code
fake["lpd_post"] = norm.logpdf(fake["y"], loc=fit_all.fittedvalues, scale=sd_all)
fake["lpd_loo_exact"] = norm.logpdf(fake["y"], loc=fake["loo_fitted"], scale=sd_loo)
fake[["x", "lpd_post", "lpd_loo_exact"]].head()
x lpd_post lpd_loo_exact
0 1 -0.992243 -1.019761
1 2 -1.684030 -1.945140
2 3 -2.227466 -2.581468
3 4 -1.064552 -1.106758
4 5 -1.088024 -1.130454
Code
fig, ax = plt.subplots(figsize=(7, 4))
ax.scatter(fake["x"], fake["lpd_post"], color="black", label="in-sample")
ax.scatter(fake["x"], fake["lpd_loo_exact"], facecolors="none", edgecolors="0.45", label="LOO")
for row in fake.itertuples(index=False):
    ax.plot([row.x, row.x], [row.lpd_post, row.lpd_loo_exact], color="0.6", lw=0.8)
ax.set(xlabel="x", ylabel="log predictive density")
ax.legend(frameon=False)
ax.spines[["top", "right"]].set_visible(False)
fig.tight_layout()

KidIQ example

Code
kidiq = pd.read_csv(ros_path("KidIQ", "data", "kidiq.csv"))
fit_1 = smf.ols("kid_score ~ mom_hs", data=kidiq).fit()
fit_3 = smf.ols("kid_score ~ mom_hs + mom_iq", data=kidiq).fit()

rng = np.random.default_rng(SEED)
kidiqr = kidiq.copy()
for j in range(1, 6):
    kidiqr[f"noise{j}"] = rng.normal(size=len(kidiqr))
noise_terms = " + ".join([f"noise{j}" for j in range(1, 6)])
fit_3n = smf.ols(f"kid_score ~ mom_hs + mom_iq + {noise_terms}", data=kidiqr).fit()
fit_4 = smf.ols("kid_score ~ mom_hs + mom_iq + mom_hs:mom_iq", data=kidiq).fit()
Code
def loo_residuals_ols(fit):
    h = fit.get_influence().hat_matrix_diag
    return fit.resid / (1 - h)

def loo_r2_ols(fit, y):
    return 1 - np.var(loo_residuals_ols(fit), ddof=1) / np.var(y, ddof=1)

summary = pd.DataFrame({
    "model": ["mom_hs", "mom_hs + mom_iq", "+ five noise predictors", "+ interaction"],
    "in_sample_R2": [fit_1.rsquared, fit_3.rsquared, fit_3n.rsquared, fit_4.rsquared],
    "loo_R2_exact_OLS": [
        loo_r2_ols(fit_1, kidiq["kid_score"]),
        loo_r2_ols(fit_3, kidiq["kid_score"]),
        loo_r2_ols(fit_3n, kidiqr["kid_score"]),
        loo_r2_ols(fit_4, kidiq["kid_score"]),
    ],
})
summary.round(3)
model in_sample_R2 loo_R2_exact_OLS
0 mom_hs 0.056 0.046
1 mom_hs + mom_iq 0.214 0.203
2 + five noise predictors 0.223 0.193
3 + interaction 0.230 0.216

The noise predictors can only improve or leave unchanged the in-sample least-squares R2. LOO-R2 penalizes their lack of out-of-sample value through the leverage-adjusted residuals.

CmdStanPy and ArviZ PSIS-LOO version

The following chunk mirrors the R stan_glm + loo workflow. It is useful when the model is Bayesian and we want pointwise log likelihoods and Pareto-k diagnostics rather than the exact OLS shortcut above.

Code
from cmdstanpy import CmdStanModel
import arviz as az
import patsy

stan_dir = Path("_generated_stan")
stan_dir.mkdir(exist_ok=True)
stan_file = stan_dir / "crossvalidation_gaussian_lm.stan"
stan_file.write_text(r'''
data {
  int<lower=1> N;
  int<lower=1> K;
  matrix[N, K] X;
  vector[N] y;
}
parameters {
  vector[K] beta;
  real<lower=0> sigma;
}
model {
  beta ~ normal(0, 20);
  sigma ~ exponential(1.0 / 30);
  y ~ normal(X * beta, sigma);
}
generated quantities {
  vector[N] log_lik;
  for (n in 1:N) log_lik[n] = normal_lpdf(y[n] | dot_product(row(X, n), beta), sigma);
}
''')

# Uncomment to run the Bayesian comparison.
# lm_model = CmdStanModel(stan_file=str(stan_file))
# y3, X3 = patsy.dmatrices("kid_score ~ mom_hs + mom_iq", kidiq, return_type="dataframe")
# fit3_stan = lm_model.sample(
#     data={"N": len(kidiq), "K": X3.shape[1], "X": X3.to_numpy(), "y": np.asarray(y3).ravel()},
#     seed=SEED, chains=4, parallel_chains=4, show_progress=False,
# )
# idata3 = az.from_cmdstanpy(posterior=fit3_stan, log_likelihood="log_lik")
# az.loo(idata3)
366