National Election Study: repeated linear regressions

Source: NES/nes_linear.Rmd

The ROS example fits the same party-identification regression separately in each presidential election year from 1972 through 2000. The coefficients are then plotted as a small multiple to show how a “secret weapon” workflow can summarize many related regressions.

Setup and data

Code

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

root = Path("../../ROS-Examples")
nes_dir = root / "NES"
nes = pd.read_csv(nes_dir / "data/nes.txt", sep=r"\s+", index_col=0, na_values=["NA"])
nes.head()

	year	resid	weight1	weight2	weight3	age	gender	race	educ1	urban	...	parent_party	white	year_new	income_new	age_new	vote.1	age_discrete	race_adj	dvote	rvote
536	1952	1	1.0	1.0	1.0	25	2	1	2	2.0	...	2.0	1	1	1	-2.052455	1.0	1	1.0	0.0	1.0
537	1952	2	1.0	1.0	1.0	33	2	1	1	2.0	...	0.0	1	1	1	-1.252455	1.0	2	1.0	1.0	0.0
538	1952	3	1.0	1.0	1.0	26	2	1	2	2.0	...	-2.0	1	1	0	-1.952455	1.0	1	1.0	0.0	1.0
539	1952	4	1.0	1.0	1.0	63	1	1	2	2.0	...	NaN	1	1	0	1.747545	1.0	3	1.0	0.0	1.0
540	1952	5	1.0	1.0	1.0	66	2	1	2	2.0	...	-2.0	1	1	-2	2.047545	1.0	4	1.0	0.0	1.0

5 rows × 70 columns

Fit the same model year by year

The R formula is

partyid7 ~ real_ideo + race_adj + factor(age_discrete) + educ1 + female + income

In Python, C(age_discrete) gives the same treatment of the age group as a categorical predictor. We use ordinary least squares as the close frequentist analogue to the default Gaussian stan_glm fit in the source example.

Code

formula = "partyid7 ~ real_ideo + race_adj + C(age_discrete) + educ1 + female + income"
years = list(range(1972, 2001, 4))

rows = []
for year in years:
    this_year = nes.loc[nes["year"] == year].copy()
    fit = smf.ols(formula, data=this_year).fit()
    for name, coef in fit.params.items():
        rows.append({"year": year, "term": name, "coef": coef, "se": fit.bse[name], "nobs": fit.nobs})

coef_by_year = pd.DataFrame(rows)
coef_by_year.head()

	year	term	coef	se	nobs
0	1972	Intercept	1.765830	0.384628	1330.0
1	1972	C(age_discrete)[T.2]	-0.188959	0.142290	1330.0
2	1972	C(age_discrete)[T.3]	-0.047406	0.139581	1330.0
3	1972	C(age_discrete)[T.4]	0.512585	0.183598	1330.0
4	1972	real_ideo	0.484637	0.042032	1330.0

The original figure uses readable labels for the nine displayed coefficients. Patsy names the categorical age terms according to the observed coding in age_discrete; the mapping below mirrors the book labels.

Code

term_labels = {
    "Intercept": "Intercept",
    "real_ideo": "Ideology",
    "race_adj": "Black",
    "C(age_discrete)[T.2]": "Age_30_44",
    "C(age_discrete)[T.3]": "Age_45_64",
    "C(age_discrete)[T.4]": "Age_65_up",
    "educ1": "Education",
    "female": "Female",
    "income": "Income",
}
plot_df = coef_by_year.loc[coef_by_year["term"].isin(term_labels)].copy()
plot_df["label"] = plot_df["term"].map(term_labels)
plot_df["lo"] = plot_df["coef"] - 0.67 * plot_df["se"]
plot_df["hi"] = plot_df["coef"] + 0.67 * plot_df["se"]
plot_df.sort_values(["label", "year"]).head()

	year	term	coef	se	nobs	label	lo	hi
1	1972	C(age_discrete)[T.2]	-0.188959	0.142290	1330.0	Age_30_44	-0.284293	-0.093624
10	1976	C(age_discrete)[T.2]	-0.037448	0.146108	1184.0	Age_30_44	-0.135341	0.060444
19	1980	C(age_discrete)[T.2]	-0.146259	0.194886	701.0	Age_30_44	-0.276833	-0.015685
28	1984	C(age_discrete)[T.2]	-0.230557	0.146184	1226.0	Age_30_44	-0.328500	-0.132614
37	1988	C(age_discrete)[T.2]	-0.309956	0.154986	1113.0	Age_30_44	-0.413797	-0.206115

Coefficient plot

The bars are plus/minus 0.67 standard errors, matching the R page. This interval corresponds roughly to a 50% uncertainty interval under a normal approximation, which makes it visually less cluttered than 95% intervals in a small multiple plot.

Code

labels = list(term_labels.values())
fig, axes = plt.subplots(2, 5, figsize=(11, 5), sharex=False)
axes = axes.ravel()

for ax, label in zip(axes, labels):
    d = plot_df.loc[plot_df["label"] == label].sort_values("year")
    ax.axhline(0, color="0.5", ls="--", lw=0.8)
    ax.errorbar(d["year"], d["coef"], yerr=0.67 * d["se"], fmt="o", color="black", ms=4, lw=0.8)
    ax.set_title(label, fontsize=10)
    ax.set_xticks([1972, 1986, 2000])
    ax.set_ylabel("Coefficient")
    ax.spines[["top", "right"]].set_visible(False)

axes[-1].axis("off")
fig.tight_layout()

Bayesian version with CmdStanPy

A direct CmdStanPy port is useful if exact posterior uncertainty is desired. The model is just Gaussian regression with a design matrix assembled by Patsy. Run it inside the loop above to replace fit.params and fit.bse with posterior medians and posterior standard deviations.

Code

import patsy
from cmdstanpy import CmdStanModel

stan_code = """
data {
  int<lower=1> N;
  int<lower=1> K;
  matrix[N, K] X;
  vector[N] y;
}
parameters {
  vector[K] beta;
  real<lower=0> sigma;
}
model {
  beta ~ normal(0, 10);
  sigma ~ exponential(1);
  y ~ normal(X * beta, sigma);
}
"""
stan_file = Path("_generated/nes_linear_regression.stan")
stan_file.parent.mkdir(exist_ok=True)
stan_file.write_text(stan_code)

sample_year = nes.loc[nes["year"] == 1972].copy()
y_mat, x_mat = patsy.dmatrices(formula, sample_year, return_type="dataframe")
model = CmdStanModel(stan_file=str(stan_file))
fit = model.sample(
    data={"N": x_mat.shape[0], "K": x_mat.shape[1], "X": x_mat.to_numpy(), "y": y_mat.iloc[:, 0].to_numpy()},
    seed=1972,
    chains=4,
    parallel_chains=4,
    show_progress=False,
)
posterior = fit.draws_pd()
posterior.filter(regex=r"^beta\[").describe(percentiles=[0.05, 0.5, 0.95])

	beta[1]	beta[2]	beta[3]	beta[4]	beta[5]	beta[6]	beta[7]	beta[8]	beta[9]
count	4000.000000	4000.000000	4000.000000	4000.000000	4000.000000	4000.000000	4000.000000	4000.000000	4000.000000
mean	1.757455	-0.187871	-0.047306	0.513352	0.485030	-1.102689	0.297016	-0.006744	0.162076
std	0.391104	0.140732	0.139833	0.184137	0.041567	0.195593	0.060152	0.106054	0.052139
min	0.276536	-0.692167	-0.529305	-0.152274	0.335766	-1.823050	0.080385	-0.399731	-0.024701
5%	1.120984	-0.419611	-0.275445	0.207022	0.417893	-1.423189	0.197589	-0.182870	0.075433
50%	1.756540	-0.188043	-0.045624	0.512809	0.484997	-1.101705	0.297524	-0.006416	0.162724
95%	2.397829	0.043857	0.178194	0.815141	0.552803	-0.782393	0.395618	0.165432	0.247773
max	3.240460	0.342462	0.382055	1.145120	0.636693	-0.317510	0.488833	0.364393	0.337768

# National Election Study: repeated linear regressions Source: `NES/nes_linear.Rmd` The ROS example fits the same party-identification regression separately in each presidential election year from 1972 through 2000. The coefficients are then plotted as a small multiple to show how a "secret weapon" workflow can summarize many related regressions. ## Setup and data ```{python} from pathlib import Path import numpy as np import pandas as pd import matplotlib.pyplot as plt import statsmodels.formula.api as smf root = Path("../../ROS-Examples") nes_dir = root / "NES" nes = pd.read_csv(nes_dir / "data/nes.txt", sep=r"\s+", index_col=0, na_values=["NA"]) nes.head() ``` ## Fit the same model year by year The R formula is ```text partyid7 ~ real_ideo + race_adj + factor(age_discrete) + educ1 + female + income ``` In Python, `C(age_discrete)` gives the same treatment of the age group as a categorical predictor. We use ordinary least squares as the close frequentist analogue to the default Gaussian `stan_glm` fit in the source example. ```{python} formula = "partyid7 ~ real_ideo + race_adj + C(age_discrete) + educ1 + female + income" years = list(range(1972, 2001, 4)) rows = [] for year in years: this_year = nes.loc[nes["year"] == year].copy() fit = smf.ols(formula, data=this_year).fit() for name, coef in fit.params.items(): rows.append({"year": year, "term": name, "coef": coef, "se": fit.bse[name], "nobs": fit.nobs}) coef_by_year = pd.DataFrame(rows) coef_by_year.head() ``` The original figure uses readable labels for the nine displayed coefficients. Patsy names the categorical age terms according to the observed coding in `age_discrete`; the mapping below mirrors the book labels. ```{python} term_labels = { "Intercept": "Intercept", "real_ideo": "Ideology", "race_adj": "Black", "C(age_discrete)[T.2]": "Age_30_44", "C(age_discrete)[T.3]": "Age_45_64", "C(age_discrete)[T.4]": "Age_65_up", "educ1": "Education", "female": "Female", "income": "Income", } plot_df = coef_by_year.loc[coef_by_year["term"].isin(term_labels)].copy() plot_df["label"] = plot_df["term"].map(term_labels) plot_df["lo"] = plot_df["coef"] - 0.67 * plot_df["se"] plot_df["hi"] = plot_df["coef"] + 0.67 * plot_df["se"] plot_df.sort_values(["label", "year"]).head() ``` ## Coefficient plot The bars are plus/minus `0.67` standard errors, matching the R page. This interval corresponds roughly to a 50% uncertainty interval under a normal approximation, which makes it visually less cluttered than 95% intervals in a small multiple plot. ```{python} labels = list(term_labels.values()) fig, axes = plt.subplots(2, 5, figsize=(11, 5), sharex=False) axes = axes.ravel() for ax, label in zip(axes, labels): d = plot_df.loc[plot_df["label"] == label].sort_values("year") ax.axhline(0, color="0.5", ls="--", lw=0.8) ax.errorbar(d["year"], d["coef"], yerr=0.67 * d["se"], fmt="o", color="black", ms=4, lw=0.8) ax.set_title(label, fontsize=10) ax.set_xticks([1972, 1986, 2000]) ax.set_ylabel("Coefficient") ax.spines[["top", "right"]].set_visible(False) axes[-1].axis("off") fig.tight_layout() ``` ## Bayesian version with CmdStanPy A direct CmdStanPy port is useful if exact posterior uncertainty is desired. The model is just Gaussian regression with a design matrix assembled by Patsy. Run it inside the loop above to replace `fit.params` and `fit.bse` with posterior medians and posterior standard deviations. ```{python} import patsy from cmdstanpy import CmdStanModel stan_code = """ data { int<lower=1> N; int<lower=1> K; matrix[N, K] X; vector[N] y; } parameters { vector[K] beta; real<lower=0> sigma; } model { beta ~ normal(0, 10); sigma ~ exponential(1); y ~ normal(X * beta, sigma); } """ stan_file = Path("_generated/nes_linear_regression.stan") stan_file.parent.mkdir(exist_ok=True) stan_file.write_text(stan_code) sample_year = nes.loc[nes["year"] == 1972].copy() y_mat, x_mat = patsy.dmatrices(formula, sample_year, return_type="dataframe") model = CmdStanModel(stan_file=str(stan_file)) fit = model.sample( data={"N": x_mat.shape[0], "K": x_mat.shape[1], "X": x_mat.to_numpy(), "y": y_mat.iloc[:, 0].to_numpy()}, seed=1972, chains=4, parallel_chains=4, show_progress=False, ) posterior = fit.draws_pd() posterior.filter(regex=r"^beta\[").describe(percentiles=[0.05, 0.5, 0.95]) ```