National Election Study: logistic regression, prediction, and separation

Source: NES/nes_logistic.Rmd

This page ports the ROS logistic-regression example. It uses the 1992 National Election Study for a simple vote-on-income model, then repeats logistic regressions across years to show changing income effects and the instability created by separation.

Setup and data

Code

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.special import expit
from python.bayes_glm import bayes_logit

root = Path("../../ROS-Examples")
nes_dir = root / "NES"
nes = pd.read_csv(nes_dir / "data/nes.txt", sep=r"\s+", index_col=0, na_values=["NA"])
nes.head()

	year	resid	weight1	weight2	weight3	age	gender	race	educ1	urban	...	parent_party	white	year_new	income_new	age_new	vote.1	age_discrete	race_adj	dvote	rvote
536	1952	1	1.0	1.0	1.0	25	2	1	2	2.0	...	2.0	1	1	1	-2.052455	1.0	1	1.0	0.0	1.0
537	1952	2	1.0	1.0	1.0	33	2	1	1	2.0	...	0.0	1	1	1	-1.252455	1.0	2	1.0	1.0	0.0
538	1952	3	1.0	1.0	1.0	26	2	1	2	2.0	...	-2.0	1	1	0	-1.952455	1.0	1	1.0	0.0	1.0
539	1952	4	1.0	1.0	1.0	63	1	1	2	2.0	...	NaN	1	1	0	1.747545	1.0	3	1.0	0.0	1.0
540	1952	5	1.0	1.0	1.0	66	2	1	2	2.0	...	-2.0	1	1	-2	2.047545	1.0	4	1.0	0.0	1.0

5 rows × 70 columns

The source R page first keeps 1992 respondents with a valid Democratic or Republican presidential vote.

Code

ok92 = (nes["year"] == 1992) & nes["rvote"].notna() & nes["dvote"].notna() & ((nes["rvote"] == 1) | (nes["dvote"] == 1))
nes92 = nes.loc[ok92].copy()
nes92[["year", "income", "rvote", "dvote", "presvote"]].head()

	year	income	rvote	dvote	presvote
32093	1992	4	1.0	0.0	2.0
32094	1992	2	1.0	0.0	2.0
32096	1992	1	0.0	1.0	1.0
32097	1992	2	1.0	0.0	2.0
32098	1992	3	0.0	1.0	1.0

A single-predictor logistic regression

The R model is stan_glm(rvote ~ income, family=binomial(link="logit")). For the main Bayesian port, fit the same Bernoulli-logit model with weak Normal priors using the shared lightweight helper.

Code

fit_1 = bayes_logit("rvote ~ income", data=nes92, draws=4000, prior_scale=2.5, seed=1992)
fit_1.summary()

	mean	sd	q5	q50	q95
Intercept	-1.394995	0.190050	-1.707889	-1.394071	-1.087977
income	0.323508	0.057151	0.228490	0.323695	0.419032

Code

new = pd.DataFrame({"income": [5]})
eta_draw = fit_1.beta_draws @ np.array([1.0, 5.0])
p_draw = expit(eta_draw)

pd.DataFrame({
    "linear_predictor_mean": [eta_draw.mean()],
    "linear_predictor_sd": [eta_draw.std(ddof=1)],
    "expected_probability_mean": [p_draw.mean()],
    "expected_probability_sd": [p_draw.std(ddof=1)],
})

	linear_predictor_mean	linear_predictor_sd	expected_probability_mean	expected_probability_sd
0	0.222546	0.121334	0.55521	0.029868

The distinction among linear predictor, expected outcome, and posterior predictive draw is clearest when using the posterior coefficient draws directly. This mirrors the operations performed by posterior_linpred, posterior_epred, and posterior_predict in the R example.

Code

rng = np.random.default_rng(1992)
draws = fit_1.beta_draws
X_new = np.column_stack([np.ones(5), np.arange(1, 6)])
linpred = draws @ X_new.T
epred = expit(linpred)
postpred = rng.binomial(1, epred)

summary_pred = pd.DataFrame(
    {
        "income": np.arange(1, 6),
        "p_mean": epred.mean(axis=0),
        "p_sd": epred.std(axis=0),
        "y_pred_mean": postpred.mean(axis=0),
    }
)
summary_pred

	income	p_mean	p_sd	y_pred_mean
0	1	0.255989	0.026000	0.24925
1	2	0.321571	0.019427	0.31625
2	3	0.395539	0.014497	0.39425
3	4	0.474819	0.019180	0.47800
4	5	0.555210	0.029864	0.57225

Code

prob_rich_gt_next = np.mean(epred[:, 4] > epred[:, 3])
diff_interval = np.quantile(epred[:, 4] - epred[:, 3], [0.025, 0.975])
prob_rich_gt_next, diff_interval

(np.float64(1.0), array([0.05203042, 0.10731924]))

Jittered data and fitted curve

Code

rng = np.random.default_rng(2141)
n = len(nes92)
income_jitt = nes92["income"].to_numpy() + rng.uniform(-0.2, 0.2, size=n)
vote_jitt = nes92["rvote"].to_numpy() + np.where(nes92["rvote"].to_numpy() == 0, rng.uniform(0.005, 0.05, size=n), rng.uniform(-0.05, -0.005, size=n))

x_grid = np.linspace(0.5, 5.5, 200)
X_grid = pd.DataFrame({"income": x_grid})

fig, ax = plt.subplots(figsize=(5.2, 3.8))
for draw in draws[rng.choice(len(draws), 25, replace=False)]:
    ax.plot(x_grid, expit(draw[0] + draw[1] * x_grid), color="0.65", lw=0.6, alpha=0.7)
ax.plot(x_grid, fit_1.epred(X_grid).mean(axis=0), color="black", lw=1.5)
ax.scatter(income_jitt, vote_jitt, s=4, color="black", alpha=0.25)
ax.set_xticks(range(1, 6))
ax.text(1, -0.13, "(poor)", ha="center", transform=ax.get_xaxis_transform())
ax.text(5, -0.13, "(rich)", ha="center", transform=ax.get_xaxis_transform())
ax.set(xlim=(0.5, 5.5), ylim=(0, 1), xlabel="Income", ylabel="Pr(Republican vote)")
ax.spines[["top", "right"]].set_visible(False)

Income coefficient by election year

Code

years = list(range(1952, 2001, 4))
rows = []
for year in years:
    ok = (nes["year"] == year) & nes["presvote"].notna() & (nes["presvote"] < 3) & nes["vote"].notna() & nes["income"].notna()
    d = nes.loc[ok, ["presvote", "income"]].copy()
    d["vote_rep"] = d["presvote"] - 1
    if d["vote_rep"].nunique() < 2:
        continue
    fit_y = smf.glm("vote_rep ~ income", data=d, family=sm.families.Binomial()).fit()
    rows.append({"year": year, "coef": fit_y.params["income"], "se": fit_y.bse["income"], "nobs": fit_y.nobs})

fits = pd.DataFrame(rows)
fits

	year	coef	se	nobs
0	1952	0.107120	0.052583	1128
1	1956	0.113598	0.052236	1175
2	1960	0.041071	0.061439	875
3	1964	0.240557	0.056592	1058
4	1968	0.071701	0.062281	847
5	1972	0.195183	0.047463	1512
6	1976	0.370580	0.055694	1197
7	1980	0.356647	0.068277	747
8	1984	0.435423	0.057301	1209
9	1988	0.311606	0.060277	1057
10	1992	0.325995	0.056881	1179
11	1996	0.422277	0.066438	912
12	2000	0.281381	0.063029	861

Code

fig, ax = plt.subplots(figsize=(5.6, 3.8))
ax.axhline(0, color="0.5", ls="--", lw=0.8)
ax.errorbar(fits["year"], fits["coef"], yerr=fits["se"], fmt="o", color="black", ms=4, lw=0.8)
ax.set(xlim=(1950, 2000), xlabel="Year", ylabel="Coefficient of income", title="Income coefficient in separate election-year logits")
ax.spines[["top", "right"]].set_visible(False)

In-sample accuracy and log score

Code

predp = fit_1.epred().mean(axis=0)
y = nes92["rvote"].to_numpy()
accuracy_parts = (predp[y == 1].mean(), (1 - predp[y == 0]).mean())
log_score = np.log(np.r_[predp[y == 1], 1 - predp[y == 0]]).sum()
accuracy_parts, log_score

((np.float64(0.4213588814980553), np.float64(0.6068043713175233)),
 np.float64(-778.465144277681))

A light leave-one-out calculation can be done by refitting the model after dropping each observation. This is slower than PSIS-LOO from the Stan ecosystem, but it is transparent for a simple GLM.

Code

def loo_log_score(data):
    values = []
    for i in data.index:
        train = data.drop(index=i)
        test = data.loc[[i]]
        fit_i = smf.glm("rvote ~ income", data=train, family=sm.families.Binomial()).fit(disp=0)
        p_i = fit_i.predict(test)[0]
        y_i = test["rvote"].iloc[0]
        values.append(np.log(p_i if y_i == 1 else 1 - p_i))
    return np.sum(values)

# loo_log_score(nes92)  # uncomment to run the exact leave-one-out refit

Separation and weak regularization

The final part of the source example compares classical glm with stan_glm for rvote ~ female + black + income. In years with quasi-complete separation, maximum-likelihood coefficients can become huge or have enormous standard errors. A weakly regularized logistic model stabilizes the fit.

Code

terms = ["Intercept", "female", "black", "income"]
sep_rows = []
base = nes.loc[nes[["rvote", "female", "black", "income"]].notna().all(axis=1)].copy()

for year in years:
    d = base.loc[base["year"] == year].copy()
    if d["rvote"].nunique() < 2:
        continue
    try:
        fit_glm = smf.glm("rvote ~ female + black + income", data=d, family=sm.families.Binomial()).fit()
        for term in terms:
            sep_rows.append({"year": year, "term": term, "model": "MLE", "coef": fit_glm.params[term], "se": fit_glm.bse[term]})
    except Exception as err:
        sep_rows.append({"year": year, "term": "model", "model": "MLE", "coef": np.nan, "se": np.nan, "note": str(err)})

sep = pd.DataFrame(sep_rows)
sep.head()

	year	term	model	coef	se
0	1952	Intercept	MLE	0.102174	0.194953
1	1952	female	MLE	0.058306	0.122895
2	1952	black	MLE	-1.664640	0.362479
3	1952	income	MLE	0.080091	0.053768
4	1956	Intercept	MLE	-0.046753	0.186773

The Bayesian fit uses the same weak Normal prior in every year. Its posterior mean and standard deviation stay finite when the likelihood alone is weakly identified.

Code

reg_rows = []
for year in years:
    d = base.loc[base["year"] == year].copy()
    if d["rvote"].nunique() < 2:
        continue
    fit_bayes_y = bayes_logit("rvote ~ female + black + income", data=d, draws=2000, prior_scale=2.5, seed=year)
    for term, coef, se in zip(fit_bayes_y.coef_names, fit_bayes_y.beta_draws.mean(axis=0), fit_bayes_y.beta_draws.std(axis=0, ddof=1)):
        reg_rows.append({"year": year, "term": term, "model": "Bayes normal-prior", "coef": coef, "se": se})

regularized = pd.DataFrame(reg_rows)
regularized.head()

	year	term	model	coef	se
0	1952	Intercept	Bayes normal-prior	0.101364	0.190235
1	1952	female	Bayes normal-prior	0.056016	0.124061
2	1952	black	Bayes normal-prior	-1.622939	0.357667
3	1952	income	Bayes normal-prior	0.081007	0.052791
4	1956	Intercept	Bayes normal-prior	-0.050170	0.193269

Code

fig, axes = plt.subplots(2, 4, figsize=(10, 5), sharex=True)
for col, term in enumerate(terms):
    d_mle = sep.loc[sep["term"] == term]
    axes[0, col].axhline(0, color="0.5", ls="--", lw=0.8)
    axes[0, col].errorbar(d_mle["year"], d_mle["coef"], yerr=d_mle["se"], fmt="o", color="black", ms=3, lw=0.7)
    axes[0, col].set_title(term)
    axes[0, col].set_ylabel("MLE")

    d_reg = regularized.loc[regularized["term"] == term]
    axes[1, col].axhline(0, color="0.5", ls="--", lw=0.8)
    axes[1, col].errorbar(d_reg["year"], d_reg["coef"], yerr=d_reg["se"], fmt="o-", color="black", ms=3, lw=0.7)
    axes[1, col].set_ylabel("Bayes")
    axes[1, col].set_xlabel("Year")

for ax in axes.ravel():
    ax.set_xticks([1960, 1980, 2000])
    ax.spines[["top", "right"]].set_visible(False)
fig.tight_layout()

CmdStanPy version of the single-predictor model

Use this small Stan model if exact Stan sampling is needed instead of the lightweight Laplace posterior used above.

Code

from cmdstanpy import CmdStanModel

stan_code = """
data {
  int<lower=1> N;
  vector[N] income;
  array[N] int<lower=0, upper=1> rvote;
}
parameters {
  real alpha;
  real beta;
}
model {
  alpha ~ normal(0, 2.5);
  beta ~ normal(0, 2.5);
  rvote ~ bernoulli_logit(alpha + beta * income);
}
"""
stan_file = Path("_generated/nes_vote_income.stan")
stan_file.parent.mkdir(exist_ok=True)
stan_file.write_text(stan_code)
model = CmdStanModel(stan_file=str(stan_file))
fit = model.sample(
    data={"N": len(nes92), "income": nes92["income"].to_numpy(), "rvote": nes92["rvote"].astype(int).to_list()},
    seed=1992,
    chains=4,
    parallel_chains=4,
    show_progress=False,
)
fit.draws_pd()[["alpha", "beta"]].describe(percentiles=[0.05, 0.5, 0.95])

	alpha	beta
count	4000.000000	4000.000000
mean	-1.395210	0.323489
std	0.193247	0.057849
min	-2.005810	0.121681
5%	-1.718534	0.227129
50%	-1.397070	0.322820
95%	-1.071593	0.419290
max	-0.736809	0.514239

# National Election Study: logistic regression, prediction, and separation Source: `NES/nes_logistic.Rmd` This page ports the ROS logistic-regression example. It uses the 1992 National Election Study for a simple vote-on-income model, then repeats logistic regressions across years to show changing income effects and the instability created by separation. ## Setup and data ```{python} from pathlib import Path import numpy as np import pandas as pd import matplotlib.pyplot as plt import statsmodels.api as sm import statsmodels.formula.api as smf from scipy.special import expit from python.bayes_glm import bayes_logit root = Path("../../ROS-Examples") nes_dir = root / "NES" nes = pd.read_csv(nes_dir / "data/nes.txt", sep=r"\s+", index_col=0, na_values=["NA"]) nes.head() ``` The source R page first keeps 1992 respondents with a valid Democratic or Republican presidential vote. ```{python} ok92 = (nes["year"] == 1992) & nes["rvote"].notna() & nes["dvote"].notna() & ((nes["rvote"] == 1) | (nes["dvote"] == 1)) nes92 = nes.loc[ok92].copy() nes92[["year", "income", "rvote", "dvote", "presvote"]].head() ``` ## A single-predictor logistic regression The R model is `stan_glm(rvote ~ income, family=binomial(link="logit"))`. For the main Bayesian port, fit the same Bernoulli-logit model with weak Normal priors using the shared lightweight helper. ```{python} fit_1 = bayes_logit("rvote ~ income", data=nes92, draws=4000, prior_scale=2.5, seed=1992) fit_1.summary() ``` ```{python} new = pd.DataFrame({"income": [5]}) eta_draw = fit_1.beta_draws @ np.array([1.0, 5.0]) p_draw = expit(eta_draw) pd.DataFrame({ "linear_predictor_mean": [eta_draw.mean()], "linear_predictor_sd": [eta_draw.std(ddof=1)], "expected_probability_mean": [p_draw.mean()], "expected_probability_sd": [p_draw.std(ddof=1)], }) ``` The distinction among linear predictor, expected outcome, and posterior predictive draw is clearest when using the posterior coefficient draws directly. This mirrors the operations performed by `posterior_linpred`, `posterior_epred`, and `posterior_predict` in the R example. ```{python} rng = np.random.default_rng(1992) draws = fit_1.beta_draws X_new = np.column_stack([np.ones(5), np.arange(1, 6)]) linpred = draws @ X_new.T epred = expit(linpred) postpred = rng.binomial(1, epred) summary_pred = pd.DataFrame( { "income": np.arange(1, 6), "p_mean": epred.mean(axis=0), "p_sd": epred.std(axis=0), "y_pred_mean": postpred.mean(axis=0), } ) summary_pred ``` ```{python} prob_rich_gt_next = np.mean(epred[:, 4] > epred[:, 3]) diff_interval = np.quantile(epred[:, 4] - epred[:, 3], [0.025, 0.975]) prob_rich_gt_next, diff_interval ``` ## Jittered data and fitted curve ```{python} rng = np.random.default_rng(2141) n = len(nes92) income_jitt = nes92["income"].to_numpy() + rng.uniform(-0.2, 0.2, size=n) vote_jitt = nes92["rvote"].to_numpy() + np.where(nes92["rvote"].to_numpy() == 0, rng.uniform(0.005, 0.05, size=n), rng.uniform(-0.05, -0.005, size=n)) x_grid = np.linspace(0.5, 5.5, 200) X_grid = pd.DataFrame({"income": x_grid}) fig, ax = plt.subplots(figsize=(5.2, 3.8)) for draw in draws[rng.choice(len(draws), 25, replace=False)]: ax.plot(x_grid, expit(draw[0] + draw[1] * x_grid), color="0.65", lw=0.6, alpha=0.7) ax.plot(x_grid, fit_1.epred(X_grid).mean(axis=0), color="black", lw=1.5) ax.scatter(income_jitt, vote_jitt, s=4, color="black", alpha=0.25) ax.set_xticks(range(1, 6)) ax.text(1, -0.13, "(poor)", ha="center", transform=ax.get_xaxis_transform()) ax.text(5, -0.13, "(rich)", ha="center", transform=ax.get_xaxis_transform()) ax.set(xlim=(0.5, 5.5), ylim=(0, 1), xlabel="Income", ylabel="Pr(Republican vote)") ax.spines[["top", "right"]].set_visible(False) ``` ## Income coefficient by election year ```{python} years = list(range(1952, 2001, 4)) rows = [] for year in years: ok = (nes["year"] == year) & nes["presvote"].notna() & (nes["presvote"] < 3) & nes["vote"].notna() & nes["income"].notna() d = nes.loc[ok, ["presvote", "income"]].copy() d["vote_rep"] = d["presvote"] - 1 if d["vote_rep"].nunique() < 2: continue fit_y = smf.glm("vote_rep ~ income", data=d, family=sm.families.Binomial()).fit() rows.append({"year": year, "coef": fit_y.params["income"], "se": fit_y.bse["income"], "nobs": fit_y.nobs}) fits = pd.DataFrame(rows) fits ``` ```{python} fig, ax = plt.subplots(figsize=(5.6, 3.8)) ax.axhline(0, color="0.5", ls="--", lw=0.8) ax.errorbar(fits["year"], fits["coef"], yerr=fits["se"], fmt="o", color="black", ms=4, lw=0.8) ax.set(xlim=(1950, 2000), xlabel="Year", ylabel="Coefficient of income", title="Income coefficient in separate election-year logits") ax.spines[["top", "right"]].set_visible(False) ``` ## In-sample accuracy and log score ```{python} predp = fit_1.epred().mean(axis=0) y = nes92["rvote"].to_numpy() accuracy_parts = (predp[y == 1].mean(), (1 - predp[y == 0]).mean()) log_score = np.log(np.r_[predp[y == 1], 1 - predp[y == 0]]).sum() accuracy_parts, log_score ``` A light leave-one-out calculation can be done by refitting the model after dropping each observation. This is slower than PSIS-LOO from the Stan ecosystem, but it is transparent for a simple GLM. ```{python} def loo_log_score(data): values = [] for i in data.index: train = data.drop(index=i) test = data.loc[[i]] fit_i = smf.glm("rvote ~ income", data=train, family=sm.families.Binomial()).fit(disp=0) p_i = fit_i.predict(test)[0] y_i = test["rvote"].iloc[0] values.append(np.log(p_i if y_i == 1 else 1 - p_i)) return np.sum(values) # loo_log_score(nes92) # uncomment to run the exact leave-one-out refit ``` ## Separation and weak regularization The final part of the source example compares classical `glm` with `stan_glm` for `rvote ~ female + black + income`. In years with quasi-complete separation, maximum-likelihood coefficients can become huge or have enormous standard errors. A weakly regularized logistic model stabilizes the fit. ```{python} terms = ["Intercept", "female", "black", "income"] sep_rows = [] base = nes.loc[nes[["rvote", "female", "black", "income"]].notna().all(axis=1)].copy() for year in years: d = base.loc[base["year"] == year].copy() if d["rvote"].nunique() < 2: continue try: fit_glm = smf.glm("rvote ~ female + black + income", data=d, family=sm.families.Binomial()).fit() for term in terms: sep_rows.append({"year": year, "term": term, "model": "MLE", "coef": fit_glm.params[term], "se": fit_glm.bse[term]}) except Exception as err: sep_rows.append({"year": year, "term": "model", "model": "MLE", "coef": np.nan, "se": np.nan, "note": str(err)}) sep = pd.DataFrame(sep_rows) sep.head() ``` The Bayesian fit uses the same weak Normal prior in every year. Its posterior mean and standard deviation stay finite when the likelihood alone is weakly identified. ```{python} reg_rows = [] for year in years: d = base.loc[base["year"] == year].copy() if d["rvote"].nunique() < 2: continue fit_bayes_y = bayes_logit("rvote ~ female + black + income", data=d, draws=2000, prior_scale=2.5, seed=year) for term, coef, se in zip(fit_bayes_y.coef_names, fit_bayes_y.beta_draws.mean(axis=0), fit_bayes_y.beta_draws.std(axis=0, ddof=1)): reg_rows.append({"year": year, "term": term, "model": "Bayes normal-prior", "coef": coef, "se": se}) regularized = pd.DataFrame(reg_rows) regularized.head() ``` ```{python} fig, axes = plt.subplots(2, 4, figsize=(10, 5), sharex=True) for col, term in enumerate(terms): d_mle = sep.loc[sep["term"] == term] axes[0, col].axhline(0, color="0.5", ls="--", lw=0.8) axes[0, col].errorbar(d_mle["year"], d_mle["coef"], yerr=d_mle["se"], fmt="o", color="black", ms=3, lw=0.7) axes[0, col].set_title(term) axes[0, col].set_ylabel("MLE") d_reg = regularized.loc[regularized["term"] == term] axes[1, col].axhline(0, color="0.5", ls="--", lw=0.8) axes[1, col].errorbar(d_reg["year"], d_reg["coef"], yerr=d_reg["se"], fmt="o-", color="black", ms=3, lw=0.7) axes[1, col].set_ylabel("Bayes") axes[1, col].set_xlabel("Year") for ax in axes.ravel(): ax.set_xticks([1960, 1980, 2000]) ax.spines[["top", "right"]].set_visible(False) fig.tight_layout() ``` ## CmdStanPy version of the single-predictor model Use this small Stan model if exact Stan sampling is needed instead of the lightweight Laplace posterior used above. ```{python} from cmdstanpy import CmdStanModel stan_code = """ data { int<lower=1> N; vector[N] income; array[N] int<lower=0, upper=1> rvote; } parameters { real alpha; real beta; } model { alpha ~ normal(0, 2.5); beta ~ normal(0, 2.5); rvote ~ bernoulli_logit(alpha + beta * income); } """ stan_file = Path("_generated/nes_vote_income.stan") stan_file.parent.mkdir(exist_ok=True) stan_file.write_text(stan_code) model = CmdStanModel(stan_file=str(stan_file)) fit = model.sample( data={"N": len(nes92), "income": nes92["income"].to_numpy(), "rvote": nes92["rvote"].astype(int).to_list()}, seed=1992, chains=4, parallel_chains=4, show_progress=False, ) fit.draws_pd()[["alpha", "beta"]].describe(percentiles=[0.05, 0.5, 0.95]) ```