Poisson regression example

Source: PoissonExample/poisson_regression.Rmd

This example simulates count data from a log-linear Poisson regression and then contrasts it with overdispersed negative-binomial data. The Python port uses statsmodels GLM families and NumPy simulation.

Setup

Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(3579)

Simulate Poisson data

Code
n = 100
x = rng.uniform(-2, 2, n)
a = 1
b = 2
linpred = a + b * x
mu = np.exp(linpred)
y = rng.poisson(mu)
fake = pd.DataFrame({"x": x, "y": y})
fake.head()
x y
0 -0.023494 5
1 0.031250 6
2 -1.093270 0
3 -0.229920 1
4 -1.595141 0

Fit a Poisson regression

The R original uses:

stan_glm(y ~ x, family=poisson(link="log"), data=fake)

The maximum-likelihood version is:

Code
fit_fake = smf.glm("y ~ x", data=fake, family=sm.families.Poisson()).fit()
fit_fake.summary()
Generalized Linear Model Regression Results
Dep. Variable: y No. Observations: 100
Model: GLM Df Residuals: 98
Model Family: Poisson Df Model: 1
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -179.15
Date: Sun, 31 May 2026 Deviance: 101.79
Time: 23:04:39 Pearson chi2: 102.
No. Iterations: 6 Pseudo R-squ. (CS): 1.000
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
Intercept 1.0353 0.071 14.670 0.000 0.897 1.174
x 1.9758 0.051 38.759 0.000 1.876 2.076
Code
xgrid = np.linspace(-2.5, 2.5, 300)
eta_hat = fit_fake.params["Intercept"] + fit_fake.params["x"] * xgrid

fig, ax = plt.subplots()
ax.plot(xgrid, np.exp(eta_hat), color="black", label="fitted mean")
ax.scatter(fake.x, fake.y, s=24, alpha=0.75)
ax.set_ylim(-1, 200)
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Simulated data from Poisson regression")
ax.legend(frameon=False)

Overdispersion via negative binomial data

The R example uses MASS::rnegbin(n, mu, theta=phi). With that parameterization, variance is

[ (Y) = + ^2 / . ]

Small phi means heavy overdispersion; large phi approaches the Poisson.

Code
def rnegbin_mu_phi(rng, mu, phi):
    # NumPy negative_binomial uses number of successes r and success probability p,
    # with mean r(1-p)/p.  Set r=phi and p=phi/(phi+mu).
    p = phi / (phi + mu)
    return rng.negative_binomial(phi, p)

phi_grid = [0.1, 1, 10]
fits_nb = {}
nb_data = {}
for phi in phi_grid:
    yy = rnegbin_mu_phi(rng, mu, phi)
    d = pd.DataFrame({"x": x, "y": yy})
    nb_data[phi] = d
    fits_nb[phi] = smf.glm("y ~ x", data=d, family=sm.families.NegativeBinomial(alpha=1 / phi)).fit()

pd.DataFrame({phi: fits_nb[phi].params for phi in phi_grid}).T.rename_axis("phi")
Intercept x
phi
0.1 0.764097 1.744021
1.0 1.017681 1.945912
10.0 0.865666 2.165113
Code
fig, axes = plt.subplots(1, 3, figsize=(12, 3.5), sharey=True)
for ax, phi in zip(axes, phi_grid):
    d = nb_data[phi]
    fit = fits_nb[phi]
    ax.plot(xgrid, np.exp(fit.params["Intercept"] + fit.params["x"] * xgrid), color="black")
    ax.scatter(d.x, d.y, s=22, alpha=0.75)
    ax.set_title(rf"$\phi={phi}$")
    ax.set_xlabel("x")
    ax.set_ylim(-1, 200)
axes[0].set_ylabel("y")
fig.suptitle("Simulated overdispersed Poisson / negative-binomial regressions")
Text(0.5, 0.98, 'Simulated overdispersed Poisson / negative-binomial regressions')

Poisson fit to overdispersed data

A useful diagnostic is to fit a Poisson model to negative-binomial data and compare the Pearson dispersion statistic to 1.

Code
rows = []
for phi, d in nb_data.items():
    poisson_fit = smf.glm("y ~ x", data=d, family=sm.families.Poisson()).fit()
    pearson_dispersion = poisson_fit.pearson_chi2 / poisson_fit.df_resid
    rows.append({
        "phi": phi,
        "poisson_intercept": poisson_fit.params["Intercept"],
        "poisson_x": poisson_fit.params["x"],
        "pearson_dispersion": pearson_dispersion,
    })
pd.DataFrame(rows)
phi poisson_intercept poisson_x pearson_dispersion
0 0.1 0.980643 1.129890 19.498917
1 1.0 0.996134 2.018917 7.267926
2 10.0 0.900847 2.125748 1.391068

CmdStanPy versions

Poisson regression:

Code
from pathlib import Path
from cmdstanpy import CmdStanModel

poisson_stan = """
data {
  int<lower=1> N;
  vector[N] x;
  array[N] int<lower=0> y;
}
parameters {
  real alpha;
  real beta;
}
model {
  alpha ~ normal(0, 5);
  beta ~ normal(0, 5);
  y ~ poisson_log(alpha + beta * x);
}
generated quantities {
  array[N] int y_rep;
  for (n in 1:N) y_rep[n] = poisson_log_rng(alpha + beta * x[n]);
}
"""
Path("poisson_regression.stan").write_text(poisson_stan)
model = CmdStanModel(stan_file="poisson_regression.stan")
fit = model.sample(data={"N": n, "x": x, "y": y.astype(int)}, seed=3579)
fit.summary().loc[["alpha", "beta"]]

Negative-binomial regression with Stan’s neg_binomial_2_log, where phi is the overdispersion parameter:

Code
nb_stan = """
data {
  int<lower=1> N;
  vector[N] x;
  array[N] int<lower=0> y;
}
parameters {
  real alpha;
  real beta;
  real<lower=0> phi;
}
model {
  alpha ~ normal(0, 5);
  beta ~ normal(0, 5);
  phi ~ exponential(1);
  y ~ neg_binomial_2_log(alpha + beta * x, phi);
}
"""
Path("neg_binomial_regression.stan").write_text(nb_stan)
model_nb = CmdStanModel(stan_file="neg_binomial_regression.stan")
fit_nb = model_nb.sample(data={"N": n, "x": x, "y": nb_data[1].y.astype(int).to_numpy()}, seed=3579)
fit_nb.summary().loc[["alpha", "beta", "phi"]]