Source: Roaches/roaches.Rmd

This example analyzes an integrated pest-management experiment in urban apartments. The response y is the post-treatment roach count. Predictors include the baseline roach count, treatment assignment, senior-building indicator, and an exposure offset. The original R page compares negative-binomial, Poisson, and zero-inflated negative-binomial models using rstanarm, brms, and posterior predictive checks.

Setup and data

Code
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import gaussian_kde

root = Path("../../ROS-Examples")
roaches = pd.read_csv(root / "Roaches/data/roaches.csv", index_col=0)
roaches["roach100"] = roaches["roach1"] / 100
roaches["logp1_roach1"] = np.log1p(roaches["roach1"])
roaches["log_exposure2"] = np.log(roaches["exposure2"])
roaches.head()
y roach1 treatment senior exposure2 roach100 logp1_roach1 log_exposure2
1 153 308.00 1 0 0.800000 3.0800 5.733341 -0.223144
2 127 331.25 1 0 0.600000 3.3125 5.805888 -0.510826
3 7 1.67 1 0 1.000000 0.0167 0.982078 0.000000
4 7 3.00 1 0 1.000000 0.0300 1.386294 0.000000
5 0 2.00 1 0 1.142857 0.0200 1.098612 0.133531
Code
roaches.describe()
y roach1 treatment senior exposure2 roach100 logp1_roach1 log_exposure2
count 262.000000 262.000000 262.000000 262.000000 262.000000 262.000000 262.000000 262.000000
mean 25.648855 42.193473 0.603053 0.305344 1.021047 0.421935 2.272546 -0.012538
std 50.846539 75.261969 0.490201 0.461434 0.320757 0.752620 1.863130 0.249755
min 0.000000 0.000000 0.000000 0.000000 0.200000 0.000000 0.000000 -1.609438
25% 0.000000 1.000000 0.000000 0.000000 1.000000 0.010000 0.693147 0.000000
50% 3.000000 7.000000 1.000000 0.000000 1.000000 0.070000 2.079442 0.000000
75% 24.000000 50.500000 1.000000 1.000000 1.000000 0.505000 3.941439 0.000000
max 357.000000 450.000000 1.000000 1.000000 4.285714 4.500000 6.111467 1.455287

Negative-binomial model

The R original fits:

stan_glm(y ~ roach100 + treatment + senior,
         family=neg_binomial_2,
         offset=log(exposure2), data=roaches)

A close maximum-likelihood analogue is statsmodels.discrete.NegativeBinomial, with the offset supplied explicitly.

Code
nb_model = sm.NegativeBinomial.from_formula(
    "y ~ roach100 + treatment + senior",
    data=roaches,
    offset=roaches["log_exposure2"],
)
fit_1 = nb_model.fit(disp=False)
fit_1.summary()
NegativeBinomial Regression Results
Dep. Variable: y No. Observations: 262
Model: NegativeBinomial Df Residuals: 258
Method: MLE Df Model: 3
Date: Sun, 31 May 2026 Pseudo R-squ.: 0.03178
Time: 23:05:02 Log-Likelihood: -889.29
converged: True LL-Null: -918.48
Covariance Type: nonrobust LLR p-value: 1.308e-12
coef std err z P>|z| [0.025 0.975]
Intercept 2.8387 0.229 12.392 0.000 2.390 3.288
roach100 1.2882 0.247 5.206 0.000 0.803 1.773
treatment -0.7767 0.245 -3.170 0.002 -1.257 -0.296
senior -0.3444 0.262 -1.316 0.188 -0.857 0.168
alpha 3.6461 0.348 10.473 0.000 2.964 4.328
Code
alpha_hat = fit_1.params.get("alpha", np.nan)
phi_hat = 1 / alpha_hat if alpha_hat > 0 else np.inf
pd.Series({"alpha": alpha_hat, "phi=1/alpha": phi_hat})
alpha          3.646128
phi=1/alpha    0.274264
dtype: float64

Posterior-predictive-style checks

Without MCMC draws, we can still do the same kind of model criticism using parametric simulation at the fitted MLE. For a negative-binomial 2 model, variance is mu + alpha * mu^2, equivalently phi = 1 / alpha.

Code
rng = np.random.default_rng(3579)

X = fit_1.model.exog
beta = fit_1.params.drop("alpha").to_numpy()
mu_nb = np.exp(X @ beta + roaches["log_exposure2"].to_numpy())
alpha = fit_1.params["alpha"]
phi = 1 / alpha
p = phi / (phi + mu_nb)
yrep_1 = rng.negative_binomial(phi, p, size=(400, len(roaches)))

def check_stats(y, yrep):
    return pd.DataFrame({
        "observed": [np.mean(y == 0), np.mean(y == 1), np.quantile(y, 0.95), np.quantile(y, 0.99), np.max(y)],
        "rep_mean": [np.mean(yrep == 0), np.mean(yrep == 1), np.mean(np.quantile(yrep, 0.95, axis=1)), np.mean(np.quantile(yrep, 0.99, axis=1)), np.mean(np.max(yrep, axis=1))],
    }, index=["Pr(y=0)", "Pr(y=1)", "q95", "q99", "max"])

check_stats(roaches.y.to_numpy(), yrep_1)
observed rep_mean
Pr(y=0) 0.358779 0.334523
Pr(y=1) 0.076336 0.090840
q95 135.950000 126.250000
q99 218.600000 793.287800
max 357.000000 6797.225000
Code
def density_overlay(ax, observed, replicated, title):
    grid = np.linspace(0, max(np.max(observed), np.percentile(replicated, 99)), 250)
    for draw in replicated[:80]:
        ax.plot(grid, gaussian_kde(draw)(grid), color="gray", alpha=0.08)
    ax.plot(grid, gaussian_kde(observed)(grid), color="black", linewidth=2)
    ax.set_title(title)
    ax.set_xlabel("log10(y + 1)")
    ax.set_yticks([])

fig, ax = plt.subplots()
density_overlay(ax, np.log10(roaches.y + 1), np.log10(yrep_1 + 1), "Negative binomial")

Poisson model

Poisson regression is a restricted version of the count model in which variance equals the mean.

Code
fit_2 = smf.glm(
    "y ~ roach100 + treatment + senior",
    data=roaches,
    family=sm.families.Poisson(),
    offset=roaches["log_exposure2"],
).fit()
fit_2.summary()
Generalized Linear Model Regression Results
Dep. Variable: y No. Observations: 262
Model: GLM Df Residuals: 258
Model Family: Poisson Df Model: 3
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -6092.0
Date: Sun, 31 May 2026 Deviance: 11429.
Time: 23:05:02 Pearson chi2: 1.69e+04
No. Iterations: 6 Pseudo R-squ. (CS): 1.000
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
Intercept 3.0892 0.021 145.486 0.000 3.048 3.131
roach100 0.6983 0.009 78.685 0.000 0.681 0.716
treatment -0.5167 0.025 -20.887 0.000 -0.565 -0.468
senior -0.3799 0.033 -11.367 0.000 -0.445 -0.314
Code
mu_pois = fit_2.predict(roaches, offset=roaches["log_exposure2"])
yrep_2 = rng.poisson(mu_pois, size=(400, len(roaches)))

pd.concat(
    {
        "negative_binomial": check_stats(roaches.y.to_numpy(), yrep_1)["rep_mean"],
        "poisson": check_stats(roaches.y.to_numpy(), yrep_2)["rep_mean"],
        "observed": check_stats(roaches.y.to_numpy(), yrep_2)["observed"],
    },
    axis=1,
)
negative_binomial poisson observed
Pr(y=0) 0.334523 0.000611 0.358779
Pr(y=1) 0.090840 0.001613 0.076336
q95 126.250000 76.026875 135.950000
q99 793.287800 154.737275 218.600000
max 6797.225000 407.142500 357.000000
Code
fig, axes = plt.subplots(1, 2, figsize=(10, 3.5), sharey=True)
density_overlay(axes[0], np.log10(roaches.y + 1), np.log10(yrep_2 + 1), "Poisson")
density_overlay(axes[1], np.log10(roaches.y + 1), np.log10(yrep_1 + 1), "Negative binomial")

Code
pd.Series({
    "nb_aic": fit_1.aic,
    "poisson_aic": fit_2.aic,
    "poisson_pearson_dispersion": fit_2.pearson_chi2 / fit_2.df_resid,
})
nb_aic                         1788.583469
poisson_aic                   12191.955271
poisson_pearson_dispersion       65.438151
dtype: float64

The Poisson model typically underestimates the number of zeros and the extreme upper tail; the negative-binomial model’s extra dispersion helps.

Zero-inflated negative binomial

The R page switches to brms for a zero-inflated negative-binomial model. statsmodels has a frequentist zero-inflated negative-binomial implementation. We use the same covariates in both the count and inflation components.

Code
from statsmodels.discrete.count_model import ZeroInflatedNegativeBinomialP

exog = sm.add_constant(roaches[["logp1_roach1", "treatment", "senior"]])
zinb_model = ZeroInflatedNegativeBinomialP(
    endog=roaches["y"],
    exog=exog,
    exog_infl=exog,
    offset=roaches["log_exposure2"],
    inflation="logit",
)
try:
    fit_3 = zinb_model.fit(method="bfgs", maxiter=200, disp=False)
    zinb_summary = fit_3.summary()
except Exception as err:
    fit_3 = None
    zinb_summary = f"Zero-inflated NB fit did not converge in this environment: {err}"
zinb_summary
ZeroInflatedNegativeBinomialP Regression Results
Dep. Variable: y No. Observations: 262
Model: ZeroInflatedNegativeBinomialP Df Residuals: 258
Method: MLE Df Model: 3
Date: Sun, 31 May 2026 Pseudo R-squ.: 0.07763
Time: 23:05:02 Log-Likelihood: -847.16
converged: True LL-Null: -918.46
Covariance Type: nonrobust LLR p-value: 1.036e-30
coef std err z P>|z| [0.025 0.975]
inflate_const -0.8190 0.648 -1.263 0.206 -2.089 0.452
inflate_logp1_roach1 -1.1244 0.276 -4.068 0.000 -1.666 -0.583
inflate_treatment 1.4695 0.624 2.355 0.019 0.247 2.692
inflate_senior 1.1509 0.564 2.041 0.041 0.046 2.256
const 2.1407 0.244 8.763 0.000 1.662 2.619
logp1_roach1 0.5089 0.062 8.178 0.000 0.387 0.631
treatment -0.8069 0.224 -3.607 0.000 -1.245 -0.368
senior 0.1884 0.265 0.712 0.477 -0.330 0.707
alpha 1.9132 0.251 7.616 0.000 1.421 2.406
Code
pd.Series({
    "negative_binomial_aic": fit_1.aic,
    "poisson_aic": fit_2.aic,
    "zero_inflated_nb_aic": np.nan if fit_3 is None else fit_3.aic,
})
negative_binomial_aic     1788.583469
poisson_aic              12191.955271
zero_inflated_nb_aic      1712.318743
dtype: float64

ZeroInflatedNegativeBinomialP prediction APIs and optimizers vary across statsmodels versions, so for a robust page we use the fitted summary/AIC when available rather than requiring replicated zero-inflated draws in evaluated code.

CmdStanPy negative-binomial model

Code
from cmdstanpy import CmdStanModel

stan_code = """
data {
  int<lower=1> N;
  vector[N] roach100;
  vector[N] treatment;
  vector[N] senior;
  vector[N] log_exposure;
  array[N] int<lower=0> y;
}
parameters {
  real alpha;
  real beta_roach;
  real beta_treatment;
  real beta_senior;
  real<lower=0> phi;
}
model {
  alpha ~ normal(0, 5);
  beta_roach ~ normal(0, 2);
  beta_treatment ~ normal(0, 2);
  beta_senior ~ normal(0, 2);
  phi ~ exponential(1);
  y ~ neg_binomial_2_log(
    alpha + beta_roach * roach100 + beta_treatment * treatment + beta_senior * senior + log_exposure,
    phi
  );
}
generated quantities {
  array[N] int y_rep;
  for (n in 1:N) {
    real eta = alpha + beta_roach * roach100[n] + beta_treatment * treatment[n] + beta_senior * senior[n] + log_exposure[n];
    y_rep[n] = neg_binomial_2_log_rng(eta, phi);
  }
}
"""
Path("roaches_nb.stan").write_text(stan_code)
model = CmdStanModel(stan_file="roaches_nb.stan")
fit = model.sample(
    data={
        "N": len(roaches),
        "roach100": roaches.roach100.to_numpy(),
        "treatment": roaches.treatment.to_numpy(),
        "senior": roaches.senior.to_numpy(),
        "log_exposure": roaches.log_exposure2.to_numpy(),
        "y": roaches.y.astype(int).to_numpy(),
    },
    seed=3579,
)
fit.summary().loc[["alpha", "beta_roach", "beta_treatment", "beta_senior", "phi"]]

CmdStanPy zero-inflated negative-binomial sketch

Stan can also express the brms model directly by adding a logit model for structural zeros. This chunk is a template because the full model is slower and more fragile than the negative-binomial baseline.

Code
zinb_stan_code = """
// In the model block, for each observation n:
// if (y[n] == 0)
//   target += log_sum_exp(
//     bernoulli_logit_lpmf(1 | zi_eta[n]),
//     bernoulli_logit_lpmf(0 | zi_eta[n]) + neg_binomial_2_log_lpmf(0 | count_eta[n], phi)
//   );
// else
//   target += bernoulli_logit_lpmf(0 | zi_eta[n]) + neg_binomial_2_log_lpmf(y[n] | count_eta[n], phi);
"""