Arsenic wells: logistic residuals

Source: Arsenic/arsenic_logistic_residuals.Rmd

This ports the ROS binned-residual diagnostics for logistic regression. The fitted models mirror the R formulas; statsmodels supplies the logistic-regression estimates and matplotlib recreates the residual checks.

Setup and data

Code
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.special import expit

root = Path("../../ROS-Examples")
wells = pd.read_csv(root / "Arsenic/data/wells.csv")
wells["dist100"] = wells["dist"] / 100
wells["educ4"] = wells["educ"] / 4
wells["c_dist100"] = wells["dist100"] - wells["dist100"].mean()
wells["c_arsenic"] = wells["arsenic"] - wells["arsenic"].mean()
wells["c_educ4"] = wells["educ4"] - wells["educ4"].mean()
wells.head()
switch arsenic dist dist100 assoc educ educ4 c_dist100 c_arsenic c_educ4
0 1 2.36 16.826000 0.16826 0 0 0.0 -0.315059 0.70307 -1.207119
1 1 0.71 47.321999 0.47322 0 0 0.0 -0.010099 -0.94693 -1.207119
2 0 2.07 20.966999 0.20967 0 10 2.5 -0.273649 0.41307 1.292881
3 1 1.15 21.486000 0.21486 0 12 3.0 -0.268459 -0.50693 1.792881
4 1 1.10 40.874001 0.40874 1 14 3.5 -0.074579 -0.55693 2.292881

Fit the interaction model

Code
fit_8 = smf.logit(
    "switch ~ c_dist100 + c_arsenic + c_educ4 + c_dist100:c_educ4 + c_arsenic:c_educ4",
    data=wells,
).fit(disp=False)
pred8 = fit_8.predict(wells)
resid8 = wells["switch"] - pred8
fit_8.params.round(2)
Intercept            0.34
c_dist100           -0.92
c_arsenic            0.49
c_educ4              0.19
c_dist100:c_educ4    0.33
c_arsenic:c_educ4    0.08
dtype: float64

The null classification rule predicts the majority class; the fitted rule classifies by whether the fitted probability is above 0.5.

Code
error_rate_null = np.mean(np.round(np.abs(wells["switch"] - pred8.mean())))
error_rate = np.mean(np.round(np.abs(wells["switch"] - pred8)))
round(error_rate_null, 2), round(error_rate, 2)
(np.float64(0.42), np.float64(0.38))

Raw residual plot

Code
fig, ax = plt.subplots(figsize=(5, 4))
ax.axhline(0, color="0.7", lw=1)
ax.scatter(pred8, resid8, s=5, color="black", alpha=0.35)
ax.set(xlabel="Estimated Pr(switching)", ylabel="Observed - estimated", title="Residual plot")
ax.spines[["top", "right"]].set_visible(False)

Binned residuals

For binary outcomes, raw residuals form two diagonal bands. Binning makes lack of fit easier to see: the average residual in each bin should usually stay inside approximate two-standard-error bands around zero.

Code
def binned_resids(x, y, nclass=40):
    x = np.asarray(x)
    y = np.asarray(y)
    order = np.argsort(x)
    groups = np.array_split(order, nclass)
    rows = []
    for idx in groups:
        yy = y[idx]
        xx = x[idx]
        rows.append({
            "xbar": xx.mean(),
            "ybar": yy.mean(),
            "n": len(idx),
            "x_lo": xx.min(),
            "x_hi": xx.max(),
            "two_se": 2 * yy.std(ddof=1) / np.sqrt(len(idx)),
        })
    return pd.DataFrame(rows)

def plot_binned(x, residual, xlabel, title="Binned residual plot", nclass=40):
    br = binned_resids(x, residual, nclass=nclass)
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.axhline(0, color="0.7", lw=1)
    ax.plot(br["xbar"], br["two_se"], color="0.7", lw=1)
    ax.plot(br["xbar"], -br["two_se"], color="0.7", lw=1)
    ax.scatter(br["xbar"], br["ybar"], s=18, color="black")
    ax.set(xlabel=xlabel, ylabel="Average residual", title=title)
    ax.spines[["top", "right"]].set_visible(False)
    return br, ax

br_pred, _ = plot_binned(pred8, resid8, "Estimated Pr(switching)")
br_pred.head()
xbar ybar n x_lo x_hi two_se
0 0.246423 0.003577 76 0.037377 0.320079 0.100977
1 0.347163 -0.031373 76 0.320301 0.369372 0.106884
2 0.385882 -0.122724 76 0.369663 0.403028 0.101766
3 0.417379 -0.035800 76 0.403378 0.429575 0.112217
4 0.442539 0.044303 76 0.430931 0.450707 0.115716

Code
br_dist, _ = plot_binned(wells["dist"], resid8, "Distance to nearest safe well")
br_arsenic, _ = plot_binned(wells["arsenic"], resid8, "Arsenic level")

Log-arsenic model

The R example then replaces arsenic with log(arsenic) and checks whether the residual pattern against arsenic improves.

Code
wells["log_arsenic"] = np.log(wells["arsenic"])
fit_8b = smf.logit(
    "switch ~ dist100 + log_arsenic + educ4 + dist100:educ4 + log_arsenic:educ4",
    data=wells,
).fit(disp=False)
pred8b = fit_8b.predict(wells)
resid8b = wells["switch"] - pred8b
round(np.mean(np.round(np.abs(wells["switch"] - pred8b))), 2)
np.float64(0.36)
Code
rng = np.random.default_rng(123)
y_jit = wells["switch"] + (1 - 2 * wells["switch"]) * rng.uniform(0, 0.05, len(wells))
xs = np.linspace(0.5, wells["arsenic"].max(), 300)
b = fit_8b.params
mean_educ4 = wells["educ4"].mean()

fig, ax = plt.subplots(figsize=(5, 4))
ax.scatter(wells["arsenic"], y_jit, s=4, color="black", alpha=0.25)
for dist100, label in [(0, "dist = 0"), (0.5, "dist = 50m")]:
    p = expit(
        b["Intercept"] + b["dist100"] * dist100 + b["log_arsenic"] * np.log(xs)
        + b["educ4"] * mean_educ4 + b["dist100:educ4"] * dist100 * mean_educ4
        + b["log_arsenic:educ4"] * np.log(xs) * mean_educ4
    )
    ax.plot(xs, p, lw=1, label=label)
ax.set(xlabel="Arsenic concentration in well water", ylabel="Pr(switching)")
ax.legend(frameon=False)
ax.spines[["top", "right"]].set_visible(False)

Code
br_log_arsenic, _ = plot_binned(
    wells["arsenic"], resid8b, "Arsenic level",
    title="Binned residual plot for model with log(arsenic)",
)