Mesquite yields: transformations and predictive comparison

Source: Mesquite/mesquite.Rmd

The mesquite example compares a model for raw bush weight to log-log models. The original uses Bayesian GLMs with LOO/K-fold; this Python port uses OLS analogues and cross-validated predictive error to show the same modeling issue.

Code

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

root = Path("../../ROS-Examples")
mesquite = pd.read_table(root / "Mesquite/data/mesquite.dat", sep=r"\s+")
mesquite.head()

	obs	group	diam1	diam2	total_height	canopy_height	density	weight
0	1	MCD	1.8	1.15	1.30	1.00	1	401.3
1	2	MCD	1.7	1.35	1.35	1.33	1	513.7
2	3	MCD	2.8	2.55	2.16	0.60	1	1179.2
3	4	MCD	1.3	0.85	1.80	1.20	1	308.0
4	5	MCD	3.3	1.90	1.55	1.05	1	855.2

Raw-scale regression

Code

fit1 = smf.ols("weight ~ diam1 + diam2 + canopy_height + total_height + density + C(group)", data=mesquite).fit()
fit1.summary()

OLS Regression Results
Dep. Variable:	weight	R-squared:	0.848
Model:	OLS	Adj. R-squared:	0.825
Method:	Least Squares	F-statistic:	36.34
Date:	Sun, 31 May 2026	Prob (F-statistic):	1.73e-14
Time:	23:03:40	Log-Likelihood:	-318.82
No. Observations:	46	AIC:	651.6
Df Residuals:	39	BIC:	664.4
Df Model:	6
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	-1091.8880	176.456	-6.188	0.000	-1448.804	-734.972
C(group)[T.MCD]	363.2951	100.184	3.626	0.001	160.654	565.936
diam1	189.6690	112.760	1.682	0.101	-38.410	417.748
diam2	371.4621	124.378	2.987	0.005	119.883	623.041
canopy_height	355.6653	209.843	1.695	0.098	-68.782	780.113
total_height	-101.7325	185.574	-0.548	0.587	-477.091	273.626
density	131.2542	34.355	3.820	0.000	61.764	200.744

Omnibus:	9.865	Durbin-Watson:	2.137
Prob(Omnibus):	0.007	Jarque-Bera (JB):	19.723
Skew:	0.366	Prob(JB):	5.21e-05
Kurtosis:	6.123	Cond. No.	26.9

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Log-log regression

Code

mesquite = mesquite.assign(
    log_weight=np.log(mesquite.weight),
    log_diam1=np.log(mesquite.diam1),
    log_diam2=np.log(mesquite.diam2),
    log_canopy_height=np.log(mesquite.canopy_height),
    log_total_height=np.log(mesquite.total_height),
    log_density=np.log(mesquite.density),
)
fit2 = smf.ols("log_weight ~ log_diam1 + log_diam2 + log_canopy_height + log_total_height + log_density + C(group)", data=mesquite).fit()
fit2.summary()

OLS Regression Results
Dep. Variable:	log_weight	R-squared:	0.887
Model:	OLS	Adj. R-squared:	0.870
Method:	Least Squares	F-statistic:	51.17
Date:	Sun, 31 May 2026	Prob (F-statistic):	5.73e-17
Time:	23:03:40	Log-Likelihood:	-10.406
No. Observations:	46	AIC:	34.81
Df Residuals:	39	BIC:	47.61
Df Model:	6
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
Intercept	4.7680	0.155	30.747	0.000	4.454	5.082
C(group)[T.MCD]	0.5834	0.129	4.534	0.000	0.323	0.844
log_diam1	0.3938	0.282	1.397	0.170	-0.177	0.964
log_diam2	1.1512	0.210	5.477	0.000	0.726	1.576
log_canopy_height	0.3732	0.281	1.330	0.191	-0.194	0.941
log_total_height	0.3943	0.313	1.260	0.215	-0.239	1.027
log_density	0.1093	0.122	0.896	0.376	-0.137	0.356

Omnibus:	1.935	Durbin-Watson:	1.362
Prob(Omnibus):	0.380	Jarque-Bera (JB):	1.047
Skew:	-0.283	Prob(JB):	0.593
Kurtosis:	3.474	Cond. No.	12.6

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

K-fold predictive comparison

Code

cv = KFold(n_splits=10, shuffle=True, random_state=4587)
raw_X = mesquite[["diam1", "diam2", "canopy_height", "total_height", "density", "group"]]
log_X = mesquite[["log_diam1", "log_diam2", "log_canopy_height", "log_total_height", "log_density", "group"]]
pre_raw = ColumnTransformer([("num", StandardScaler(), ["diam1", "diam2", "canopy_height", "total_height", "density"]), ("grp", OneHotEncoder(drop="first"), ["group"])])
pre_log = ColumnTransformer([("num", StandardScaler(), ["log_diam1", "log_diam2", "log_canopy_height", "log_total_height", "log_density"]), ("grp", OneHotEncoder(drop="first"), ["group"])])
raw_pipe = make_pipeline(pre_raw, LinearRegression())
log_pipe = make_pipeline(pre_log, LinearRegression())
raw_rmse = -cross_val_score(raw_pipe, raw_X, mesquite.weight, cv=cv, scoring="neg_root_mean_squared_error")
log_rmse = -cross_val_score(log_pipe, log_X, mesquite.log_weight, cv=cv, scoring="neg_root_mean_squared_error")
print(raw_rmse.mean(), log_rmse.mean())

312.28872431252427 0.33630890119937035

Posterior-predictive-check analogue

Code

rng = np.random.default_rng(4587)
yhat1 = fit1.fittedvalues.to_numpy()
yhat2 = np.exp(fit2.fittedvalues.to_numpy())
sigma1 = np.sqrt(fit1.mse_resid)
sigma2 = np.sqrt(fit2.mse_resid)
yrep1 = rng.normal(yhat1, sigma1, size=(100, len(yhat1)))
yrep2 = np.exp(rng.normal(fit2.fittedvalues.to_numpy(), sigma2, size=(100, len(yhat2))))
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
axs[0].hist(mesquite.weight, bins=20, density=True, color="black", alpha=.3)
for row in yrep1[:25]: axs[0].hist(row, bins=20, density=True, histtype="step", color="gray", alpha=.25)
axs[0].set_title("raw-scale model")
axs[1].hist(mesquite.weight, bins=20, density=True, color="black", alpha=.3)
for row in yrep2[:25]: axs[1].hist(row, bins=20, density=True, histtype="step", color="gray", alpha=.25)
axs[1].set_title("log-scale model, back-transformed")

Text(0.5, 1.0, 'log-scale model, back-transformed')

Canopy volume simplification

Code

mesquite["canopy_volume"] = mesquite.diam1 * mesquite.diam2 * mesquite.canopy_height
fit3 = smf.ols("log_weight ~ np.log(canopy_volume)", data=mesquite).fit()
pd.DataFrame({"model": ["full log model", "canopy volume"], "aic": [fit2.aic, fit3.aic], "r2": [fit2.rsquared, fit3.rsquared]})

	model	aic	r2
0	full log model	34.812099	0.887286
1	canopy volume	51.373763	0.799206

# Mesquite yields: transformations and predictive comparison Source: `Mesquite/mesquite.Rmd` The mesquite example compares a model for raw bush weight to log-log models. The original uses Bayesian GLMs with LOO/K-fold; this Python port uses OLS analogues and cross-validated predictive error to show the same modeling issue. ```{python} from pathlib import Path import numpy as np import pandas as pd import matplotlib.pyplot as plt import statsmodels.formula.api as smf from sklearn.model_selection import KFold, cross_val_score from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder, StandardScaler from sklearn.pipeline import make_pipeline from sklearn.linear_model import LinearRegression root = Path("../../ROS-Examples") mesquite = pd.read_table(root / "Mesquite/data/mesquite.dat", sep=r"\s+") mesquite.head() ``` ## Raw-scale regression ```{python} fit1 = smf.ols("weight ~ diam1 + diam2 + canopy_height + total_height + density + C(group)", data=mesquite).fit() fit1.summary() ``` ## Log-log regression ```{python} mesquite = mesquite.assign( log_weight=np.log(mesquite.weight), log_diam1=np.log(mesquite.diam1), log_diam2=np.log(mesquite.diam2), log_canopy_height=np.log(mesquite.canopy_height), log_total_height=np.log(mesquite.total_height), log_density=np.log(mesquite.density), ) fit2 = smf.ols("log_weight ~ log_diam1 + log_diam2 + log_canopy_height + log_total_height + log_density + C(group)", data=mesquite).fit() fit2.summary() ``` ## K-fold predictive comparison ```{python} cv = KFold(n_splits=10, shuffle=True, random_state=4587) raw_X = mesquite[["diam1", "diam2", "canopy_height", "total_height", "density", "group"]] log_X = mesquite[["log_diam1", "log_diam2", "log_canopy_height", "log_total_height", "log_density", "group"]] pre_raw = ColumnTransformer([("num", StandardScaler(), ["diam1", "diam2", "canopy_height", "total_height", "density"]), ("grp", OneHotEncoder(drop="first"), ["group"])]) pre_log = ColumnTransformer([("num", StandardScaler(), ["log_diam1", "log_diam2", "log_canopy_height", "log_total_height", "log_density"]), ("grp", OneHotEncoder(drop="first"), ["group"])]) raw_pipe = make_pipeline(pre_raw, LinearRegression()) log_pipe = make_pipeline(pre_log, LinearRegression()) raw_rmse = -cross_val_score(raw_pipe, raw_X, mesquite.weight, cv=cv, scoring="neg_root_mean_squared_error") log_rmse = -cross_val_score(log_pipe, log_X, mesquite.log_weight, cv=cv, scoring="neg_root_mean_squared_error") print(raw_rmse.mean(), log_rmse.mean()) ``` ## Posterior-predictive-check analogue ```{python} rng = np.random.default_rng(4587) yhat1 = fit1.fittedvalues.to_numpy() yhat2 = np.exp(fit2.fittedvalues.to_numpy()) sigma1 = np.sqrt(fit1.mse_resid) sigma2 = np.sqrt(fit2.mse_resid) yrep1 = rng.normal(yhat1, sigma1, size=(100, len(yhat1))) yrep2 = np.exp(rng.normal(fit2.fittedvalues.to_numpy(), sigma2, size=(100, len(yhat2)))) fig, axs = plt.subplots(1, 2, figsize=(10, 3)) axs[0].hist(mesquite.weight, bins=20, density=True, color="black", alpha=.3) for row in yrep1[:25]: axs[0].hist(row, bins=20, density=True, histtype="step", color="gray", alpha=.25) axs[0].set_title("raw-scale model") axs[1].hist(mesquite.weight, bins=20, density=True, color="black", alpha=.3) for row in yrep2[:25]: axs[1].hist(row, bins=20, density=True, histtype="step", color="gray", alpha=.25) axs[1].set_title("log-scale model, back-transformed") ``` ## Canopy volume simplification ```{python} mesquite["canopy_volume"] = mesquite.diam1 * mesquite.diam2 * mesquite.canopy_height fit3 = smf.ols("log_weight ~ np.log(canopy_volume)", data=mesquite).fit() pd.DataFrame({"model": ["full log model", "canopy volume"], "aic": [fit2.aic, fit3.aic], "r2": [fit2.rsquared, fit3.rsquared]}) ```