glm-tools/pyglmnet

ensure reproducibility of negative binomial on a toy dataset with R

Opened this issue · 1 comments

Dataset: https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/

R code:

# Reproduce the UCLA IDRE negative-binomial regression example.
# Use library() rather than require(): library() errors immediately if a
# package is missing, whereas require() only returns FALSE and lets the
# script fail later with a confusing message.
library(foreign)
library(ggplot2)
library(MASS)

# Read the Stata dataset from the UCLA IDRE site.
dat <- read.dta("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")

# Recode prog as a labelled factor (level 1 = "General" becomes the
# reference category for the model below) and id as a factor.
dat <- within(dat, {
  prog <- factor(prog, levels = 1:3,
                 labels = c("General", "Academic", "Vocational"))
  id <- factor(id)
})

# Histogram of days absent, faceted by program (with a margins panel).
ggplot(dat, aes(daysabs, fill = prog)) +
  geom_histogram(binwidth = 1) +
  facet_grid(prog ~ ., margins = TRUE, scales = "free")

# Mean (SD) of days absent within each program.
with(dat, tapply(daysabs, prog, function(x) {
  sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

# Fit the negative binomial GLM and print its summary.
summary(m1 <- glm.nb(daysabs ~ math + prog, data = dat))

Output:

glm.nb(formula = daysabs ~ math + prog, data = dat, init.theta = 1.032713156, 
    link = log)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1547  -1.0192  -0.3694   0.2285   2.5273  

Coefficients:
                Estimate Std. Error z value Pr(>|z|)
(Intercept)     2.615265   0.197460  13.245  < 2e-16
math           -0.005993   0.002505  -2.392   0.0167
progAcademic   -0.440760   0.182610  -2.414   0.0158
progVocational -1.278651   0.200720  -6.370 1.89e-10

Python code:

##########################################################
#
# GLM with negative binomial distribution
#
# This is an example of a GLM with a negative binomial distribution.
# It reproduces an example from the R community:
# https://stats.idre.ucla.edu/r/dae/negative-binomial-regression/
#
# Here, we would like to predict the days absent of high school juniors
# at two schools from the type of program they are enrolled in and their
# math score.


##########################################################
# Import relevant libraries

import pandas as pd
from pyglmnet import GLM

import matplotlib.pyplot as plt

##########################################################
# Read and preprocess data

df = pd.read_stata("https://stats.idre.ucla.edu/stat/stata/dae/nb_data.dta")
print(df.head())

# histogram of days absent, split by the program enrolled
df.hist(column='daysabs', by=['prog'])
plt.show()

# print mean and standard deviation of days absent for each program
print(df.groupby('prog').agg({'daysabs': ['mean', 'std']}))

##########################################################
# Response and design matrix

y = df['daysabs'].values

# One-hot encode the program, then drop the first level (1.0) so it acts
# as the reference category — matching R's default factor contrasts,
# where "General" is the baseline.
program_df = pd.get_dummies(df.prog)
Xdsgn = pd.concat((df['math'], program_df.drop(1.0, axis=1)), axis=1).values

##########################################################
# Fit and predict using GLM

# dispersion parameter; R's glm.nb fit reports init.theta = 1.032713156
# theta = 0.968
theta = 1.033
# theta = 5.
distr = 'neg-binomial'
# distr = 'softplus'

glm_neg_bino = GLM(distr=distr,
                   alpha=0.0, reg_lambda=0.0, max_iter=1000, solver='cdfast',
                   score_metric='pseudo_R2', theta=theta, learning_rate=10)
glm_neg_bino.fit(Xdsgn, y)
yhat = glm_neg_bino.predict(Xdsgn)
print(glm_neg_bino.beta0_, glm_neg_bino.beta_)
# BUG FIX: score against the observed response y, not the model's own
# predictions yhat — scoring yhat against itself trivially inflates
# the pseudo-R2 and says nothing about fit quality.
print(glm_neg_bino.score(Xdsgn, y))

I would use the 'batch-gradient' solver because that's better tested.