Extended TWFE a la Wooldridge (2021)
Opened this issue · 5 comments
Implement an extended 2wfe estimator a la Wooldridge 2021 following the etwfe R package.
Some Background
See the Stata docs for an intro to the estimator without covariates (bottom of page 18): link or Wooldridge's paper linked above:
This is also explained in @vincentarelbundock 's blog post. Basically, we fit
etwfe = feols(y ~ treat : factor(time_to_treat) : factor(cohort) | firm + year, data = dat)
With covariates, we instead get
with
Note that for the residualizing of covariates X by cohort, we can use pyfixest's demean function as
from pyfixest.estimation.demean_ import demean
X = X - demean(X, cohort)
To do
Implement an API similar to R's etwfe()
function.
mod =
etwfe(
fml = lemp ~ lpop, # outcome ~ controls
tvar = year, # time variable
gvar = first.treat, # group variable
data = mpdta, # dataset
vcov = ~countyreal # vcov adjustment (here: clustered)
)
Marginal effects can be computed via py-marginaleffects.
Implementation sketch:
library(etwfe)
library(data.table)
data("mpdta", package = "did")
fwrite(mpdta, "C:/Users/alexa/Documents/data/mpdta.csv")
head(mpdta)
data = mpdta
mod =
etwfe(
fml = lemp ~ 1, # outcome ~ controls
tvar = year, # time variable
gvar = first.treat, # group variable
data = data, # dataset
vcov = ~countyreal, # vcov adjustment (here: clustered)
)
summary(mod)
# OLS estimation, Dep. Var.: lemp
# Observations: 2,500
# Fixed-effects: first.treat: 4, year: 5
# Standard-errors: Clustered (countyreal)
# Estimate Std. Error t value Pr(>|t|)
# .Dtreat:first.treat::2004:year::2004 -0.010503 0.023363 -0.449562 0.65322178
# .Dtreat:first.treat::2004:year::2005 -0.070423 0.031134 -2.261910 0.02413241 *
# .Dtreat:first.treat::2004:year::2006 -0.137259 0.036612 -3.749051 0.00019829 ***
# .Dtreat:first.treat::2004:year::2007 -0.100811 0.034525 -2.919941 0.00365931 **
# .Dtreat:first.treat::2006:year::2004 0.006520 0.023439 0.278168 0.78099831
# .Dtreat:first.treat::2006:year::2005 0.003769 0.031493 0.119685 0.90478060
# .Dtreat:first.treat::2006:year::2006 -0.000825 0.033799 -0.024418 0.98052868
# .Dtreat:first.treat::2006:year::2007 -0.037455 0.035897 -1.043403 0.29726681
# .Dtreat:first.treat::2007:year::2004 0.030507 0.015106 2.019485 0.04397085 *
# .Dtreat:first.treat::2007:year::2005 0.027781 0.019638 1.414614 0.15780542
# .Dtreat:first.treat::2007:year::2006 -0.003306 0.024570 -0.134569 0.89300678
# .Dtreat:first.treat::2007:year::2007 -0.029361 0.026561 -1.105397 0.26952021
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RMSE: 1.48661 Adj. R2: 0.021348
# Within R2: 8.696e-5
mod |> coef() |> sort() |> unname()
mod |> coef() |> sort() |> unname()
# [1] -0.136078114 -0.104707472 -0.078319099 -0.043106033 -0.039192736
# [6] -0.019372364 0.002513862
import pandas as pd
import numpy as np
import pyfixest as pf
data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)
tvar = "year"
gvar = "first_treat"
# Unique values
ug = data[gvar].unique()
ut = data[tvar].unique()
# Determine gref
gref = ug[ug > max(ut)]
if len(gref) == 0:
gref = ug[ug < min(ut)]
gref = gref[0]
gref_min_flag = True
# Construct rhs (right-hand side)
rhs = ""
rhs = f".Dtreat : {rhs}"
ref_string = f", ref = {gref}"
rhs = f"{rhs}i({gvar}, {tvar}{ref_string})"
data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
if not gref_min_flag:
data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
else:
data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)
# turn tvar into categorical
data[gvar] = pd.Categorical(data[gvar])
lhs = "lemp"
fixef = "first_treat + year"
fml = f"{lhs} ~ {rhs} | {fixef}"
fit = pf.feols(fml, data = data)
fit.summary()
# raise ValueError(
#ValueError:
# The second variable in the i() syntax cannot be of type #"category" or "object", but
# but it is of type category.
Problem: interacting two categoricals via i()
currently leads to the error above.
Alternative: don't use i() syntax and go with C() interactions directly.
rhs = ""
rhs = f".Dtreat : {rhs}"
rhs2 = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
lhs = "lemp"
fixef = "first_treat + year"
fml2 = f"{lhs} ~ {rhs2} | {fixef}"
fit2 = pf.feols(fml2, data = data)
np.sort(fit2.coef().values)
#array([-0.13607811, -0.10470747, -0.0783191 , -0.04310603, -0.03919274,
# -0.01937236, 0.00251386])
Matches the etfwe
output above.
Basic implementation without covariates:
import pandas as pd
import numpy as np
import pyfixest as pf
def etwfe(fml, tvar, gvar, data, cgroup = "notyet"):
fml = fml.replace(" ", "")
fml_split = fml.split("~")
lhs = fml_split[0]
#if fml_split["1"] != "1":
# raise NotImplementedError("Covariates are not yet implemented.")
rhs = fml_split[1] if fml_split[1] != "1" else ""
if cgroup not in ["notyet"]:
raise NotImplementedError(f"cgroup must be 'notyet' but is {cgroup}.")
ug = data[gvar].unique()
ut = data[tvar].unique()
gref = ug[ug > max(ut)]
if gref.size == 0:
gref = ug[ug < min(ut)]
gref = gref[0]
gref_min_flag = True
#rhs = ""
#rhs = f".Dtreat : {rhs}"
#rhs = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
#fixef = f"{tvar} + {gvar}"
#fml = f"{lhs} ~ {rhs} | {fixef}"
rhs = ""
rhs = f".Dtreat : {rhs}"
rhs = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
lhs = "lemp"
fixef = "first_treat + year"
fml = f"{lhs} ~ {rhs} | {fixef}"
data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
if not gref_min_flag:
data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
else:
data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)
print("fml", fml)
fit = pf.feols(fml, data = data)
return fit
data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)
fit = etwfe(fml = "lpop ~ 1", tvar = "year", gvar = "first_treat", data = data)
fit.coef().sort_values()
#Coefficient
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2006] -0.136078
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2007] -0.104707
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2005] -0.078319
#.Dtreat:C(first_treat, contr.treatment(0))[T.2007]:C(year)[T.2007] -0.043106
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2007] -0.039193
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2004] -0.019372
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2006] 0.002514
#Name: Estimate, dtype: float64
Next steps:
- add support for covariates
- marginal effects
emfx
- additional function arguments & argument values:
cgroup = "never"
,gref
, etc
emfx
would look something like this:
from marginaleffects import slopes
def emfx(fit, data, tvar, gvar):
predict = "response" # could also be link
wts = data.groupby([tvar, gvar]).size()
wts.name = "N"
wts = wts.reset_index()
data = data.merge(wts, on = [tvar, gvar], how = "left")
by_var = ".Dtreat"
return slopes(
fit,
newdata = data,
wts = "N",
variables = ".Dtreat",
by = by_var,
)
See the R emfx function.
this would be an uncompressed version of the mundlak regression right?