py-econometrics/pyfixest

Extended TWFE a la Wooldridge (2021)

Opened this issue · 5 comments

Implement an extended 2wfe estimator a la Wooldridge 2021 following the etwfe R package.

Some Background

See the Stata docs for an intro to the estimator without covariates (bottom of page 18): link or Wooldridge's paper linked above:
image

This is also explained in @vincentarelbundock 's blog post. Basically, we fit

etwfe = feols(y ~ treat : factor(time_to_treat) : factor(cohort) | firm + year, data = dat)

With covariates, we instead get

image

with

image

Note that for the residualizing of covariates X by cohort, we can use pyfixest's demean function as

from pyfixest.estimation.demean_ import demean
X = X - demean(X, cohort) 

To do

Implement an API similar to R's etwfe() function.

mod =
  etwfe(
    fml  = lemp ~ lpop, # outcome ~ controls
    tvar = year,        # time variable
    gvar = first.treat, # group variable
    data = mpdta,       # dataset
    vcov = ~countyreal  # vcov adjustment (here: clustered)
    )

Marginal effects can be computed via py-marginaleffects.

Implementation sketch:

library(etwfe)
library(data.table)

data("mpdta", package = "did")
fwrite(mpdta, "C:/Users/alexa/Documents/data/mpdta.csv")

head(mpdta)

data = mpdta

mod =
  etwfe(
    fml  = lemp ~ 1, # outcome ~ controls
    tvar = year,        # time variable
    gvar = first.treat, # group variable
    data = data,       # dataset
    vcov = ~countyreal,  # vcov adjustment (here: clustered)
  )

summary(mod)

# OLS estimation, Dep. Var.: lemp
# Observations: 2,500
# Fixed-effects: first.treat: 4,  year: 5
# Standard-errors: Clustered (countyreal) 
#                                       Estimate Std. Error   t value   Pr(>|t|)    
# .Dtreat:first.treat::2004:year::2004 -0.010503   0.023363 -0.449562 0.65322178    
# .Dtreat:first.treat::2004:year::2005 -0.070423   0.031134 -2.261910 0.02413241 *  
# .Dtreat:first.treat::2004:year::2006 -0.137259   0.036612 -3.749051 0.00019829 ***
# .Dtreat:first.treat::2004:year::2007 -0.100811   0.034525 -2.919941 0.00365931 ** 
# .Dtreat:first.treat::2006:year::2004  0.006520   0.023439  0.278168 0.78099831    
# .Dtreat:first.treat::2006:year::2005  0.003769   0.031493  0.119685 0.90478060    
# .Dtreat:first.treat::2006:year::2006 -0.000825   0.033799 -0.024418 0.98052868    
# .Dtreat:first.treat::2006:year::2007 -0.037455   0.035897 -1.043403 0.29726681    
# .Dtreat:first.treat::2007:year::2004  0.030507   0.015106  2.019485 0.04397085 *  
# .Dtreat:first.treat::2007:year::2005  0.027781   0.019638  1.414614 0.15780542    
# .Dtreat:first.treat::2007:year::2006 -0.003306   0.024570 -0.134569 0.89300678    
# .Dtreat:first.treat::2007:year::2007 -0.029361   0.026561 -1.105397 0.26952021    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# RMSE: 1.48661     Adj. R2: 0.021348
#                 Within R2: 8.696e-5

mod |> coef() |> sort() |> unname()
mod |> coef() |> sort() |> unname()
# [1] -0.136078114 -0.104707472 -0.078319099 -0.043106033 -0.039192736
# [6] -0.019372364  0.002513862
import pandas as pd 
import numpy as np
import pyfixest as pf

data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)

tvar = "year"
gvar = "first_treat"

# Unique values
ug = data[gvar].unique()
ut = data[tvar].unique()

# Determine gref
gref = ug[ug > max(ut)]
if len(gref) == 0:
    gref = ug[ug < min(ut)]
gref = gref[0]  
gref_min_flag = True

# Construct rhs (right-hand side)
rhs = ""
rhs = f".Dtreat : {rhs}"
ref_string = f", ref = {gref}"
rhs = f"{rhs}i({gvar}, {tvar}{ref_string})"

data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
if not gref_min_flag:
    data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
else:
    data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)

# turn tvar into categorical
data[gvar] = pd.Categorical(data[gvar])

lhs = "lemp"
fixef = "first_treat + year"
fml = f"{lhs} ~ {rhs} | {fixef}"

fit = pf.feols(fml, data = data)
fit.summary()

#    raise ValueError(
#ValueError: 
#                        The second variable in the i() syntax cannot be of type #"category" or "object", but
#                        but it is of type category.

Problem: interacting two categoricals via i() currently leads to the error above.

Alternative: don't use i() syntax and go with C() interactions directly.

rhs = ""
rhs = f".Dtreat : {rhs}"
rhs2 = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
lhs = "lemp"
fixef = "first_treat + year"
fml2 = f"{lhs} ~ {rhs2} | {fixef}"


fit2 = pf.feols(fml2, data = data)
np.sort(fit2.coef().values)
#array([-0.13607811, -0.10470747, -0.0783191 , -0.04310603, -0.03919274,
#       -0.01937236,  0.00251386])

Matches the etfwe output above.

Basic implementation without covariates:

import pandas as pd 
import numpy as np
import pyfixest as pf


def etwfe(fml, tvar, gvar, data, cgroup = "notyet"):
  
    fml = fml.replace(" ", "")
    fml_split = fml.split("~")
    lhs = fml_split[0]
    
    #if fml_split["1"] != "1": 
    #  raise NotImplementedError("Covariates are not yet implemented.")

    rhs = fml_split[1] if fml_split[1] != "1" else ""

    if cgroup not in ["notyet"]: 
        raise NotImplementedError(f"cgroup must be 'notyet' but is {cgroup}.")

    ug = data[gvar].unique()
    ut = data[tvar].unique()
    
    gref = ug[ug > max(ut)]
    if gref.size == 0:
        gref = ug[ug < min(ut)]
    gref = gref[0]  
    gref_min_flag = True
    
    #rhs = ""
    #rhs = f".Dtreat : {rhs}"
    #rhs = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
  
    #fixef = f"{tvar} + {gvar}"
    #fml = f"{lhs} ~ {rhs} | {fixef}"
        
    rhs = ""
    rhs = f".Dtreat : {rhs}"
    rhs = f"{rhs}C({gvar}, contr.treatment({gref})):C({tvar})"
    lhs = "lemp"
    fixef = "first_treat + year"
    fml = f"{lhs} ~ {rhs} | {fixef}"

    data[".Dtreat"] = (data[tvar] >= data[gvar]) & (data[gvar] != gref)
    if not gref_min_flag:
        data[".Dtreat"] = np.where(data[tvar] < gref, data[".Dtreat"], np.nan)
    else:
        data[".Dtreat"] = np.where(data[tvar] > gref, data[".Dtreat"], np.nan)
    
    print("fml", fml)
    fit = pf.feols(fml, data = data)
    
    return fit


data = pd.read_csv("C:/Users/alexa/Documents/data/mpdta.csv")
data.rename(columns = {"first.treat":"first_treat"}, inplace = True)

fit = etwfe(fml = "lpop ~ 1", tvar = "year", gvar = "first_treat", data = data)
fit.coef().sort_values()

#Coefficient
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2006]   -0.136078
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2007]   -0.104707
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2005]   -0.078319
#.Dtreat:C(first_treat, contr.treatment(0))[T.2007]:C(year)[T.2007]   -0.043106
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2007]   -0.039193
#.Dtreat:C(first_treat, contr.treatment(0))[T.2004]:C(year)[T.2004]   -0.019372
#.Dtreat:C(first_treat, contr.treatment(0))[T.2006]:C(year)[T.2006]    0.002514
#Name: Estimate, dtype: float64

Next steps:

  • add support for covariates
  • marginal effects emfx
  • additional function arguments & argument values: cgroup = "never", gref, etc

emfx would look something like this:

from marginaleffects import slopes  
def emfx(fit, data, tvar, gvar): 
  
  predict = "response" # could also be link
  wts = data.groupby([tvar, gvar]).size()
  wts.name = "N"
  wts = wts.reset_index()
  
  data = data.merge(wts, on = [tvar, gvar], how = "left")
  
  by_var = ".Dtreat"
  
  return slopes(
      fit,
      newdata = data,   
      wts = "N",
      variables = ".Dtreat",
      by = by_var,
    )

See the R emfx function.

this would be an uncompressed version of the mundlak regression right?