Fix errors arising under formulaic>=1.1.0 (#786)

* add jax benchmark notebook * support formulaic 1.1.0 * delete gpu notebook * delete coverage xml
py-econometrics · Jan 12, 2025 · 306da82 · 306da82
1 parent cb31ee2
commit 306da82
Show file tree

Hide file tree

Showing 8 changed files with 54 additions and 4,578 deletions.
diff --git a/coverage.xml b/coverage.xml
diff --git a/docs/changelog.qmd b/docs/changelog.qmd
@@ -5,6 +5,7 @@
 - Adds a `pf.feglm()` function that supports GLMs with normal and binomial families (gaussian, logit, probit) without fixed effects. Fixed effects support is work in progress.
 - Adds a function argument `context`, that allows to pass information / context to the `formulaic.Formulaic.get_model_matrix()` call that creates the model matrix.
 - Fix a bug that caused reindexing of `LPDID._coeftable` when calling `LPDID.iplot()`. As a result, a second call of `LPDID.iplot()` would fail.
+- Bumps the required `formulaic` version to `1.1.0` and fixes errors that arose when a) the `ref` argument was used with the `i()` syntax, which led to a silent failure under `formulaic >= 1.1.0`, and b) `fixef()` / `predict()` were called with fixed effects, which led to a loud error.
 
 ## PyFixest 0.27.0
 

diff --git a/pixi.lock b/pixi.lock
diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py
@@ -254,7 +254,7 @@ def _get_columns_to_drop_and_check_ivars(
                     if ref and "_" in ref:
                         ref = ref.replace("_", "")
 
-                pattern = rf"\[T\.{ref}(?:\.0)?\]:{var2}"
+                pattern = rf"\[(?:T\.)?{ref}(?:\.0)?\]:{var2}"
                 if ref:
                     for column in X.columns:
                         if var1 in column and re.search(pattern, column):

diff --git a/pyfixest/utils/dev_utils.py b/pyfixest/utils/dev_utils.py
@@ -182,18 +182,12 @@ def _extract_variable_level(fe_string: str):
         A tuple containing the extracted variable and level for the fixed
         effect.
     """
-    c_pattern = r"C\((.+?)\)"
-    t_pattern = r"\[T\.(.*\])"
-    c_match = re.search(c_pattern, fe_string)
-    t_match = re.search(t_pattern, fe_string, re.DOTALL)
+    pattern = r"C\(([^)]*)\)\[(?:T\.)?(.*)\]$"
+    match = re.search(pattern, fe_string)
+    if not match:
+        raise ValueError(f"Cannot parse: {fe_string}")
 
-    if not c_match or not t_match:
-        raise ValueError(
-            f"feols() failed after regex encountered the following value as a fixed effect:\n {fe_string}."
-            + "\nThis may due to the presence of line separation and/or escape sequences within the string."
-            + " If so, consider recoding the underlying string. Otherwise, please open a PR in the github repo!"
-        )
-
-    variable = c_match.group(1)
-    level = t_match.group(1)
-    return "C(" + variable + ")", level[0 : level.rfind("]")]
+    variable = match.group(1)
+    level = match.group(2)
+
+    return f"C({variable})", level
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ license = { text = "MIT" }
 dependencies = [
   "lets-plot>=4.0.0",
   "scipy>=1.6",
-  "formulaic>=1.0.0,<1.1.0",
+  "formulaic>=1.1.0",
   "pandas>=1.1.0",
   "numba>=0.58.0",
   "seaborn>=0.13.2",

diff --git a/tests/test_i.py b/tests/test_i.py
@@ -115,42 +115,23 @@ def test_i_vs_fixest():
     )
 
 
-def test_i_interacted_fixest():
+@pytest.mark.parametrize(
+    "fml",
+    [
+        "dep_var ~ i(state)",
+        "dep_var ~ i(state, ref = 1)",
+        "dep_var ~ i(state, year)",
+        "dep_var ~ i(state, year, ref = 1)",
+        "dep_var ~ i(state, year) | state",
+        "dep_var ~ i(state, year, ref = 1) | state",
+    ],
+)
+def test_i_interacted_fixest(fml):
     df_het = pd.read_csv("pyfixest/did/data/df_het.csv")
     df_het["X"] = np.random.normal(df_het.shape[0])
 
-    # ------------------------------------------------------------------------ #
-    # no fixed effects
-
-    # no references
-    fit_py = feols("dep_var~i(state, year)", df_het)
-    fit_r = fixest.feols(ro.Formula("dep_var~i(state, year)"), df_het)
+    fit_py = feols(fml, df_het)
+    fit_r = fixest.feols(ro.Formula(fml), df_het)
     np.testing.assert_allclose(
         fit_py.coef().values, np.array(fit_r.rx2("coefficients"))
     )
-
-    if True:
-        # no reference one fixed effect
-        fit_py = feols("dep_var~i(state, year) | state ", df_het)
-        fit_r = fixest.feols(ro.Formula("dep_var~i(state, year) | state"), df_het)
-        np.testing.assert_allclose(
-            fit_py.coef().values, np.array(fit_r.rx2("coefficients"))
-        )
-
-    if True:
-        # one reference
-        fit_py = feols("dep_var~i(state, year,ref=1)  ", df_het)
-        fit_r = fixest.feols(ro.Formula("dep_var~i(state, year, ref = 1)"), df_het)
-        np.testing.assert_allclose(
-            fit_py.coef().values, np.array(fit_r.rx2("coefficients"))
-        )
-
-    if True:
-        # one reference and fixed effect
-        fit_py = feols("dep_var~i(state, year,ref=1) | state ", df_het)
-        fit_r = fixest.feols(
-            ro.Formula("dep_var~i(state, year, ref = 1) | state"), df_het
-        )
-        np.testing.assert_allclose(
-            fit_py.coef().values, np.array(fit_r.rx2("coefficients"))
-        )
diff --git a/tests/test_predict_resid_fixef.py b/tests/test_predict_resid_fixef.py
@@ -305,3 +305,5 @@ def test_extract_variable_level():
     assert _extract_variable_level(var) == ("C(f3)", "1.0")
     var = "C(f4)[T.1]"
     assert _extract_variable_level(var) == ("C(f4)", "1")
+    var = "C(f5)[1.0]"
+    assert _extract_variable_level(var) == ("C(f5)", "1.0")