Add jackknife variance estimation to SyntheticDiD

igerber · claude · igerber · commit 56c71fa1dd5b · 2026-04-15T20:05:18.000-04:00
Implement Algorithm 3 from Arkhangelsky et al. (2021) as a third
variance_method option ("jackknife") matching R's synthdid::vcov(method="jackknife").
Delete-1 jackknife over all units with fixed weights - no Frank-Wolfe
re-estimation, making it the fastest variance method. Validated against
R golden values with SE matching to machine precision (5.5e-15).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/benchmarks/R/benchmark_synthdid.R b/benchmarks/R/benchmark_synthdid.R
@@ -73,14 +73,21 @@ weights <- attr(tau_hat, "weights")
 unit_weights <- weights$omega
 time_weights <- weights$lambda
 
-# Compute SE via placebo (jackknife)
+# Compute SE via placebo
 message("Computing standard errors...")
 se_start <- Sys.time()
 se_matrix <- vcov(tau_hat, method = "placebo")
 se <- as.numeric(sqrt(se_matrix[1, 1]))  # Extract scalar SE
 se_time <- as.numeric(difftime(Sys.time(), se_start, units = "secs"))
 
-total_time <- estimation_time + se_time
+# Compute SE via jackknife (Algorithm 3)
+message("Computing jackknife standard errors...")
+se_jk_start <- Sys.time()
+se_jk_matrix <- vcov(tau_hat, method = "jackknife")
+se_jackknife <- as.numeric(sqrt(se_jk_matrix[1, 1]))
+se_jk_time <- as.numeric(difftime(Sys.time(), se_jk_start, units = "secs"))
+
+total_time <- estimation_time + se_time + se_jk_time
 
 # Compute noise level and regularization (to match Python's auto-computed values)
 N0 <- setup$N0
@@ -98,6 +105,7 @@ results <- list(
   # Point estimate and SE
   att = as.numeric(tau_hat),
   se = se,
+  se_jackknife = se_jackknife,
 
   # Weights (full precision)
   unit_weights = as.numeric(unit_weights),
@@ -111,7 +119,8 @@ results <- list(
   # Timing
   timing = list(
     estimation_seconds = estimation_time,
-    se_seconds = se_time,
+    se_placebo_seconds = se_time,
+    se_jackknife_seconds = se_jk_time,
     total_seconds = total_time
   ),
 
diff --git a/benchmarks/python/benchmark_synthdid.py b/benchmarks/python/benchmark_synthdid.py
@@ -45,7 +45,7 @@ def parse_args():
     )
     parser.add_argument(
         "--variance-method", type=str, default="placebo",
-        choices=["bootstrap", "placebo"],
+        choices=["bootstrap", "jackknife", "placebo"],
         help="Variance estimation method (default: placebo to match R)"
     )
     parser.add_argument(
diff --git a/diff_diff/results.py b/diff_diff/results.py
@@ -662,7 +662,7 @@ class SyntheticDiDResults:
     att : float
         Average Treatment effect on the Treated (ATT).
     se : float
-        Standard error of the ATT estimate (bootstrap or placebo-based).
+        Standard error of the ATT estimate (bootstrap, jackknife, or placebo-based).
     t_stat : float
         T-statistic for the ATT estimate.
     p_value : float
@@ -684,7 +684,12 @@ class SyntheticDiDResults:
     post_periods : list
         List of post-treatment period identifiers.
     variance_method : str
-        Method used for variance estimation: "bootstrap" or "placebo".
+        Method used for variance estimation: "bootstrap", "jackknife", or "placebo".
+    placebo_effects : np.ndarray, optional
+        Method-specific per-iteration estimates: placebo treatment effects
+        (for "placebo"), bootstrap ATT estimates (for "bootstrap"), or
+        leave-one-out estimates (for "jackknife"). The ``variance_method``
+        field disambiguates the contents.
     """
 
     att: float
diff --git a/diff_diff/synthetic_did.py b/diff_diff/synthetic_did.py
@@ -53,10 +53,15 @@ class SyntheticDiD(DifferenceInDifferences):
           Implements Algorithm 4 from Arkhangelsky et al. (2021). This is R's default.
         - "bootstrap": Bootstrap at unit level with fixed weights matching R's
           synthdid::vcov(method="bootstrap").
+        - "jackknife": Jackknife variance matching R's synthdid::vcov(method="jackknife").
+          Implements Algorithm 3 from Arkhangelsky et al. (2021). Deterministic
+          (N_control + N_treated iterations), uses fixed weights (no re-estimation).
+          The ``n_bootstrap`` parameter is ignored for this method.
     n_bootstrap : int, default=200
         Number of replications for variance estimation. Used for both:
         - Bootstrap: Number of bootstrap samples
         - Placebo: Number of random permutations (matches R's `replications` argument)
+        Ignored when ``variance_method="jackknife"``.
     seed : int, optional
         Random seed for reproducibility. If None (default), results
         will vary between runs.
@@ -163,15 +168,15 @@ def __init__(
         self.n_bootstrap = n_bootstrap
         self.seed = seed
 
-        # Validate n_bootstrap
-        if n_bootstrap < 2:
+        # Validate n_bootstrap (irrelevant for jackknife, which is deterministic)
+        if n_bootstrap < 2 and variance_method != "jackknife":
             raise ValueError(
                 f"n_bootstrap must be >= 2 (got {n_bootstrap}). At least 2 "
                 f"iterations are needed to estimate standard errors."
             )
 
         # Validate variance_method
-        valid_methods = ("bootstrap", "placebo")
+        valid_methods = ("bootstrap", "jackknife", "placebo")
         if variance_method not in valid_methods:
             raise ValueError(
                 f"variance_method must be one of {valid_methods}, " f"got '{variance_method}'"
@@ -269,18 +274,19 @@ def fit(  # type: ignore[override]
                 f"Got '{resolved_survey.weight_type}'."
             )
 
-        # Reject placebo + full survey design (strata/PSU/FPC are silently ignored)
+        # Reject non-bootstrap + full survey design (strata/PSU/FPC need Rao-Wu)
         if (
             resolved_survey is not None
             and (
                 resolved_survey.strata is not None
                 or resolved_survey.psu is not None
                 or resolved_survey.fpc is not None
             )
-            and self.variance_method == "placebo"
+            and self.variance_method != "bootstrap"
         ):
             raise NotImplementedError(
-                "SyntheticDiD with variance_method='placebo' does not support strata/PSU/FPC. "
+                f"SyntheticDiD with variance_method='{self.variance_method}' does not "
+                "support strata/PSU/FPC. "
                 "Use variance_method='bootstrap' for full survey design support."
             )
 
@@ -510,6 +516,20 @@ def fit(  # type: ignore[override]
             )
             placebo_effects = bootstrap_estimates
             inference_method = "bootstrap"
+        elif self.variance_method == "jackknife":
+            # Fixed-weight jackknife (R's synthdid Algorithm 3)
+            se, jackknife_estimates = self._jackknife_se(
+                Y_pre_control,
+                Y_post_control,
+                Y_pre_treated,
+                Y_post_treated,
+                unit_weights,
+                time_weights,
+                w_treated=w_treated,
+                w_control=w_control,
+            )
+            placebo_effects = jackknife_estimates
+            inference_method = "jackknife"
         else:
             # Use placebo-based variance (R's synthdid Algorithm 4)
             se, placebo_effects = self._placebo_variance_se(
@@ -528,7 +548,14 @@ def fit(  # type: ignore[override]
 
         # Compute test statistics
         t_stat, p_value_analytical, conf_int = safe_inference(att, se, alpha=self.alpha)
-        if len(placebo_effects) > 0 and np.isfinite(t_stat):
+        # Empirical p-value for placebo/bootstrap (null-distribution draws).
+        # Jackknife leave-one-out estimates are NOT null-distribution draws,
+        # so use the analytical (normal) p-value instead.
+        if (
+            inference_method != "jackknife"
+            and len(placebo_effects) > 0
+            and np.isfinite(t_stat)
+        ):
             p_value = max(
                 np.mean(np.abs(placebo_effects) >= np.abs(att)),
                 1.0 / (len(placebo_effects) + 1),
@@ -1106,6 +1133,171 @@ def _placebo_variance_se(
 
         return se, placebo_estimates
 
+    def _jackknife_se(
+        self,
+        Y_pre_control: np.ndarray,
+        Y_post_control: np.ndarray,
+        Y_pre_treated: np.ndarray,
+        Y_post_treated: np.ndarray,
+        unit_weights: np.ndarray,
+        time_weights: np.ndarray,
+        w_treated=None,
+        w_control=None,
+    ) -> Tuple[float, np.ndarray]:
+        """Compute jackknife standard error matching R's synthdid Algorithm 3.
+
+        Delete-1 jackknife over all units (control + treated) with **fixed**
+        weights.  For each leave-one-out sample the original omega is subsetted
+        and renormalized; lambda stays unchanged.  No Frank-Wolfe
+        re-estimation, making this the fastest variance method.
+
+        This matches R's ``synthdid::vcov(method="jackknife")`` which sets
+        ``update.omega=FALSE, update.lambda=FALSE``.
+
+        Parameters
+        ----------
+        Y_pre_control : np.ndarray
+            Control outcomes in pre-treatment periods, shape (n_pre, n_control).
+        Y_post_control : np.ndarray
+            Control outcomes in post-treatment periods, shape (n_post, n_control).
+        Y_pre_treated : np.ndarray
+            Treated outcomes in pre-treatment periods, shape (n_pre, n_treated).
+        Y_post_treated : np.ndarray
+            Treated outcomes in post-treatment periods, shape (n_post, n_treated).
+        unit_weights : np.ndarray
+            Unit weights from Frank-Wolfe optimization, shape (n_control,).
+        time_weights : np.ndarray
+            Time weights from Frank-Wolfe optimization, shape (n_pre,).
+        w_treated : np.ndarray, optional
+            Survey probability weights for treated units.
+        w_control : np.ndarray, optional
+            Survey probability weights for control units.
+
+        Returns
+        -------
+        tuple
+            (se, jackknife_estimates) where se is the standard error and
+            jackknife_estimates is a length-N array of leave-one-out estimates
+            (first n_control entries are control-LOO, last n_treated are
+            treated-LOO).
+
+        References
+        ----------
+        Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., & Wager, S.
+        (2021). Synthetic Difference-in-Differences. American Economic Review,
+        111(12), 4088-4118. Algorithm 3.
+        """
+        n_control = Y_pre_control.shape[1]
+        n_treated = Y_pre_treated.shape[1]
+        n = n_control + n_treated
+
+        # --- Early-return NaN: matches R's NA conditions ---
+        if n_treated <= 1:
+            warnings.warn(
+                "Jackknife variance requires more than 1 treated unit. "
+                "Use variance_method='placebo' for single treated unit.",
+                UserWarning,
+                stacklevel=3,
+            )
+            return np.nan, np.array([])
+
+        if np.sum(unit_weights > 0) <= 1:
+            warnings.warn(
+                "Jackknife variance requires more than 1 control unit with "
+                "nonzero weight. Consider variance_method='placebo'.",
+                UserWarning,
+                stacklevel=3,
+            )
+            return np.nan, np.array([])
+
+        jackknife_estimates = np.empty(n)
+
+        # --- Precompute treated means (constant across control-LOO) ---
+        if w_treated is not None:
+            treated_pre_mean = np.average(Y_pre_treated, axis=1, weights=w_treated)
+            treated_post_mean = np.average(Y_post_treated, axis=1, weights=w_treated)
+        else:
+            treated_pre_mean = np.mean(Y_pre_treated, axis=1)
+            treated_post_mean = np.mean(Y_post_treated, axis=1)
+
+        # --- Precompute omega composed with survey weights (for treated-LOO) ---
+        if w_control is not None:
+            omega_eff_full = unit_weights * w_control
+            omega_eff_full = omega_eff_full / omega_eff_full.sum()
+        else:
+            omega_eff_full = unit_weights
+
+        # --- Leave-one-out over control units ---
+        mask = np.ones(n_control, dtype=bool)
+        for j in range(n_control):
+            mask[j] = False
+
+            # Subset and renormalize omega
+            omega_jk = _sum_normalize(unit_weights[mask])
+
+            # Compose with survey weights if present
+            if w_control is not None:
+                omega_jk = omega_jk * w_control[mask]
+                omega_jk = omega_jk / omega_jk.sum()
+
+            jackknife_estimates[j] = compute_sdid_estimator(
+                Y_pre_control[:, mask],
+                Y_post_control[:, mask],
+                treated_pre_mean,
+                treated_post_mean,
+                omega_jk,
+                time_weights,
+            )
+
+            mask[j] = True  # restore for next iteration
+
+        # --- Leave-one-out over treated units ---
+        mask = np.ones(n_treated, dtype=bool)
+        for k in range(n_treated):
+            mask[k] = False
+
+            # Recompute treated means from remaining units
+            if w_treated is not None:
+                w_t_jk = w_treated[mask]
+                t_pre_mean = np.average(
+                    Y_pre_treated[:, mask], axis=1, weights=w_t_jk
+                )
+                t_post_mean = np.average(
+                    Y_post_treated[:, mask], axis=1, weights=w_t_jk
+                )
+            else:
+                t_pre_mean = np.mean(Y_pre_treated[:, mask], axis=1)
+                t_post_mean = np.mean(Y_post_treated[:, mask], axis=1)
+
+            jackknife_estimates[n_control + k] = compute_sdid_estimator(
+                Y_pre_control,
+                Y_post_control,
+                t_pre_mean,
+                t_post_mean,
+                omega_eff_full,
+                time_weights,
+            )
+
+            mask[k] = True  # restore for next iteration
+
+        # --- Check for non-finite estimates (propagate NaN like R's var()) ---
+        if not np.all(np.isfinite(jackknife_estimates)):
+            warnings.warn(
+                "Some jackknife leave-one-out estimates are non-finite. "
+                "Standard error cannot be computed.",
+                UserWarning,
+                stacklevel=3,
+            )
+            return np.nan, jackknife_estimates
+
+        # --- Jackknife SE formula: sqrt((n-1)/n * sum((u - ubar)^2)) ---
+        # Matches R's: sqrt(((n-1)/n) * (n-1) * var(u))
+        u_bar = np.mean(jackknife_estimates)
+        ss = np.sum((jackknife_estimates - u_bar) ** 2)
+        se = np.sqrt((n - 1) / n * ss)
+
+        return se, jackknife_estimates
+
     def get_params(self) -> Dict[str, Any]:
         """Get estimator parameters."""
         return {
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
@@ -1472,6 +1472,23 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi
   5. Compute SDID estimate with renormalized ω and original λ
   6. `SE = sd(bootstrap_estimates, ddof=1)`
 
+- Alternative: Jackknife variance (matching R's `synthdid::vcov(method="jackknife")`)
+  Implements Algorithm 3 from Arkhangelsky et al. (2021):
+  1. For each control unit j=1,...,N_co:
+     - Remove unit j, renormalize omega: `ω_jk = _sum_normalize(ω[remaining])`
+     - Keep λ unchanged, keep treated means unchanged
+     - Compute SDID estimate τ_{(-j)}
+  2. For each treated unit k=1,...,N_tr:
+     - Keep ω and λ unchanged
+     - Recompute treated mean from remaining N_tr-1 treated units
+     - Compute SDID estimate τ_{(-k)}
+  3. `SE = sqrt( ((n-1)/n) × Σ (τ_{(-i)} - τ̄)² )` where n = N_co + N_tr
+
+  Fixed weights: No Frank-Wolfe re-estimation (`update.omega=FALSE, update.lambda=FALSE`).
+  Returns NaN SE for single treated unit or single nonzero-weight control.
+  Deterministic: exactly N_co + N_tr iterations, no replications parameter.
+  P-value: analytical (normal distribution), not empirical.
+
 *Edge cases:*
 - **Frank-Wolfe non-convergence**: Returns current weights after max_iter iterations. No warning emitted; the convergence check `vals[t-1] - vals[t] < min_decrease²` simply does not trigger early exit, and the final iterate is returned.
 - **`_sparsify` all-zero input**: If `max(v) <= 0`, returns uniform weights `ones(len(v)) / len(v)`.
@@ -1490,7 +1507,10 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi
 - **Varying treatment within unit**: Raises `ValueError`. SDID requires block treatment (constant within each unit). Suggests CallawaySantAnna or ImputationDiD for staggered adoption.
 - **Unbalanced panel**: Raises `ValueError`. SDID requires all units observed in all periods. Suggests `balance_panel()`.
 - **Poor pre-treatment fit**: Warns (`UserWarning`) when `pre_fit_rmse > std(treated_pre_outcomes, ddof=1)`. Diagnostic only; estimation proceeds.
-- **Note:** Survey support: weights, strata, PSU, and FPC are all supported. Full-design surveys use Rao-Wu rescaled bootstrap (Phase 6); `variance_method="placebo"` requires weights-only (strata/PSU/FPC require bootstrap). Both sides weighted per WLS regression interpretation: treated-side means are survey-weighted (Frank-Wolfe target and ATT formula); control-side synthetic weights are composed with survey weights post-optimization (ω_eff = ω * w_co, renormalized). Frank-Wolfe optimization itself is unweighted — survey importance enters after trajectory-matching. Covariate residualization uses WLS with survey weights. Placebo and bootstrap SE preserve survey weights on both sides.
+- **Jackknife with single treated unit**: Returns NaN SE. Cannot leave-one-out with N_tr=1; R returns NA for the same condition.
+- **Jackknife with single nonzero-weight control**: Returns NaN SE. Leaving out the only effective control is not meaningful.
+- **Jackknife with non-finite LOO estimate**: Returns NaN SE. Unlike bootstrap/placebo, jackknife is deterministic and cannot skip failed iterations; NaN propagates through `var()` (matches R behavior).
+- **Note:** Survey support: weights, strata, PSU, and FPC are all supported. Full-design surveys use Rao-Wu rescaled bootstrap (Phase 6); non-bootstrap variance methods (`variance_method="placebo"` or `"jackknife"`) require weights-only (strata/PSU/FPC require bootstrap). Both sides weighted per WLS regression interpretation: treated-side means are survey-weighted (Frank-Wolfe target and ATT formula); control-side synthetic weights are composed with survey weights post-optimization (ω_eff = ω * w_co, renormalized). Frank-Wolfe optimization itself is unweighted — survey importance enters after trajectory-matching. Covariate residualization uses WLS with survey weights. Placebo, jackknife, and bootstrap SE preserve survey weights on both sides.
 
 **Reference implementation(s):**
 - R: `synthdid::synthdid_estimate()` (Arkhangelsky et al.'s official package)
@@ -1505,6 +1525,9 @@ Convergence criterion: stop when objective decrease < min_decrease² (default mi
 - [x] Placebo SE formula: sqrt((r-1)/r) * sd(placebo_estimates)
 - [x] Placebo SE: re-estimates omega and lambda per replication (matching R's update.omega=TRUE, update.lambda=TRUE)
 - [x] Bootstrap: fixed weights (original lambda unchanged, omega renormalized for resampled controls)
+- [x] Jackknife SE: fixed weights, LOO all units, formula `sqrt((n-1)/n * sum((u-ubar)^2))`
+- [x] Jackknife: NaN SE for single treated or single nonzero-weight control
+- [x] Jackknife: analytical p-value (not empirical)
 - [x] Returns both unit and time weights for interpretation
 - [x] Column centering (intercept=True) in Frank-Wolfe optimization
 
diff --git a/tests/test_estimators.py b/tests/test_estimators.py
diff --git a/tests/test_methodology_sdid.py b/tests/test_methodology_sdid.py

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def parse_args():`
`45`	`45`	`)`
`46`	`46`	`parser.add_argument(`
`47`	`47`	`"--variance-method", type=str, default="placebo",`
`48`		`- choices=["bootstrap", "placebo"],`
	`48`	`+ choices=["bootstrap", "jackknife", "placebo"],`
`49`	`49`	`help="Variance estimation method (default: placebo to match R)"`
`50`	`50`	`)`
`51`	`51`	`parser.add_argument(`