Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 31 additions & 3 deletions tmd/datasets/tmd.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import sys
import subprocess
import tempfile
import numpy as np
import pandas as pd
from policyengine_us import Microsimulation
Expand All @@ -7,7 +10,6 @@
from tmd.datasets.taxcalc_dataset import create_tc_dataset
from tmd.utils.trace import trace1
from tmd.utils.taxcalc_utils import add_taxcalc_outputs
from tmd.utils.reweight import reweight


def create_tmd_2021():
Expand Down Expand Up @@ -38,15 +40,41 @@ def create_tmd_2021():
# ... drop CPS records with positive 2021 income tax amount
idx = combined[((combined.data_source == 0) & (combined.iitax > 0))].index
combined.drop(idx, inplace=True)
# ... scale CPS records weight in order to get correct population count
# ... scale CPS records weight to get correct population count
scale = np.where(combined.data_source == 0, CPS_WEIGHTS_SCALE, 1.0)
combined["s006"] *= scale

trace1("B", combined)

print("Reweighting...")
combined["s006_original"] = combined["s006"].values
combined = reweight(combined, 2021)
# Run reweighting in a subprocess so that prior PyTorch
# operations (PolicyEngine Microsimulation) don't affect
# gradient computation. Without this, autograd accumulation
# order differs at machine epsilon, which compounds over
# many optimizer iterations on the flat loss surface.
with tempfile.TemporaryDirectory() as tmpdir:
snapshot_path = f"{tmpdir}/snapshot.csv.gz"
result_path = f"{tmpdir}/result.csv.gz"
combined.to_csv(snapshot_path, index=False)
subprocess.run(
[
sys.executable,
"-c",
"import pandas as pd; "
"import sys; sys.path.insert(0, '.'); "
"from tmd.utils.reweight import reweight; "
f"df = pd.read_csv('{snapshot_path}'); "
"df = reweight(df, 2021); "
f"df[['RECID','s006']].to_csv("
f"'{result_path}', index=False)",
],
check=True,
)
reweighted = pd.read_csv(result_path)
combined["s006"] = combined.merge(
reweighted, on="RECID", suffixes=("_old", "")
)["s006"].values

trace1("C", combined)

Expand Down
3 changes: 2 additions & 1 deletion tmd/imputation_assumptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# parameters used in creation of national sampling weights:
REWEIGHT_MULTIPLIER_MIN = 0.1
REWEIGHT_MULTIPLIER_MAX = 10.0
REWEIGHT_DEVIATION_PENALTY = 0.0
REWEIGHT_DEVIATION_PENALTY = 0.0001
# penalty value of 1.0 says "this is as important as everything else"
# penalty value of 0.0 imposes no penalty
# uses squared relative L2 norm: sum((new - original)^2) / sum(original^2)
Loading