49 changes: 39 additions & 10 deletions methods/matching/find_pairs.py
@@ -15,6 +15,15 @@

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# Implements pooled covariance logic similar to the R MatchIt package:
# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/aux_functions.R#L550
def pooled_scaled_cov(treatment, control):
    # Centre each group on its own mean (note: this mutates the inputs in place).
    treatment -= np.mean(treatment, axis=0)
    control -= np.mean(control, axis=0)
    combined_scaled = np.concatenate([treatment, control])
    size = len(combined_scaled)
    # np.cov divides by (size - 1); rescaling by (size - 1) / (size - 2) yields
    # the pooled estimate, which loses two degrees of freedom to the group means.
    return np.cov(combined_scaled, rowvar=False) * (size-1) / (size-2)
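
A quick way to see what `pooled_scaled_cov` computes: centring each group on its own mean and dividing the summed cross-products by `n - 2` is the textbook two-sample pooled covariance, since two group means are estimated. The sketch below (illustrative data, not from this PR) checks that equivalence; note the `.copy()` calls, needed because the function centres its arguments in place.

```python
import numpy as np

rng = np.random.default_rng(0)
treatment = rng.normal(size=(100, 3))
control = rng.normal(size=(200, 3))

# Textbook pooled covariance: centre per group, divide by n_t + n_c - 2.
centred = np.concatenate([treatment - treatment.mean(axis=0),
                          control - control.mean(axis=0)])
expected = centred.T @ centred / (len(centred) - 2)

assert np.allclose(pooled_scaled_cov(treatment.copy(), control.copy()), expected)
```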

def find_match_iteration(
k_parquet_filename: str,
s_parquet_filename: str,
@@ -52,10 +61,36 @@ def find_match_iteration(
# As well as all the LUC columns for later use
luc_columns = [x for x in s_set.columns if x.startswith('luc')]

s_subset_for_cov = s_subset[['elevation', 'slope', 'access', \
'cpc0_u', 'cpc0_d', 'cpc5_u', 'cpc5_d', 'cpc10_u', 'cpc10_d']]
covarience = np.cov(s_subset_for_cov, rowvar=False)
invconv = np.linalg.inv(covarience)
distance_columns = [
"elevation", "slope", "access",
"cpc0_u", "cpc0_d",
"cpc5_u", "cpc5_d",
"cpc10_u", "cpc10_d"
]

s_subset_for_cov = s_subset[distance_columns]
k_subset_for_cov = k_subset[distance_columns]

logging.info("Scaling the columns...")

# Scale the columns as per matchit
# https://github.com/kosukeimai/MatchIt/blob/7b7b48af995ad1d2ab83a72bad9a689bbb9e8151/R/dist_functions.R#L240
combined = np.concatenate([s_subset_for_cov, k_subset_for_cov])
means = np.mean(combined, axis=0)
stds = np.std(combined - means, axis=0)
s_subset_for_cov -= means
s_subset_for_cov /= stds
k_subset_for_cov -= means
k_subset_for_cov /= stds

s_subset[distance_columns] = s_subset_for_cov
k_subset[distance_columns] = k_subset_for_cov
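
The scaling above mirrors MatchIt's standardisation: both frames are shifted and divided by the mean and standard deviation of their concatenation, so the K and S columns land in the same units. One side note: `np.std` already centres internally, so subtracting `means` before taking the standard deviation does not change the result. A minimal standalone sketch of the same step (function name and signature are illustrative, not from this PR):

```python
import numpy as np
import pandas as pd

def scale_by_combined(a: pd.DataFrame, b: pd.DataFrame, columns: list) -> None:
    # Standardise both frames by the statistics of their concatenation.
    combined = np.concatenate([a[columns], b[columns]])
    means = combined.mean(axis=0)
    stds = combined.std(axis=0)
    a[columns] = (a[columns] - means) / stds
    b[columns] = (b[columns] - means) / stds
```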

logging.info("Calculating pooled covariance...")
covariance = pooled_scaled_cov(k_subset_for_cov, s_subset_for_cov)

logging.info("Calculating inverse covariance...")
invconv = np.linalg.inv(covariance).astype(np.float32)
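
One property worth noting: because the covariance is re-estimated from the already-scaled data, the Mahalanobis distances that `invconv` induces are the same as they would be without the scaling step at all, since Mahalanobis distance is invariant under invertible affine transforms. The scaling mainly keeps the matrix well conditioned for `np.linalg.inv`, and the `float32` cast presumably keeps the downstream distance arithmetic compact. A small self-contained check of the invariance, using a plain covariance estimate for brevity (the same algebra applies to the pooled one):

```python
import numpy as np

rng = np.random.default_rng(1)
# Four columns with wildly different scales.
x = rng.normal(size=(500, 4)) * np.array([1.0, 10.0, 100.0, 1000.0])

def mahalanobis(u, v, data):
    inv = np.linalg.inv(np.cov(data, rowvar=False))
    d = u - v
    return np.sqrt(d @ inv @ d)

scaled = (x - x.mean(axis=0)) / x.std(axis=0)
assert np.isclose(mahalanobis(x[0], x[1], x),
                  mahalanobis(scaled[0], scaled[1], scaled))
```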

for _, k_row in k_subset.iterrows():
# Methodology 6.5.7: find the matches.
@@ -85,12 +120,6 @@ def find_match_iteration(
# * slope
# * accessibility
# * coarsened proportional coverage
distance_columns = [
"elevation", "slope", "access",
"cpc0_u", "cpc0_d",
"cpc5_u", "cpc5_d",
"cpc10_u", "cpc10_d"
]
k_soft = np.array(k_row[distance_columns].to_list())
just_cols = filtered_s[distance_columns].to_numpy()
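
This hunk ends before the distance computation itself, but given `k_soft` (the single K row), `just_cols` (the candidate S rows), and `invconv` from above, the squared Mahalanobis distances would typically be computed in one vectorised pass along these lines (a sketch of the likely consumer, not code from this PR):

```python
import numpy as np

def mahalanobis_sq_to_candidates(k_soft: np.ndarray,
                                 just_cols: np.ndarray,
                                 invconv: np.ndarray) -> np.ndarray:
    # Row-wise differences between the K pixel and every S candidate.
    diff = (just_cols - k_soft).astype(np.float32)
    # diff[i] @ invconv @ diff[i] for every row i, in a single einsum.
    return np.einsum("ij,jk,ik->i", diff, invconv, diff)
```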
