143 changes: 83 additions & 60 deletions methods/matching/find_pairs.py
@@ -1,13 +1,16 @@
import argparse
import math
import os
import logging
from functools import partial
from multiprocessing import Pool, cpu_count, set_start_method
from numba import jit # type: ignore

import numpy as np
import pandas as pd
from numba import jit

from methods.common.luc import luc_matching_columns
from methods.utils.kd_tree import make_kdrangetree, make_rumba_tree

REPEAT_MATCH_FINDING = 100
DEFAULT_DISTANCE = 10000000.0
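For readers unfamiliar with numba: jit compiles array-heavy Python loops to machine code. A minimal sketch, not part of this PR, using the same decorator arguments that appear on make_s_set_mask later in this diff (the function name and the 1.0 threshold below are made up for illustration):

import numpy as np
from numba import jit

@jit(nopython=True, fastmath=True, error_model="numpy")
def within_threshold(a, b):
    # True if every element of a is within 1.0 of the matching element of b,
    # mirroring the per-column check in the pre-change make_s_set_mask loop.
    for j in range(a.shape[0]):
        if abs(a[j] - b[j]) > 1.0:
            return False
    return True

print(within_threshold(np.zeros(3, dtype=np.float32),
                       np.full(3, 0.5, dtype=np.float32)))  # True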
@@ -35,12 +38,7 @@ def find_match_iteration(

logging.info("Loading K from %s", k_parquet_filename)

# Methodology 6.5.7: For a 10% sample of K
k_set = pd.read_parquet(k_parquet_filename)
k_subset = k_set.sample(
frac=0.1,
random_state=rng
).reset_index()

logging.info("Loading M from %s", m_parquet_filename)
m_set = pd.read_parquet(m_parquet_filename)
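The 10% sample of K removed here is re-introduced further down, after the no-potentials mask exists. A minimal sketch of that sampling call on a made-up frame standing in for the parquet file (pandas accepts a numpy Generator as random_state, which is how rng is used throughout this function):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
# Hypothetical stand-in for the frame read from k_parquet_filename.
k_set = pd.DataFrame({"country": np.arange(20) % 3,
                      "elevation": np.linspace(0.0, 100.0, 20)})

# Methodology 6.5.7: take a 10% sample of K.
k_subset = k_set.sample(frac=0.1, random_state=rng).reset_index()
print(k_subset.shape)  # (2, 3): two sampled rows plus the preserved original index column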
@@ -61,11 +59,21 @@
logging.info("Preparing s_set...")

m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns
k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns
k_set_dist_thresholded_df = k_set[DISTANCE_COLUMNS] / thresholds_for_columns

# Rearrange columns by variance so we throw out the least likely to match first
# except the bottom three which are deforestation CPCs and have more cross-variance between K and M
variances = np.std(m_dist_thresholded_df, axis=0)
cols = DISTANCE_COLUMNS
order = np.argsort(-variances.to_numpy())
order = np.roll(order, 3)
new_cols = [cols[o] for o in order]
m_dist_thresholded_df = m_dist_thresholded_df[new_cols]
k_set_dist_thresholded_df = k_set_dist_thresholded_df[new_cols]

# convert to float32 numpy arrays and make them contiguous for numba to vectorise
m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32)
k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32)
k_set_dist_thresholded = np.ascontiguousarray(k_set_dist_thresholded_df, dtype=np.float32)

# LUC columns are all named with the year in them, so calculate the column names
# for the years we are interested in
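To make the variance-based reordering earlier in this hunk concrete: np.argsort(-variances) puts the highest-variance columns first, and np.roll(order, 3) wraps the last three entries of that order round to the front. A standalone sketch with made-up column names standing in for DISTANCE_COLUMNS:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "elevation": [0.0, 50.0, 100.0],  # high variance
    "slope":     [1.0, 1.1, 1.2],     # low variance
    "access":    [5.0, 15.0, 25.0],
    "cpc0_u":    [0.1, 0.2, 0.3],
    "cpc0_d":    [0.0, 0.1, 0.0],
    "cpc5_u":    [0.3, 0.2, 0.1],
})

variances = np.std(df, axis=0)
order = np.argsort(-variances.to_numpy())  # highest-variance columns first
order = np.roll(order, 3)                  # last three entries move to the front
new_cols = [df.columns[o] for o in order]
reordered = np.ascontiguousarray(df[new_cols], dtype=np.float32)
print(new_cols, reordered.flags["C_CONTIGUOUS"])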
@@ -76,32 +84,50 @@
hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
assert len(hard_match_columns) == HARD_COLUMN_COUNT

# similar to the above, make the hard match columns contiguous float32 numpy arrays
m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
# Find categories in K
hard_match_categories = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
hard_match_categories = {k.tobytes(): k for k in hard_match_categories}
no_potentials = []

# Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
# pixel in the subsample (which is 10% the size of K) we select 100 pixels.
required = 100
# Methodology 6.5.5: S should be 10 times the size of K
required = 10

logging.info("Running make_s_set_mask... required: %d", required)
starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))
s_set_mask_true, no_potentials = make_s_set_mask(
m_dist_thresholded,
k_subset_dist_thresholded,
m_dist_hard,
k_subset_dist_hard,
starting_positions,
required
)

s_set_mask_true = np.zeros(m_set.shape[0], dtype=np.bool_)
no_potentials = np.zeros(k_set.shape[0], dtype=np.bool_)

# Split K and M into those categories and create masks
for values in hard_match_categories.values():
k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
logging.info(" category: %a |K|: %d |M|: %d", values, k_selector.sum(), m_selector.sum())
# Make masks for each of those pairs
key_s_set_mask_true, key_no_potentials = make_s_set_mask(
m_dist_thresholded[m_selector],
k_set_dist_thresholded[k_selector],
required,
rng
)
# Merge into one s_set_mask_true
s_set_mask_true[m_selector] = key_s_set_mask_true
# Merge into no_potentials
no_potentials[k_selector] = key_no_potentials

logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})

s_set = m_set[s_set_mask_true]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
potentials = np.invert(no_potentials)

k_subset = k_subset[potentials]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
# Methodology 6.5.7: For a 10% sample of K
k_subset = k_set.sample(
frac=0.1,
random_state=rng
)
k_subset = k_subset[k_subset.apply(lambda row: potentials[row.name], axis=1)]
k_subset = k_subset.reset_index()
logging.info("Finished preparing k_subset. shape: %a", {k_subset.shape})

# Notes:
# 1. Not all pixels may have matches
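Two details in the category split above are worth calling out: the (country, ecoregion, LUC) combinations are deduplicated by keying a dict on the raw bytes of each row, and the per-category boolean selectors are rebuilt with np.all(... == values, axis=1). A self-contained sketch of that pattern on placeholder columns:

import numpy as np
import pandas as pd

hard_match_columns = ["country", "ecoregion"]  # placeholder subset of the real hard columns
k_set = pd.DataFrame({"country": [1, 1, 2], "ecoregion": [7, 7, 9]})
m_set = pd.DataFrame({"country": [1, 2, 2, 1], "ecoregion": [7, 9, 9, 8]})

# Deduplicate the combinations present in K, keyed by their raw bytes.
rows = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
categories = {row.tobytes(): row for row in rows}

for values in categories.values():
    k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
    m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
    print(values, int(k_selector.sum()), int(m_selector.sum()))
# Prints two categories: [1 7] with 2 K rows / 1 M row, and [2 9] with 1 K row / 2 M rows.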
@@ -173,56 +199,51 @@ def find_match_iteration(

logging.info("Finished find match iteration")

@jit(nopython=True, fastmath=True, error_model="numpy")
def make_s_set_mask(
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
m_dist_hard: np.ndarray,
k_subset_dist_hard: np.ndarray,
starting_positions: np.ndarray,
required: int
k_set_dist_thresholded: np.ndarray,
required: int,
rng: np.random.Generator
):
k_size = k_set_dist_thresholded.shape[0]
m_size = m_dist_thresholded.shape[0]
k_size = k_subset_dist_thresholded.shape[0]

s_include = np.zeros(m_size, dtype=np.bool_)
k_miss = np.zeros(k_size, dtype=np.bool_)

for k in range(k_size):
matches = 0
k_row = k_subset_dist_thresholded[k, :]
k_hard = k_subset_dist_hard[k]
m_sets = max(1, min(100, math.floor(m_size // 1e6), math.ceil(m_size / (k_size * required * 10))))

for index in range(m_size):
m_index = (index + starting_positions[k]) % m_size
m_lookup = np.arange(m_size)
rng.shuffle(m_lookup)
m_step = math.ceil(m_size / m_sets)

m_row = m_dist_thresholded[m_index, :]
m_hard = m_dist_hard[m_index]
def m_index(m_set: int, pos: int):
return m_lookup[m_set * m_step + pos]
def m_indexes(m_set: int):
return m_lookup[m_set * m_step:(m_set + 1) * m_step]

should_include = True
m_trees = [make_kdrangetree(m_dist_thresholded[m_indexes(m_set)], np.ones(m_dist_thresholded.shape[1])) for m_set in range(m_sets)]

# check that every element of m_hard matches k_hard
hard_equals = True
for j in range(m_hard.shape[0]):
if m_hard[j] != k_hard[j]:
hard_equals = False
rumba_trees = [make_rumba_tree(m_tree, m_dist_thresholded) for m_tree in m_trees]

if not hard_equals:
should_include = False
for k in range(k_size):
k_row = k_set_dist_thresholded[k]
m_order = np.arange(m_sets)
rng.shuffle(m_order)
possible_s = []
for m_set in m_order:
next_possible_s = rumba_trees[m_set].members_sample(k_row, required, rng)
if possible_s is None:
possible_s = [m_index(m_set, s) for s in next_possible_s]
else:
for j in range(m_row.shape[0]):
if abs(m_row[j] - k_row[j]) > 1.0:
should_include = False

if should_include:
s_include[m_index] = True
matches += 1

# Don't find any more M's
if matches == required:
take = min(required - len(possible_s), len(next_possible_s))
possible_s[len(possible_s):len(possible_s)+take] = [m_index(m_set, s) for s in next_possible_s[0:take]]
if len(possible_s) == required:
break

k_miss[k] = matches == 0
if len(possible_s) == 0:
k_miss[k] = True
else:
s_include[possible_s] = True

return s_include, k_miss
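The trickiest part of the new make_s_set_mask is the shuffled-partition bookkeeping (m_lookup, m_step, m_index, m_indexes); the kd-range trees themselves come from methods.utils.kd_tree, and their interface is only known here from the make_kdrangetree, make_rumba_tree and members_sample calls shown above. A self-contained illustration of just the partition indexing with plain numpy:

import math
import numpy as np

rng = np.random.default_rng(0)
m_size, m_sets = 10, 3

m_lookup = np.arange(m_size)
rng.shuffle(m_lookup)                  # random permutation of all M indices
m_step = math.ceil(m_size / m_sets)    # rows per partition; the last partition may be short

def m_indexes(m_set: int) -> np.ndarray:
    # Original-M indices belonging to one partition.
    return m_lookup[m_set * m_step:(m_set + 1) * m_step]

def m_index(m_set: int, pos: int) -> int:
    # Map a position inside a partition back to an index into the full M array.
    return int(m_lookup[m_set * m_step + pos])

# The partitions cover every M row exactly once.
assert sorted(i for s in range(m_sets) for i in m_indexes(s)) == list(range(m_size))
print([m_indexes(s).tolist() for s in range(m_sets)])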

@@ -278,6 +299,8 @@ def greedy_match(
results.append((k_idx, min_dist_idx))
s_available[min_dist_idx] = False
total_available -= 1
else:
matchless.append(k_idx)
else:
matchless.append(k_idx)
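For context on the two matchless branches this hunk adds, a minimal sketch of the greedy pattern used by greedy_match (the distance matrix and availability mask here are made up; the real function operates on the arrays prepared earlier in the file):

import numpy as np

# Hypothetical distances: rows are K pixels, columns are candidate S pixels.
dists = np.array([[0.2, 0.9, 0.5],
                  [0.8, 0.1, 0.4],
                  [0.7, 0.6, 0.3]])
s_available = np.ones(dists.shape[1], dtype=np.bool_)
total_available = int(s_available.sum())

results, matchless = [], []
for k_idx in range(dists.shape[0]):
    if total_available > 0:
        masked = np.where(s_available, dists[k_idx], np.inf)
        min_dist_idx = int(np.argmin(masked))
        if np.isfinite(masked[min_dist_idx]):
            # Claim the closest still-available S pixel for this K pixel.
            results.append((k_idx, min_dist_idx))
            s_available[min_dist_idx] = False
            total_available -= 1
        else:
            matchless.append(k_idx)
    else:
        matchless.append(k_idx)

print(results, matchless)  # [(0, 0), (1, 1), (2, 2)] []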
