143 changes: 83 additions & 60 deletions methods/matching/find_pairs.py
@@ -1,13 +1,16 @@
import argparse
import math
import os
import logging
from functools import partial
from multiprocessing import Pool, cpu_count, set_start_method
from numba import jit # type: ignore

import numpy as np
import pandas as pd
from numba import jit

from methods.common.luc import luc_matching_columns
from methods.utils.kd_tree import make_kdrangetree, make_rumba_tree

REPEAT_MATCH_FINDING = 100
DEFAULT_DISTANCE = 10000000.0
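For readers unfamiliar with numba: jit compiles array-heavy Python loops to machine code. A minimal sketch, not part of this PR, using the same decorator arguments that appear on make_s_set_mask later in this diff (the function name and the 1.0 threshold below are made up for illustration):

import numpy as np
from numba import jit

@jit(nopython=True, fastmath=True, error_model="numpy")
def within_threshold(a, b):
    # True if every element of a is within 1.0 of the matching element of b,
    # mirroring the per-column check in the pre-change make_s_set_mask loop.
    for j in range(a.shape[0]):
        if abs(a[j] - b[j]) > 1.0:
            return False
    return True

print(within_threshold(np.zeros(3, dtype=np.float32),
                       np.full(3, 0.5, dtype=np.float32)))  # True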
@@ -35,12 +38,7 @@ def find_match_iteration(

logging.info("Loading K from %s", k_parquet_filename)

# Methodology 6.5.7: For a 10% sample of K
k_set = pd.read_parquet(k_parquet_filename)
k_subset = k_set.sample(
frac=0.1,
random_state=rng
).reset_index()

logging.info("Loading M from %s", m_parquet_filename)
m_set = pd.read_parquet(m_parquet_filename)
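The 10% sample of K removed here is re-introduced further down, after the no-potentials mask exists. A minimal sketch of that sampling call on a made-up frame standing in for the parquet file (pandas accepts a numpy Generator as random_state, which is how rng is used throughout this function):

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
# Hypothetical stand-in for the frame read from k_parquet_filename.
k_set = pd.DataFrame({"country": np.arange(20) % 3,
                      "elevation": np.linspace(0.0, 100.0, 20)})

# Methodology 6.5.7: take a 10% sample of K.
k_subset = k_set.sample(frac=0.1, random_state=rng).reset_index()
print(k_subset.shape)  # (2, 3): two sampled rows plus the preserved original index column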
@@ -61,11 +59,21 @@
logging.info("Preparing s_set...")

m_dist_thresholded_df = m_set[DISTANCE_COLUMNS] / thresholds_for_columns
k_subset_dist_thresholded_df = k_subset[DISTANCE_COLUMNS] / thresholds_for_columns
k_set_dist_thresholded_df = k_set[DISTANCE_COLUMNS] / thresholds_for_columns

# Rearrange columns by variance so we throw out the least likely to match first
# except the bottom three which are deforestation CPCs and have more cross-variance between K and M
variances = np.std(m_dist_thresholded_df, axis=0)
cols = DISTANCE_COLUMNS
order = np.argsort(-variances.to_numpy())
order = np.roll(order, 3)
new_cols = [cols[o] for o in order]
m_dist_thresholded_df = m_dist_thresholded_df[new_cols]
k_set_dist_thresholded_df = k_set_dist_thresholded_df[new_cols]

# convert to float32 numpy arrays and make them contiguous for numba to vectorise
m_dist_thresholded = np.ascontiguousarray(m_dist_thresholded_df, dtype=np.float32)
k_subset_dist_thresholded = np.ascontiguousarray(k_subset_dist_thresholded_df, dtype=np.float32)
k_set_dist_thresholded = np.ascontiguousarray(k_set_dist_thresholded_df, dtype=np.float32)

# LUC columns are all named with the year in them, so calculate the column names
# for the years we are interested in
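To make the variance-based reordering earlier in this hunk concrete: np.argsort(-variances) puts the highest-variance columns first, and np.roll(order, 3) wraps the last three entries of that order round to the front. A standalone sketch with made-up column names standing in for DISTANCE_COLUMNS:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "elevation": [0.0, 50.0, 100.0],  # high variance
    "slope":     [1.0, 1.1, 1.2],     # low variance
    "access":    [5.0, 15.0, 25.0],
    "cpc0_u":    [0.1, 0.2, 0.3],
    "cpc0_d":    [0.0, 0.1, 0.0],
    "cpc5_u":    [0.3, 0.2, 0.1],
})

variances = np.std(df, axis=0)
order = np.argsort(-variances.to_numpy())  # highest-variance columns first
order = np.roll(order, 3)                  # last three entries move to the front
new_cols = [df.columns[o] for o in order]
reordered = np.ascontiguousarray(df[new_cols], dtype=np.float32)
print(new_cols, reordered.flags["C_CONTIGUOUS"])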
@@ -76,32 +84,50 @@
hard_match_columns = ['country', 'ecoregion', luc10, luc5, luc0]
assert len(hard_match_columns) == HARD_COLUMN_COUNT

# similar to the above, make the hard match columns contiguous float32 numpy arrays
m_dist_hard = np.ascontiguousarray(m_set[hard_match_columns].to_numpy()).astype(np.int32)
k_subset_dist_hard = np.ascontiguousarray(k_subset[hard_match_columns].to_numpy()).astype(np.int32)
# Find categories in K
hard_match_categories = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
hard_match_categories = {k.tobytes(): k for k in hard_match_categories}
no_potentials = []

# Methodology 6.5.5: S should be 10 times the size of K, in order to achieve this for every
# pixel in the subsample (which is 10% the size of K) we select 100 pixels.
required = 100
# Methodology 6.5.5: S should be 10 times the size of K
required = 10

logging.info("Running make_s_set_mask... required: %d", required)
starting_positions = rng.integers(0, int(m_dist_thresholded.shape[0]), int(k_subset_dist_thresholded.shape[0]))
s_set_mask_true, no_potentials = make_s_set_mask(
m_dist_thresholded,
k_subset_dist_thresholded,
m_dist_hard,
k_subset_dist_hard,
starting_positions,
required
)

s_set_mask_true = np.zeros(m_set.shape[0], dtype=np.bool_)
no_potentials = np.zeros(k_set.shape[0], dtype=np.bool_)

# Split K and M into those categories and create masks
for values in hard_match_categories.values():
k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
logging.info(" category: %a |K|: %d |M|: %d", values, k_selector.sum(), m_selector.sum())
# Make masks for each of those pairs
key_s_set_mask_true, key_no_potentials = make_s_set_mask(
m_dist_thresholded[m_selector],
k_set_dist_thresholded[k_selector],
required,
rng
)
# Merge into one s_set_mask_true
s_set_mask_true[m_selector] = key_s_set_mask_true
# Merge into no_potentials
no_potentials[k_selector] = key_no_potentials

logging.info("Done make_s_set_mask. s_set_mask.shape: %a", {s_set_mask_true.shape})

s_set = m_set[s_set_mask_true]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
potentials = np.invert(no_potentials)

k_subset = k_subset[potentials]
logging.info("Finished preparing s_set. shape: %a", {s_set.shape})
# Methodology 6.5.7: For a 10% sample of K
k_subset = k_set.sample(
frac=0.1,
random_state=rng
)
k_subset = k_subset[k_subset.apply(lambda row: potentials[row.name], axis=1)]
k_subset = k_subset.reset_index()
logging.info("Finished preparing k_subset. shape: %a", {k_subset.shape})

# Notes:
# 1. Not all pixels may have matches
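Two details in the category split above are worth calling out: the (country, ecoregion, LUC) combinations are deduplicated by keying a dict on the raw bytes of each row, and the per-category boolean selectors are rebuilt with np.all(... == values, axis=1). A self-contained sketch of that pattern on placeholder columns:

import numpy as np
import pandas as pd

hard_match_columns = ["country", "ecoregion"]  # placeholder subset of the real hard columns
k_set = pd.DataFrame({"country": [1, 1, 2], "ecoregion": [7, 7, 9]})
m_set = pd.DataFrame({"country": [1, 2, 2, 1], "ecoregion": [7, 9, 9, 8]})

# Deduplicate the combinations present in K, keyed by their raw bytes.
rows = [k[hard_match_columns].to_numpy() for _, k in k_set.iterrows()]
categories = {row.tobytes(): row for row in rows}

for values in categories.values():
    k_selector = np.all(k_set[hard_match_columns] == values, axis=1)
    m_selector = np.all(m_set[hard_match_columns] == values, axis=1)
    print(values, int(k_selector.sum()), int(m_selector.sum()))
# Prints two categories: [1 7] with 2 K rows / 1 M row, and [2 9] with 1 K row / 2 M rows.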
@@ -173,56 +199,51 @@ def find_match_iteration(

logging.info("Finished find match iteration")

@jit(nopython=True, fastmath=True, error_model="numpy")
def make_s_set_mask(
m_dist_thresholded: np.ndarray,
k_subset_dist_thresholded: np.ndarray,
m_dist_hard: np.ndarray,
k_subset_dist_hard: np.ndarray,
starting_positions: np.ndarray,
required: int
k_set_dist_thresholded: np.ndarray,
required: int,
rng: np.random.Generator
):
k_size = k_set_dist_thresholded.shape[0]
m_size = m_dist_thresholded.shape[0]
k_size = k_subset_dist_thresholded.shape[0]

s_include = np.zeros(m_size, dtype=np.bool_)
k_miss = np.zeros(k_size, dtype=np.bool_)

for k in range(k_size):
matches = 0
k_row = k_subset_dist_thresholded[k, :]
k_hard = k_subset_dist_hard[k]
m_sets = max(1, min(100, math.floor(m_size // 1e6), math.ceil(m_size / (k_size * required * 10))))

for index in range(m_size):
m_index = (index + starting_positions[k]) % m_size
m_lookup = np.arange(m_size)
rng.shuffle(m_lookup)
m_step = math.ceil(m_size / m_sets)

m_row = m_dist_thresholded[m_index, :]
m_hard = m_dist_hard[m_index]
def m_index(m_set: int, pos: int):
return m_lookup[m_set * m_step + pos]
def m_indexes(m_set: int):
return m_lookup[m_set * m_step:(m_set + 1) * m_step]

should_include = True
m_trees = [make_kdrangetree(m_dist_thresholded[m_indexes(m_set)], np.ones(m_dist_thresholded.shape[1])) for m_set in range(m_sets)]

# check that every element of m_hard matches k_hard
hard_equals = True
for j in range(m_hard.shape[0]):
if m_hard[j] != k_hard[j]:
hard_equals = False
rumba_trees = [make_rumba_tree(m_tree, m_dist_thresholded) for m_tree in m_trees]

if not hard_equals:
should_include = False
for k in range(k_size):
k_row = k_set_dist_thresholded[k]
m_order = np.arange(m_sets)
rng.shuffle(m_order)
possible_s = []
for m_set in m_order:
next_possible_s = rumba_trees[m_set].members_sample(k_row, required, rng)
if possible_s is None:
possible_s = [m_index(m_set, s) for s in next_possible_s]
else:
for j in range(m_row.shape[0]):
if abs(m_row[j] - k_row[j]) > 1.0:
should_include = False

if should_include:
s_include[m_index] = True
matches += 1

# Don't find any more M's
if matches == required:
take = min(required - len(possible_s), len(next_possible_s))
possible_s[len(possible_s):len(possible_s)+take] = [m_index(m_set, s) for s in next_possible_s[0:take]]
if len(possible_s) == required:
break

k_miss[k] = matches == 0
if len(possible_s) == 0:
k_miss[k] = True
else:
s_include[possible_s] = True

return s_include, k_miss
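The trickiest part of the new make_s_set_mask is the shuffled-partition bookkeeping (m_lookup, m_step, m_index, m_indexes); the kd-range trees themselves come from methods.utils.kd_tree, and their interface is only known here from the make_kdrangetree, make_rumba_tree and members_sample calls shown above. A self-contained illustration of just the partition indexing with plain numpy:

import math
import numpy as np

rng = np.random.default_rng(0)
m_size, m_sets = 10, 3

m_lookup = np.arange(m_size)
rng.shuffle(m_lookup)                  # random permutation of all M indices
m_step = math.ceil(m_size / m_sets)    # rows per partition; the last partition may be short

def m_indexes(m_set: int) -> np.ndarray:
    # Original-M indices belonging to one partition.
    return m_lookup[m_set * m_step:(m_set + 1) * m_step]

def m_index(m_set: int, pos: int) -> int:
    # Map a position inside a partition back to an index into the full M array.
    return int(m_lookup[m_set * m_step + pos])

# The partitions cover every M row exactly once.
assert sorted(i for s in range(m_sets) for i in m_indexes(s)) == list(range(m_size))
print([m_indexes(s).tolist() for s in range(m_sets)])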

@@ -278,6 +299,8 @@ def greedy_match(
results.append((k_idx, min_dist_idx))
s_available[min_dist_idx] = False
total_available -= 1
else:
matchless.append(k_idx)
else:
matchless.append(k_idx)
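For context on the two matchless branches this hunk adds, a minimal sketch of the greedy pattern used by greedy_match (the distance matrix and availability mask here are made up; the real function operates on the arrays prepared earlier in the file):

import numpy as np

# Hypothetical distances: rows are K pixels, columns are candidate S pixels.
dists = np.array([[0.2, 0.9, 0.5],
                  [0.8, 0.1, 0.4],
                  [0.7, 0.6, 0.3]])
s_available = np.ones(dists.shape[1], dtype=np.bool_)
total_available = int(s_available.sum())

results, matchless = [], []
for k_idx in range(dists.shape[0]):
    if total_available > 0:
        masked = np.where(s_available, dists[k_idx], np.inf)
        min_dist_idx = int(np.argmin(masked))
        if np.isfinite(masked[min_dist_idx]):
            # Claim the closest still-available S pixel for this K pixel.
            results.append((k_idx, min_dist_idx))
            s_available[min_dist_idx] = False
            total_available -= 1
        else:
            matchless.append(k_idx)
    else:
        matchless.append(k_idx)

print(results, matchless)  # [(0, 0), (1, 1), (2, 2)] []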
