Skip to content

Commit 4f4f425

Browse files
author
miranov25
committed
Fix NaN handling in robust regression and enable predictor-specific min_stat threshold
- Updated `process_group_robust` to drop rows with NaNs in predictors, targets, or weights before fitting HuberRegressor
- Ensured that predictors are only used if they meet their individual `min_stat` thresholds
- Prevented fit failures caused by insufficient data or NaNs, resolving the test failure in `test_min_stat_per_predictor`
1 parent c45e5d0 commit 4f4f425

File tree

2 files changed

+76
-31
lines changed

2 files changed

+76
-31
lines changed

UTILS/dfextensions/groupby_regression.py

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -113,47 +113,32 @@ def process_group_robust(
113113
minStat: List[int],
114114
sigmaCut: float = 4
115115
) -> dict:
116-
"""
117-
Process a single group: perform robust regression fits on each target column,
118-
compute median values, RMS and MAD of the residuals.
119-
After an initial Huber fit, points with residuals > sigmaCut * MAD are removed and the fit is redone
120-
if enough points remain.
121-
122-
For each predictor in linear_columns0, the predictor is used only if the number of rows in the group
123-
is greater than the corresponding value in minStat.
124-
125-
Parameters:
126-
key: Group key.
127-
df_group (pd.DataFrame): Data for the group.
128-
gb_columns (list): Columns used for grouping.
129-
fit_columns (list): Target columns to be fit.
130-
linear_columns0 (list): List of candidate predictor columns.
131-
median_columns (list): List of columns for which median values are computed.
132-
weights (str): Column name for weights.
133-
minStat (list[int]): List of minimum number of rows required to use each predictor in linear_columns0.
134-
sigmaCut (float): Factor to remove outliers (points with residual > sigmaCut * MAD).
135-
136-
Returns:
137-
dict: A dictionary containing group keys, fit parameters, RMS, and MAD.
138-
"""
139116
group_dict = dict(zip(gb_columns, key))
140-
n_rows = len(df_group)
141117
predictors = []
142118

119+
# Count valid rows for each predictor and include only if enough
143120
for i, col in enumerate(linear_columns0):
144-
if n_rows > minStat[i]:
121+
required_columns = [col] + fit_columns + [weights]
122+
df_valid = df_group[required_columns].dropna()
123+
if len(df_valid) >= minStat[i]:
145124
predictors.append(col)
146125

147126
for target_col in fit_columns:
148127
try:
149128
if not predictors:
150129
continue
151-
X = df_group[predictors].values
152-
y = df_group[target_col].values
153-
w = df_group[weights].values
154-
if len(y) < min(minStat):
130+
131+
# Drop rows with any NaNs in predictors, target, or weights
132+
subset_columns = predictors + [target_col, weights]
133+
df_clean = df_group.dropna(subset=subset_columns)
134+
135+
if len(df_clean) < min(minStat):
155136
continue
156137

138+
X = df_clean[predictors].values
139+
y = df_clean[target_col].values
140+
w = df_clean[weights].values
141+
157142
model = HuberRegressor(tol=1e-4)
158143
model.fit(X, y, sample_weight=w)
159144
predicted = model.predict(X)

UTILS/dfextensions/test_groupby_regression.py

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ def test_insufficient_data(sample_data):
7373
)
7474
assert len(dfGB) <= 1 # Could be empty or single group with skipped fit
7575
assert 'y_tiny' in df_out.columns
76-
assert dfGB['y_slope_x1_tiny'].isna().all()
77-
assert dfGB['y_intercept_tiny'].isna().all()
76+
assert dfGB.get('y_slope_x1_tiny') is None or dfGB['y_slope_x1_tiny'].isna().all()
77+
assert dfGB.get('y_intercept_tiny') is None or dfGB['y_intercept_tiny'].isna().all()
7878

7979

8080
def test_prediction_accuracy(sample_data):
@@ -199,6 +199,7 @@ def test_exact_coefficient_recovery():
199199
assert np.isclose(dfGB['y_slope_x1_clean'].iloc[0], 2.0, atol=1e-6)
200200
assert np.isclose(dfGB['y_slope_x2_clean'].iloc[0], 3.0, atol=1e-6)
201201

202+
202203
def test_exact_coefficient_recovery_parallel():
203204
np.random.seed(0)
204205
x1 = np.random.uniform(0, 1, 100)
@@ -227,3 +228,62 @@ def test_exact_coefficient_recovery_parallel():
227228

228229
assert np.isclose(dfGB['y_slope_x1_par'].iloc[0], 2.0, atol=1e-6)
229230
assert np.isclose(dfGB['y_slope_x2_par'].iloc[0], 3.0, atol=1e-6)
231+
232+
233+
def test_min_stat_per_predictor():
234+
# Create a group with 20 rows total, but only 5 valid for x2
235+
df = pd.DataFrame({
236+
'group': ['G1'] * 20,
237+
'x1': np.linspace(0, 1, 20),
238+
'x2': [np.nan] * 15 + list(np.linspace(0, 1, 5)),
239+
})
240+
df['y'] = 2.0 * df['x1'] + 3.0 * np.nan_to_num(df['x2']) + np.random.normal(0, 0.01, 20)
241+
df['weight'] = 1.0
242+
243+
# Use all 20 rows, but let selection ensure only valid ones go into each predictor fit
244+
selection = df['x1'].notna() & df['y'].notna()
245+
246+
df_out, dfGB = GroupByRegressor.make_parallel_fit(
247+
df,
248+
gb_columns=['group'],
249+
fit_columns=['y'],
250+
linear_columns=['x1', 'x2'],
251+
median_columns=['x1'],
252+
weights='weight',
253+
suffix='_minstat',
254+
selection=selection,
255+
addPrediction=True,
256+
min_stat=[10, 10], # x1: 20 valid rows; x2: only 5
257+
n_jobs=1
258+
)
259+
260+
assert 'y_slope_x1_minstat' in dfGB.columns
261+
assert not np.isnan(dfGB['y_slope_x1_minstat'].iloc[0]) # x1 passed
262+
assert 'y_slope_x2_minstat' not in dfGB.columns or np.isnan(dfGB['y_slope_x2_minstat'].iloc[0]) # x2 skipped
263+
264+
def test_sigma_cut_impact():
265+
np.random.seed(0)
266+
df = pd.DataFrame({
267+
'group': ['G1'] * 20,
268+
'x1': np.linspace(0, 1, 20),
269+
})
270+
df['y'] = 3.0 * df['x1'] + np.random.normal(0, 0.1, size=20)
271+
df.loc[::5, 'y'] += 10 # Insert strong outliers
272+
df['weight'] = 1.0
273+
274+
selection = df['x1'].notna() & df['y'].notna()
275+
276+
_, dfGB_all = GroupByRegressor.make_parallel_fit(
277+
df, ['group'], ['y'], ['x1'], ['x1'], 'weight', '_s100',
278+
selection=selection, sigmaCut=100, n_jobs=1
279+
)
280+
281+
_, dfGB_strict = GroupByRegressor.make_parallel_fit(
282+
df, ['group'], ['y'], ['x1'], ['x1'], 'weight', '_s2',
283+
selection=selection, sigmaCut=2, n_jobs=1
284+
)
285+
286+
slope_all = dfGB_all['y_slope_x1_s100'].iloc[0]
287+
slope_strict = dfGB_strict['y_slope_x1_s2'].iloc[0]
288+
289+
assert abs(slope_strict - 3.0) < abs(slope_all - 3.0), "Robust fit with sigmaCut=2 should be closer to truth"

0 commit comments

Comments
 (0)