Commit 2785bc4

miranov25 committed
Add flexible regression model selection via fitter parameter
- Introduced `fitter` argument to `process_group_robust` and `make_parallel_fit`, accepting:
  - `"robust"` for HuberRegressor (the previous behavior)
  - `"ols"` for LinearRegression
  - `"auto"` (the default) to start with Huber and fall back to OLS on failure, preserving backward compatibility
  - a user-provided callable for custom fitters
- Ensured minimal code changes to preserve compatibility and logic structure
- Added error handling for fallback behavior and retained full prediction pipeline
1 parent 22ce23c commit 2785bc4
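
A minimal usage sketch of the new parameter, based on the signature introduced in this commit; the DataFrame and column names are copied from the `cast_dtype` example in the documentation diff below, and `suffix='_ols'` is illustrative:

```python
import numpy as np
import pandas as pd
from dfextensions.groupby_regression import GroupByRegressor

# Toy data, reused from the cast_dtype example in the docs diff below.
df = pd.DataFrame({
    'group': ['A'] * 10 + ['B'] * 10,
    'x': np.linspace(0, 1, 20),
    'y': np.linspace(0, 2, 20) + np.random.normal(0, 0.1, 20),
    'weight': 1.0,
})

# fitter="ols" fits each group with plain LinearRegression;
# fitter="robust" uses HuberRegressor; the default "auto" starts
# with Huber and falls back to LinearRegression if the fit fails.
df_out, dfGB = GroupByRegressor.make_parallel_fit(
    df,
    gb_columns=['group'],
    fit_columns=['y'],
    linear_columns=['x'],
    median_columns=['x'],
    weights='weight',
    suffix='_ols',
    selection=df['x'].notna(),
    fitter='ols',
)
```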

2 files changed: +73 −9 lines

UTILS/dfextensions/groupby_regression.md

Lines changed: 54 additions & 0 deletions
````diff
@@ -131,6 +131,60 @@ df_out, dfGB = GroupByRegressor.make_parallel_fit(
 ## Tips
 
 💡 Use `cast_dtype='float16'` for storage savings, but ensure it's compatible with downstream numerical precision requirements.
+**Improvements for groupby\_regression.md**
+
+---
+
+### Usage Example for `cast_dtype`
+
+In the `make_parallel_fit` and `make_linear_fit` functions, the `cast_dtype` parameter ensures consistent numeric precision for slope, intercept, and error terms. This is useful for long pipelines or for memory-sensitive applications.
+
+```python
+import pandas as pd
+import numpy as np
+from dfextensions.groupby_regression import GroupByRegressor
+
+# Sample DataFrame
+df = pd.DataFrame({
+    'group': ['A'] * 10 + ['B'] * 10,
+    'x': np.linspace(0, 1, 20),
+    'y': np.linspace(0, 2, 20) + np.random.normal(0, 0.1, 20),
+    'weight': 1.0,
+})
+
+# Linear fit with casting to float32
+df_out, dfGB = GroupByRegressor.make_parallel_fit(
+    df,
+    gb_columns=['group'],
+    fit_columns=['y'],
+    linear_columns=['x'],
+    median_columns=['x'],
+    weights='weight',
+    suffix='_f32',
+    selection=df['x'].notna(),
+    cast_dtype='float32',
+    addPrediction=True
+)
+
+# Check resulting data types
+print(dfGB.dtypes)
+```
+
+### Output (Example)
+
+```
+group               object
+x_f32              float64
+y_slope_x_f32      float32
+y_err_x_f32        float32
+y_intercept_f32    float32
+y_rms_f32          float32
+y_mad_f32          float32
+bin_count_f32        int64
+dtype: object
+```
+
+
 
 ## Recent Changes
 
````
UTILS/dfextensions/groupby_regression.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -4,7 +4,7 @@
 from sklearn.linear_model import LinearRegression, HuberRegressor
 from joblib import Parallel, delayed
 from numpy.linalg import inv, LinAlgError
-from typing import Union, List, Tuple
+from typing import Union, List, Tuple, Callable
 
 
 class GroupByRegressor:
@@ -111,12 +111,12 @@ def process_group_robust(
         median_columns: List[str],
         weights: str,
         minStat: List[int],
-        sigmaCut: float = 4
+        sigmaCut: float = 4,
+        fitter: Union[str, Callable] = "auto"
     ) -> dict:
         group_dict = dict(zip(gb_columns, key))
         predictors = []
 
-        # Count valid rows for each predictor and include only if enough
         for i, col in enumerate(linear_columns0):
             required_columns = [col] + fit_columns + [weights]
             df_valid = df_group[required_columns].dropna()
@@ -128,7 +128,6 @@ def process_group_robust(
         if not predictors:
             continue
 
-        # Drop rows with any NaNs in predictors, target, or weights
         subset_columns = predictors + [target_col, weights]
         df_clean = df_group.dropna(subset=subset_columns)
 
@@ -139,11 +138,20 @@ def process_group_robust(
         y = df_clean[target_col].values
         w = df_clean[weights].values
 
-        try:
+        model = None
+        if callable(fitter):
+            model = fitter()
+        elif fitter == "robust":
+            model = HuberRegressor(tol=1e-4)
+        elif fitter == "ols":
+            model = LinearRegression()
+        else:
             model = HuberRegressor(tol=1e-4)
+
+        try:
             model.fit(X, y, sample_weight=w)
         except Exception as e:
-            logging.warning(f"HuberRegressor failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
+            logging.warning(f"{model.__class__.__name__} failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
             model = LinearRegression()
             model.fit(X, y, sample_weight=w)
 
@@ -167,7 +175,7 @@ def process_group_robust(
             try:
                 model.fit(X[mask], y[mask], sample_weight=w[mask])
             except Exception as e:
-                logging.warning(f"HuberRegressor re-fit with outlier mask failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
+                logging.warning(f"{model.__class__.__name__} re-fit with outlier mask failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
                 model = LinearRegression()
                 model.fit(X[mask], y[mask], sample_weight=w[mask])
 
@@ -201,6 +209,7 @@ def process_group_robust(
             group_dict[col] = df_group[col].median()
 
         return group_dict
+
     @staticmethod
     def make_parallel_fit(
         df: pd.DataFrame,
@@ -215,7 +224,8 @@ def make_parallel_fit(
         cast_dtype: Union[str, None] = None,
         n_jobs: int = 1,
         min_stat: List[int] = [10, 10],
-        sigmaCut: float = 4.0
+        sigmaCut: float = 4.0,
+        fitter: Union[str, Callable] = "auto"
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Perform grouped robust linear regression using HuberRegressor in parallel.
@@ -244,7 +254,7 @@ def make_parallel_fit(
         results = Parallel(n_jobs=n_jobs)(
             delayed(GroupByRegressor.process_group_robust)(
                 key, group_df, gb_columns, fit_columns, linear_columns,
-                median_columns, weights, min_stat, sigmaCut
+                median_columns, weights, min_stat, sigmaCut, fitter
             )
             for key, group_df in grouped
        )
```
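
For the callable form, the new selection block simply calls `fitter()` with no arguments to obtain a fresh estimator, so a zero-argument factory is the natural fit. A minimal sketch, assuming `df` as defined in the example above; `Ridge` is just one stand-in for a scikit-learn regressor whose `fit` accepts `sample_weight`, which the weighted pipeline requires, and `suffix='_ridge'` is illustrative:

```python
from sklearn.linear_model import Ridge

def ridge_factory():
    # Called as fitter() inside process_group_robust; must return a
    # fresh, unfitted estimator supporting fit(X, y, sample_weight=...).
    return Ridge(alpha=0.5)

df_out, dfGB = GroupByRegressor.make_parallel_fit(
    df,
    gb_columns=['group'],
    fit_columns=['y'],
    linear_columns=['x'],
    median_columns=['x'],
    weights='weight',
    suffix='_ridge',
    selection=df['x'].notna(),
    fitter=ridge_factory,
)
```

Note that, per the diff, any exception raised during fitting still triggers the generic fallback to `LinearRegression`, regardless of which fitter was requested, so the reported coefficients are not guaranteed to come from the custom estimator.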
