Commit 2785bc4

miranov25 committed
Add flexible regression model selection via fitter parameter
- Introduced `fitter` argument to `process_group_robust` and `make_parallel_fit`, accepting:
  - `"robust"` for HuberRegressor (the previous behavior)
  - `"ols"` for LinearRegression
  - `"auto"` (the default) to start with Huber and fall back to OLS on failure, preserving backward compatibility
  - a user-provided callable for custom fitters
- Ensured minimal code changes to preserve compatibility and logic structure
- Added error handling for fallback behavior and retained full prediction pipeline
1 parent 22ce23c commit 2785bc4
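
A minimal usage sketch of the new parameter, based on the signature introduced in this commit; the DataFrame and column names are copied from the `cast_dtype` example in the documentation diff below, and `suffix='_ols'` is illustrative:

```python
import numpy as np
import pandas as pd
from dfextensions.groupby_regression import GroupByRegressor

# Toy data, reused from the cast_dtype example in the docs diff below.
df = pd.DataFrame({
    'group': ['A'] * 10 + ['B'] * 10,
    'x': np.linspace(0, 1, 20),
    'y': np.linspace(0, 2, 20) + np.random.normal(0, 0.1, 20),
    'weight': 1.0,
})

# fitter="ols" fits each group with plain LinearRegression;
# fitter="robust" uses HuberRegressor; the default "auto" starts
# with Huber and falls back to LinearRegression if the fit fails.
df_out, dfGB = GroupByRegressor.make_parallel_fit(
    df,
    gb_columns=['group'],
    fit_columns=['y'],
    linear_columns=['x'],
    median_columns=['x'],
    weights='weight',
    suffix='_ols',
    selection=df['x'].notna(),
    fitter='ols',
)
```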

2 files changed: +73 −9 lines

UTILS/dfextensions/groupby_regression.md

Lines changed: 54 additions & 0 deletions
````diff
@@ -131,6 +131,60 @@ df_out, dfGB = GroupByRegressor.make_parallel_fit(
 ## Tips
 
 💡 Use `cast_dtype='float16'` for storage savings, but ensure it's compatible with downstream numerical precision requirements.
+**Improvements for groupby\_regression.md**
+
+---
+
+### Usage Example for `cast_dtype`
+
+In the `make_parallel_fit` and `make_linear_fit` functions, the `cast_dtype` parameter ensures consistent numeric precision for slope, intercept, and error terms. This is useful for long pipelines or for memory-sensitive applications.
+
+```python
+import pandas as pd
+import numpy as np
+from dfextensions.groupby_regression import GroupByRegressor
+
+# Sample DataFrame
+df = pd.DataFrame({
+    'group': ['A'] * 10 + ['B'] * 10,
+    'x': np.linspace(0, 1, 20),
+    'y': np.linspace(0, 2, 20) + np.random.normal(0, 0.1, 20),
+    'weight': 1.0,
+})
+
+# Linear fit with casting to float32
+df_out, dfGB = GroupByRegressor.make_parallel_fit(
+    df,
+    gb_columns=['group'],
+    fit_columns=['y'],
+    linear_columns=['x'],
+    median_columns=['x'],
+    weights='weight',
+    suffix='_f32',
+    selection=df['x'].notna(),
+    cast_dtype='float32',
+    addPrediction=True
+)
+
+# Check resulting data types
+print(dfGB.dtypes)
+```
+
+### Output (Example)
+
+```
+group               object
+x_f32              float64
+y_slope_x_f32      float32
+y_err_x_f32        float32
+y_intercept_f32    float32
+y_rms_f32          float32
+y_mad_f32          float32
+bin_count_f32        int64
+dtype: object
+```
+
+
 
 ## Recent Changes
 
````
UTILS/dfextensions/groupby_regression.py

Lines changed: 19 additions & 9 deletions
```diff
@@ -4,7 +4,7 @@
 from sklearn.linear_model import LinearRegression, HuberRegressor
 from joblib import Parallel, delayed
 from numpy.linalg import inv, LinAlgError
-from typing import Union, List, Tuple
+from typing import Union, List, Tuple, Callable
 
 
 class GroupByRegressor:
@@ -111,12 +111,12 @@ def process_group_robust(
         median_columns: List[str],
         weights: str,
         minStat: List[int],
-        sigmaCut: float = 4
+        sigmaCut: float = 4,
+        fitter: Union[str, Callable] = "auto"
     ) -> dict:
         group_dict = dict(zip(gb_columns, key))
         predictors = []
 
-        # Count valid rows for each predictor and include only if enough
         for i, col in enumerate(linear_columns0):
             required_columns = [col] + fit_columns + [weights]
             df_valid = df_group[required_columns].dropna()
@@ -128,7 +128,6 @@ def process_group_robust(
         if not predictors:
             continue
 
-        # Drop rows with any NaNs in predictors, target, or weights
         subset_columns = predictors + [target_col, weights]
         df_clean = df_group.dropna(subset=subset_columns)
 
@@ -139,11 +138,20 @@ def process_group_robust(
         y = df_clean[target_col].values
         w = df_clean[weights].values
 
-        try:
+        model = None
+        if callable(fitter):
+            model = fitter()
+        elif fitter == "robust":
+            model = HuberRegressor(tol=1e-4)
+        elif fitter == "ols":
+            model = LinearRegression()
+        else:
             model = HuberRegressor(tol=1e-4)
+
+        try:
             model.fit(X, y, sample_weight=w)
         except Exception as e:
-            logging.warning(f"HuberRegressor failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
+            logging.warning(f"{model.__class__.__name__} failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
             model = LinearRegression()
             model.fit(X, y, sample_weight=w)
 
@@ -167,7 +175,7 @@ def process_group_robust(
             try:
                 model.fit(X[mask], y[mask], sample_weight=w[mask])
             except Exception as e:
-                logging.warning(f"HuberRegressor re-fit with outlier mask failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
+                logging.warning(f"{model.__class__.__name__} re-fit with outlier mask failed for {target_col} in group {key}: {e}. Falling back to LinearRegression.")
                 model = LinearRegression()
                 model.fit(X[mask], y[mask], sample_weight=w[mask])
 
@@ -201,6 +209,7 @@ def process_group_robust(
             group_dict[col] = df_group[col].median()
 
         return group_dict
+
     @staticmethod
     def make_parallel_fit(
         df: pd.DataFrame,
@@ -215,7 +224,8 @@ def make_parallel_fit(
         cast_dtype: Union[str, None] = None,
         n_jobs: int = 1,
         min_stat: List[int] = [10, 10],
-        sigmaCut: float = 4.0
+        sigmaCut: float = 4.0,
+        fitter: Union[str, Callable] = "auto"
     ) -> Tuple[pd.DataFrame, pd.DataFrame]:
         """
         Perform grouped robust linear regression using HuberRegressor in parallel.
@@ -244,7 +254,7 @@ def make_parallel_fit(
         results = Parallel(n_jobs=n_jobs)(
             delayed(GroupByRegressor.process_group_robust)(
                 key, group_df, gb_columns, fit_columns, linear_columns,
-                median_columns, weights, min_stat, sigmaCut
+                median_columns, weights, min_stat, sigmaCut, fitter
             )
             for key, group_df in grouped
        )
```
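
For the callable form, the new selection block simply calls `fitter()` with no arguments to obtain a fresh estimator, so a zero-argument factory is the natural fit. A minimal sketch, assuming `df` as defined in the example above; `Ridge` is just one stand-in for a scikit-learn regressor whose `fit` accepts `sample_weight`, which the weighted pipeline requires, and `suffix='_ridge'` is illustrative:

```python
from sklearn.linear_model import Ridge

def ridge_factory():
    # Called as fitter() inside process_group_robust; must return a
    # fresh, unfitted estimator supporting fit(X, y, sample_weight=...).
    return Ridge(alpha=0.5)

df_out, dfGB = GroupByRegressor.make_parallel_fit(
    df,
    gb_columns=['group'],
    fit_columns=['y'],
    linear_columns=['x'],
    median_columns=['x'],
    weights='weight',
    suffix='_ridge',
    selection=df['x'].notna(),
    fitter=ridge_factory,
)
```

Note that, per the diff, any exception raised during fitting still triggers the generic fallback to `LinearRegression`, regardless of which fitter was requested, so the reported coefficients are not guaranteed to come from the custom estimator.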
