|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +import logging |
| 4 | +from sklearn.linear_model import LinearRegression, HuberRegressor |
| 5 | +from joblib import Parallel, delayed |
| 6 | +from numpy.linalg import inv, LinAlgError |
| 7 | + |
| 8 | + |
| 9 | +class GroupByRegressor: |
| 10 | + @staticmethod |
| 11 | + def _cast_fit_columns(dfGB, cast_dtype=None): |
| 12 | + if cast_dtype is not None: |
| 13 | + for col in dfGB.columns: |
| 14 | + if ("slope" in col or "intercept" in col or "rms" in col or "mad" in col): |
| 15 | + dfGB[col] = dfGB[col].astype(cast_dtype) |
| 16 | + return dfGB |
| 17 | + |
| 18 | + @staticmethod |
| 19 | + def make_linear_fit(df, gb_columns, fit_columns, linear_columns, median_columns, suffix, selection, addPrediction=False, cast_dtype=None, min_stat=10): |
| 20 | + """ |
| 21 | + Perform standard linear regression fits for grouped data and compute median values. |
| 22 | +
|
| 23 | + Parameters: |
| 24 | + df (pd.DataFrame): Input dataframe. |
| 25 | + gb_columns (list): Columns to group by. |
| 26 | + fit_columns (list): Target columns for linear regression. |
| 27 | + linear_columns (list): Independent variables used for the fit. |
| 28 | + median_columns (list): Columns for which median values are computed. |
| 29 | + suffix (str): Suffix to append to columns in the output dfGB. |
| 30 | + selection (pd.Series): Boolean mask for selecting rows. |
| 31 | + addPrediction (bool): If True, merge predictions back into df. |
| 32 | + cast_dtype (str or None): If not None, cast fit-related columns to this dtype. |
| 33 | + min_stat (int): Minimum number of rows required to perform regression. |
| 34 | +
|
| 35 | + Returns: |
| 36 | + tuple: (df, dfGB) where |
| 37 | + df is the original dataframe with predicted values appended (if addPrediction is True), |
| 38 | + and dfGB is the group-by statistics dataframe containing medians and fit coefficients. |
| 39 | + """ |
| 40 | + df_selected = df.loc[selection] |
| 41 | + group_results = [] |
| 42 | + group_sizes = {} |
| 43 | + |
| 44 | + for group_vals, df_group in df_selected.groupby(gb_columns): |
| 45 | + group_dict = dict(zip(gb_columns, group_vals)) |
| 46 | + group_sizes[group_vals] = len(df_group) |
| 47 | + for target_col in fit_columns: |
| 48 | + try: |
| 49 | + X = df_group[linear_columns].values |
| 50 | + y = df_group[target_col].values |
| 51 | + if len(X) < min_stat: |
| 52 | + for col in linear_columns: |
| 53 | + group_dict[f"{target_col}_slope_{col}"] = np.nan |
| 54 | + group_dict[f"{target_col}_intercept"] = np.nan |
| 55 | + continue |
| 56 | + model = LinearRegression() |
| 57 | + model.fit(X, y) |
| 58 | + for i, col in enumerate(linear_columns): |
| 59 | + group_dict[f"{target_col}_slope_{col}"] = model.coef_[i] |
| 60 | + group_dict[f"{target_col}_intercept"] = model.intercept_ |
| 61 | + except Exception as e: |
| 62 | + logging.warning(f"Linear regression failed for {target_col} in group {group_vals}: {e}") |
| 63 | + for col in linear_columns: |
| 64 | + group_dict[f"{target_col}_slope_{col}"] = np.nan |
| 65 | + group_dict[f"{target_col}_intercept"] = np.nan |
| 66 | + |
| 67 | + for col in median_columns: |
| 68 | + group_dict[col] = df_group[col].median() |
| 69 | + |
| 70 | + group_results.append(group_dict) |
| 71 | + |
| 72 | + dfGB = pd.DataFrame(group_results) |
| 73 | + dfGB = GroupByRegressor._cast_fit_columns(dfGB, cast_dtype) |
| 74 | + |
| 75 | + bin_counts = np.array([group_sizes.get(tuple(row), 0) for row in dfGB[gb_columns].itertuples(index=False)], dtype=np.int32) |
| 76 | + dfGB["bin_count"] = bin_counts |
| 77 | + dfGB = dfGB.rename(columns={col: f"{col}{suffix}" for col in dfGB.columns if col not in gb_columns}) |
| 78 | + dfGB = dfGB.copy() |
| 79 | + |
| 80 | + if addPrediction: |
| 81 | + df = df.merge(dfGB, on=gb_columns, how="left") |
| 82 | + for target_col in fit_columns: |
| 83 | + intercept_col = f"{target_col}_intercept{suffix}" |
| 84 | + if intercept_col not in df.columns: |
| 85 | + continue |
| 86 | + df[f"{target_col}{suffix}"] = df[intercept_col] |
| 87 | + for col in linear_columns: |
| 88 | + slope_col = f"{target_col}_slope_{col}{suffix}" |
| 89 | + if slope_col in df.columns: |
| 90 | + df[f"{target_col}{suffix}"] += df[slope_col] * df[col] |
| 91 | + |
| 92 | + return df, dfGB |
| 93 | + |
| 94 | + @staticmethod |
| 95 | + def process_group_robust(key, df_group, gb_columns, fit_columns, linear_columns0, median_columns, weights, minStat=[], sigmaCut=4): |
| 96 | + """ |
| 97 | + Process a single group: perform robust regression fits on each target column, |
| 98 | + compute median values, RMS and MAD of the residuals. |
| 99 | + After an initial Huber fit, points with residuals > sigmaCut * MAD are removed and the fit is redone |
| 100 | + if enough points remain. |
| 101 | +
|
| 102 | + For each predictor in linear_columns0, the predictor is used only if the number of rows in the group |
| 103 | + is greater than the corresponding value in minStat. |
| 104 | +
|
| 105 | + Parameters: |
| 106 | + key: Group key. |
| 107 | + df_group (pd.DataFrame): Data for the group. |
| 108 | + gb_columns (list): Columns used for grouping. |
| 109 | + fit_columns (list): Target columns to be fit. |
| 110 | + linear_columns0 (list): List of candidate predictor columns. |
| 111 | + median_columns (list): List of columns for which median values are computed. |
| 112 | + weights (str): Column name for weights. |
| 113 | + minStat (list): List of minimum number of rows required to use each predictor in linear_columns0. |
| 114 | + sigmaCut (float): Factor to remove outliers (points with residual > sigmaCut * MAD). |
| 115 | +
|
| 116 | + Returns: |
| 117 | + dict: A dictionary containing group keys, fit parameters, RMS, and MAD. |
| 118 | + """ |
| 119 | + group_dict = dict(zip(gb_columns, key)) |
| 120 | + n_rows = len(df_group) |
| 121 | + predictors = [] |
| 122 | + |
| 123 | + for i, col in enumerate(linear_columns0): |
| 124 | + if n_rows > minStat[i]: |
| 125 | + predictors.append(col) |
| 126 | + |
| 127 | + for target_col in fit_columns: |
| 128 | + try: |
| 129 | + if not predictors: |
| 130 | + continue |
| 131 | + X = df_group[predictors].values |
| 132 | + y = df_group[target_col].values |
| 133 | + w = df_group[weights].values |
| 134 | + if len(y) < min(minStat): |
| 135 | + continue |
| 136 | + |
| 137 | + model = HuberRegressor(tol=1e-4) |
| 138 | + model.fit(X, y, sample_weight=w) |
| 139 | + predicted = model.predict(X) |
| 140 | + residuals = y - predicted |
| 141 | + n, p = X.shape |
| 142 | + denom = n - p if n > p else 1e-9 |
| 143 | + s2 = np.sum(residuals ** 2) / denom |
| 144 | + |
| 145 | + try: |
| 146 | + cov_matrix = inv(X.T @ X) * s2 |
| 147 | + std_errors = np.sqrt(np.diag(cov_matrix)) |
| 148 | + except LinAlgError: |
| 149 | + std_errors = np.full(len(predictors), np.nan) |
| 150 | + |
| 151 | + rms = np.sqrt(np.mean(residuals ** 2)) |
| 152 | + mad = np.median(np.abs(residuals)) |
| 153 | + |
| 154 | + mask = np.abs(residuals) <= sigmaCut * mad |
| 155 | + if mask.sum() >= min(minStat): |
| 156 | + model.fit(X[mask], y[mask], sample_weight=w[mask]) |
| 157 | + predicted = model.predict(X) |
| 158 | + residuals = y - predicted |
| 159 | + rms = np.sqrt(np.mean(residuals ** 2)) |
| 160 | + mad = np.median(np.abs(residuals)) |
| 161 | + |
| 162 | + for col in linear_columns0: |
| 163 | + if col in predictors: |
| 164 | + idx = predictors.index(col) |
| 165 | + group_dict[f"{target_col}_slope_{col}"] = model.coef_[idx] |
| 166 | + group_dict[f"{target_col}_err_{col}"] = std_errors[idx] if idx < len(std_errors) else np.nan |
| 167 | + else: |
| 168 | + group_dict[f"{target_col}_slope_{col}"] = np.nan |
| 169 | + group_dict[f"{target_col}_err_{col}"] = np.nan |
| 170 | + |
| 171 | + group_dict[f"{target_col}_intercept"] = model.intercept_ |
| 172 | + group_dict[f"{target_col}_rms"] = rms |
| 173 | + group_dict[f"{target_col}_mad"] = mad |
| 174 | + except Exception as e: |
| 175 | + logging.warning(f"Robust regression failed for {target_col} in group {key}: {e}") |
| 176 | + for col in linear_columns0: |
| 177 | + group_dict[f"{target_col}_slope_{col}"] = np.nan |
| 178 | + group_dict[f"{target_col}_err_{col}"] = np.nan |
| 179 | + group_dict[f"{target_col}_intercept"] = np.nan |
| 180 | + group_dict[f"{target_col}_rms"] = np.nan |
| 181 | + group_dict[f"{target_col}_mad"] = np.nan |
| 182 | + |
| 183 | + for col in median_columns: |
| 184 | + group_dict[col] = df_group[col].median() |
| 185 | + |
| 186 | + return group_dict |
0 commit comments