Skip to content

Commit cc1ecb4

Browse files
author
miranov25
committed
docs(restartContext): record diagnostics integration and real-data validation
Added suffix-aware summarize_diagnostics and benchmark report integration. Confirmed a robust re-fit loop on real datasets. Prepared the next-phase plan for real-use-case profiling and a fast-path study.
1 parent a71cc4d commit cc1ecb4

File tree

1 file changed

+56
-2
lines changed

1 file changed

+56
-2
lines changed

UTILS/dfextensions/test_groupby_regression.py

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import pytest
22
import pandas as pd
33
import numpy as np
4-
from groupby_regression import GroupByRegressor
5-
4+
#from groupby_regression import GroupByRegressor
5+
from .groupby_regression import GroupByRegressor
66

77
@pytest.fixture
88
def sample_data():
@@ -364,3 +364,57 @@ def predict(self, X):
364364
assert 'y_slope_x1_dummy' in dfGB.columns
365365
assert dfGB['y_slope_x1_dummy'].iloc[0] == 0
366366
assert dfGB['y_intercept_dummy'].iloc[0] == 42
367+
368+
369+
def _make_groups(n_rows, n_groups, seed=0):
    """Return a shuffled array of ``n_rows`` group ids drawn from ``range(n_groups)``.

    Every group id appears ``n_rows // n_groups`` times; when ``n_rows`` is not
    a multiple of ``n_groups`` the remaining rows are assigned to distinct,
    randomly chosen groups (one extra row each).

    Parameters
    ----------
    n_rows : int
        Total number of ids to generate.
    n_groups : int
        Number of distinct group ids (``0 .. n_groups - 1``).
    seed : int, optional
        Seed for ``np.random.default_rng``; makes the output reproducible.

    Returns
    -------
    numpy.ndarray
        1-D ``int32`` array of length ``n_rows``.
    """
    rng = np.random.default_rng(seed)
    base = np.repeat(np.arange(n_groups, dtype=np.int32), n_rows // n_groups)
    rem = n_rows - base.size
    if rem > 0:
        # rng.choice yields the platform default integer (usually int64);
        # cast it so the result stays int32 whether or not a remainder exists.
        extra = rng.choice(n_groups, size=rem, replace=False).astype(base.dtype)
        base = np.concatenate([base, extra])
    rng.shuffle(base)
    return base
377+
378+
def _create_clean(n_rows=1000, n_groups=200, seed=0):
    """Build a synthetic, outlier-free regression frame for the tests.

    The target follows ``y = 2*x1 + 3*x2 + N(0, 1)`` with float32 features and
    target.  Columns, in order: ``group``, ``x1``, ``x2``, ``y``, ``group2``
    (a copy of ``group``) and ``weight`` (constant 1.0).
    """
    generator = np.random.default_rng(seed)
    group_ids = _make_groups(n_rows, n_groups, seed)

    # Draw order matters for reproducibility: features first, then noise.
    features = generator.normal(size=(n_rows, 2)).astype(np.float32)
    first, second = features[:, 0], features[:, 1]
    noise = generator.normal(0, 1.0, size=n_rows)
    target = (2 * first + 3 * second + noise).astype(np.float32)

    frame = pd.DataFrame(
        {"group": group_ids, "x1": first, "x2": second, "y": target}
    )
    return frame.assign(group2=frame["group"], weight=1.0)
387+
388+
def test_diagnostics_columns_present():
    """Check that ``diag=True`` adds the suffix-qualified diagnostic columns.

    Runs ``GroupByRegressor.make_parallel_fit`` on the clean synthetic frame
    and asserts that each expected ``diag_*_fit`` column is present in the
    per-group result and that the refit counter is non-negative.
    """
    suffix = "_fit"
    prefix = "diag_"

    frame = _create_clean()
    keep_all = pd.Series(True, index=frame.index)

    fit_options = dict(
        gb_columns=["group", "group2"],
        fit_columns=["y"],
        linear_columns=["x1", "x2"],
        median_columns=[],
        weights="weight",
        suffix=suffix,
        selection=keep_all,
        addPrediction=False,
        n_jobs=1,
        min_stat=[3, 4],
        sigmaCut=5,
        fitter="ols",
        batch_size="auto",
        diag=True,  # the feature under test
        diag_prefix=prefix,
    )
    _, dfGB = GroupByRegressor.make_parallel_fit(frame, **fit_options)

    # Diagnostic columns are expected as <prefix><metric><suffix>.
    metrics = ("n_refits", "frac_rejected", "hat_max", "cond_xtx", "time_ms", "n_rows")
    expected = [f"{prefix}{metric}{suffix}" for metric in metrics]

    for column in expected:
        assert column in dfGB.columns, f"missing diagnostic column {column}"

    assert (dfGB[f"{prefix}n_refits{suffix}"] >= 0).all()

0 commit comments

Comments
 (0)