|
1 | 1 | import pytest |
2 | 2 | import pandas as pd |
3 | 3 | import numpy as np |
4 | | -from groupby_regression import GroupByRegressor |
5 | | - |
| 4 | +#from groupby_regression import GroupByRegressor |
| 5 | +from .groupby_regression import GroupByRegressor |
6 | 6 |
|
7 | 7 | @pytest.fixture |
8 | 8 | def sample_data(): |
@@ -364,3 +364,57 @@ def predict(self, X): |
364 | 364 | assert 'y_slope_x1_dummy' in dfGB.columns |
365 | 365 | assert dfGB['y_slope_x1_dummy'].iloc[0] == 0 |
366 | 366 | assert dfGB['y_intercept_dummy'].iloc[0] == 42 |
| 367 | + |
| 368 | + |
| 369 | +def _make_groups(n_rows, n_groups, seed=0): |
| 370 | + rng = np.random.default_rng(seed) |
| 371 | + base = np.repeat(np.arange(n_groups, dtype=np.int32), n_rows // n_groups) |
| 372 | + rem = n_rows - base.size |
| 373 | + if rem > 0: |
| 374 | + base = np.concatenate([base, rng.choice(n_groups, size=rem, replace=False)]) |
| 375 | + rng.shuffle(base) |
| 376 | + return base |
| 377 | + |
def _create_clean(n_rows=1000, n_groups=200, seed=0):
    """Build a synthetic regression frame with y = 2*x1 + 3*x2 + unit noise.

    Columns, in order: ``group``, ``x1``, ``x2``, ``y``, ``group2`` (a copy
    of ``group``) and ``weight`` (constant 1.0). Features and target are
    stored as float32.

    Args:
        n_rows: Number of rows to generate.
        n_groups: Number of distinct group labels passed to ``_make_groups``.
        seed: Seed used both for the labels and for the feature/noise draws.

    Returns:
        ``pandas.DataFrame`` with ``n_rows`` rows.
    """
    generator = np.random.default_rng(seed)
    labels = _make_groups(n_rows, n_groups, seed)

    # Draw order matters for reproducibility: features first, then noise.
    features = generator.normal(size=(n_rows, 2)).astype(np.float32)
    x1 = features[:, 0]
    x2 = features[:, 1]
    noise = generator.normal(0, 1.0, size=n_rows)
    target = (2 * x1 + 3 * x2 + noise).astype(np.float32)

    frame = pd.DataFrame({"group": labels, "x1": x1, "x2": x2, "y": target})
    return frame.assign(group2=lambda d: d["group"], weight=1.0)
| 387 | + |
def test_diagnostics_columns_present():
    """Check that ``diag=True`` adds the suffixed diagnostic columns.

    Runs a single-job OLS fit on the clean synthetic frame and verifies that
    each expected ``diag_*`` column (carrying the fit suffix) is present in
    the per-group result and that the refit counter is non-negative.
    """
    suffix = "_fit"
    prefix = "diag_"

    frame = _create_clean()
    select_all = pd.Series(True, index=frame.index)

    _, group_fit = GroupByRegressor.make_parallel_fit(
        frame,
        gb_columns=["group", "group2"],
        fit_columns=["y"],
        linear_columns=["x1", "x2"],
        median_columns=[],
        weights="weight",
        suffix=suffix,
        selection=select_all,
        addPrediction=False,
        n_jobs=1,
        min_stat=[3, 4],
        sigmaCut=5,
        fitter="ols",
        batch_size="auto",
        diag=True,  # exercise the diagnostics code path
        diag_prefix=prefix,
    )

    # Diagnostic column names are <prefix><metric><suffix>.
    metrics = (
        "n_refits",
        "frac_rejected",
        "hat_max",
        "cond_xtx",
        "time_ms",
        "n_rows",
    )
    expected = [f"{prefix}{metric}{suffix}" for metric in metrics]

    for c in expected:
        assert c in group_fit.columns, f"missing diagnostic column {c}"

    refit_counts = group_fit[f"{prefix}n_refits{suffix}"]
    assert (refit_counts >= 0).all()
0 commit comments