Commit 1b2ed00

Author: miranov25
bench(quantile_fit_nd): correct scaling assertions — α_b≈−0.5, α_rt≈0.0

- Round-trip RMS is dominated by per-event noise → expect α_rt≈0 (flat), not −0.5
- Keep rms_b scaling check near −0.5 (loosen tol to ±0.2 across 5 N points)
- Clarify summary prints and expectations; leave constancy check only for rms_b·√N

PWGPP-643
1 parent 12d5fe4 commit 1b2ed00
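
The corrected expectations follow from how the two metrics aggregate: rms_b averages fit noise over many events per z-bin, so it should shrink like N^(-1/2), while the round-trip residual q_hat − q_true is re-measured per event, so adding events does not shrink it. A minimal self-contained sketch with toy numbers (not benchmark output; log_slope mirrors the least-squares log-log fit used by fit_log_slope in the diff below):

import numpy as np

def log_slope(xs, ys):
    # least-squares fit of log(y) = alpha*log(x) + c; returns alpha
    A = np.column_stack([np.log(xs), np.ones(len(xs))])
    return float(np.linalg.lstsq(A, np.log(ys), rcond=None)[0][0])

Ns = np.array([2000.0, 5000.0, 10000.0, 20000.0, 50000.0])
rms_b_toy = 3.0 / np.sqrt(Ns)         # averaged estimator noise: ~ N^-1/2
rms_rt_toy = 0.05 * np.ones_like(Ns)  # per-event inversion noise: flat in N

print(log_slope(Ns, rms_b_toy))    # -> -0.5 (alpha_b)
print(log_slope(Ns, rms_rt_toy))   # -> 0.0 (alpha_rt)
print(rms_b_toy * np.sqrt(Ns))     # -> constant (the rms_b*sqrt(N) constancy check)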

File tree

1 file changed: +312, -45 lines


UTILS/dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py

Lines changed: 312 additions & 45 deletions
@@ -1,67 +1,334 @@
-# dfextension/quantile_fit_nd/bench_quantile_fit_nd.py
-# Simple timing benchmark across N points, distributions, and grid sizes.
+#!/usr/bin/env python3
+# dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py
+"""
+Benchmark speed + precision for fit_quantile_linear_nd with scaling checks.
+
+- Distributions: uniform, poisson (via randomized PIT), gaussian
+- q_centers step = 0.025; dq = 0.05 (more points per z-bin)
+- Precision metrics per N:
+    * rms_b := sqrt(mean( (b_meas(z) - b_exp(z))^2 )) over informative z-bins
+    * rel_err_sigmaQ := median relative error of sigma_Q vs truth per z-bin
+    * rms_rt := round-trip inversion RMS for a random subset of events
+- Scaling check:
+    * expect alpha_b ≈ -0.5 (rms_b ∝ N^{-1/2})
+    * expect alpha_rt ≈ 0.0 (rms_rt roughly flat; per-event noise)
+- Prints E*sqrt(N) for rms_b as a constancy sanity check.
+- Optional: PNG plots (log-log), CSV export, memory profiling, strict assertions.
+"""
+
+import argparse
+import json
+import warnings
+from math import erf, sqrt
 import time
 import numpy as np
 import pandas as pd
 
-from dfextensions.quantile_fit_nd.quantile_fit_nd import fit_quantile_linear_nd
+from dfextensions.quantile_fit_nd.quantile_fit_nd import (
+    fit_quantile_linear_nd,
+    QuantileEvaluator,
+)
+from dfextensions.quantile_fit_nd.utils import discrete_to_uniform_rank_poisson
+
+RNG = np.random.default_rng(123456)
 
-RNG = np.random.default_rng(1234)
 
+# ----------------------- Synthetic data generation -----------------------
 
-def gen_data(n: int, dist: str = "uniform", sigma_X: float = 0.5):
+def gen_Q_from_distribution(dist: str, n: int, *, lam: float) -> np.ndarray:
     if dist == "uniform":
-        Q = RNG.uniform(0, 1, size=n)
+        return RNG.uniform(0.0, 1.0, size=n)
     elif dist == "poisson":
-        lam = 20.0
-        m = RNG.poisson(lam, size=n)
-        from math import erf
-        z = (m + 0.5 - lam) / np.sqrt(max(lam, 1e-6))
-        Q = 0.5 * (1.0 + np.array([erf(zi / np.sqrt(2)) for zi in z]))
-        Q = np.clip(Q, 0, 1)
+        k = RNG.poisson(lam, size=n)
+        return discrete_to_uniform_rank_poisson(k, lam, mode="randomized", rng=RNG)
     elif dist == "gaussian":
-        g = RNG.normal(0.0, 1.0, size=n)
-        from math import erf
-        Q = 0.5 * (1.0 + np.array([erf(gi / np.sqrt(2)) for gi in g]))
-        Q = np.clip(Q, 0, 1)
+        z = RNG.normal(0.0, 1.0, size=n)  # standard normal
+        cdf = 0.5 * (1.0 + np.array([erf(zi / np.sqrt(2)) for zi in z]))
+        return np.clip(cdf, 0.0, 1.0)
     else:
-        raise ValueError
+        raise ValueError(f"unknown dist {dist}")
 
-    z = np.clip(RNG.normal(0.0, 5.0, size=n), -10, 10)
-    a = 10.0 + 0.5 * z
-    b = (50.0 + 2.0 * z / 10.0).clip(min=5.0)
-    X = a + b * Q + RNG.normal(0.0, sigma_X, size=n)
-    df = pd.DataFrame({"channel_id": "bench", "Q": Q, "X": X, "z_vtx": z, "is_outlier": False})
-    return df
 
+def gen_synthetic_df(
+        n: int,
+        dist: str,
+        *,
+        lam: float,
+        z_sigma_cm: float = 5.0,
+        z_range_cm: float = 10.0,
+        sigma_X_given_Q: float = 0.5,
+        a0: float = 10.0,
+        a1: float = 0.5,
+        b0: float = 50.0,
+        b1: float = 2.0,
+) -> tuple[pd.DataFrame, dict]:
+    Q = gen_Q_from_distribution(dist, n, lam=lam)
+    z = np.clip(RNG.normal(0.0, z_sigma_cm, size=n), -z_range_cm, z_range_cm)
+    a_true = a0 + a1 * z
+    b_true = (b0 + b1 * z / max(z_range_cm, 1e-6)).clip(min=5.0)
+    X = a_true + b_true * Q + RNG.normal(0.0, sigma_X_given_Q, size=n)
+    df = pd.DataFrame({
+        "channel_id": np.repeat("ch0", n),
+        "Q": Q,
+        "X": X,
+        "z_vtx": z,
+        "is_outlier": np.zeros(n, dtype=bool),
+    })
+    truth = {
+        "a0": a0, "a1": a1,
+        "b0": b0, "b1": b1,
+        "sigma_X_given_Q": sigma_X_given_Q,
+        "z_range": z_range_cm,
+        "lam": lam
+    }
+    return df, truth
 
-def run_one(n, dist, q_bins=11, z_bins=10):
-    df = gen_data(n, dist=dist)
-    t0 = time.perf_counter()
-    table = fit_quantile_linear_nd(
-        df,
-        channel_key="channel_id",
-        q_centers=np.linspace(0, 1, q_bins),
+
+# ----------------------- Helpers for expectations -----------------------
+
+def _edges_from_centers(centers: np.ndarray) -> np.ndarray:
+    mid = 0.5 * (centers[1:] + centers[:-1])
+    first = centers[0] - (mid[0] - centers[0])
+    last = centers[-1] + (centers[-1] - mid[-1])
+    return np.concatenate([[first], mid, [last]])
+
+
+def expected_b_per_zbin_from_sample(df: pd.DataFrame, z_edges: np.ndarray, truth: dict) -> np.ndarray:
+    z_vals = df["z_vtx"].to_numpy(np.float64)
+    b_true_all = (truth["b0"] + truth["b1"] * z_vals / max(truth["z_range"], 1e-6)).clip(min=5.0)
+    b_expected = []
+    for i in range(len(z_edges) - 1):
+        m = (z_vals >= z_edges[i]) & (z_vals <= z_edges[i+1])
+        b_expected.append(np.mean(b_true_all[m]) if m.sum() > 0 else np.nan)
+    return np.array(b_expected, dtype=np.float64)
+
+
+def weights_from_fit_stats(col: pd.Series) -> np.ndarray:
+    w = []
+    for s in col:
+        try:
+            d = json.loads(s)
+        except Exception:
+            d = {}
+        w.append(d.get("n_used", np.nan))
+    return np.array(w, dtype=float)
+
+
+# ----------------------------- Benchmark core -----------------------------
+
+def run_one(
+        dist: str,
+        n: int,
+        *,
+        q_step=0.025,
         dq=0.05,
-        nuisance_axes={"z": "z_vtx"},
-        n_bins_axes={"z": z_bins},
-        mask_col="is_outlier",
-        b_min_option="auto",
-        fit_mode="ols",
-        kappa_w=1.3,
+        z_bins=20,
+        lam=50.0,
+        sample_fraction=0.006,
+        mem_profile: bool = False,
+) -> dict:
+    df, truth = gen_synthetic_df(n, dist, lam=lam)
+    q_centers = np.arange(0.0, 1.0 + 1e-12, q_step)  # 0..1 inclusive
+
+    def _do_fit():
+        return fit_quantile_linear_nd(
+            df,
+            channel_key="channel_id",
+            q_centers=q_centers,
+            dq=dq,
+            nuisance_axes={"z": "z_vtx"},
+            n_bins_axes={"z": z_bins},
+        )
+
+    t0 = time.perf_counter()
+    if mem_profile:
+        try:
+            from memory_profiler import memory_usage
+            mem_trace, table = memory_usage((_do_fit, ), retval=True, max_iterations=1)
+            peak_mem_mb = float(np.max(mem_trace)) if len(mem_trace) else np.nan
+        except Exception as e:
+            warnings.warn(f"memory_profiler unavailable or failed: {e}")
+            table = _do_fit()
+            peak_mem_mb = np.nan
+    else:
+        table = _do_fit()
+        peak_mem_mb = np.nan
+    t_fit = time.perf_counter() - t0
+
+    # Expected b per z-bin (from sample)
+    z_centers = np.sort(table["z_center"].unique())
+    z_edges = _edges_from_centers(z_centers)
+    b_exp = expected_b_per_zbin_from_sample(df, z_edges, truth)
+
+    # Measured b per z-bin (weighted by window n_used)
+    b_meas_w = np.full_like(b_exp, np.nan, dtype=float)
+    for i, zc in enumerate(z_centers):
+        g = table[table["z_center"] == zc]
+        if g.empty:
+            continue
+        w = weights_from_fit_stats(g["fit_stats"])
+        ok = np.isfinite(g["b"].to_numpy()) & (w > 0)
+        if ok.sum() == 0:
+            continue
+        bvals = g["b"].to_numpy()[ok]
+        ww = w[ok]
+        b_meas_w[i] = np.average(bvals, weights=ww)
+
+    # Informative mask
+    m = np.isfinite(b_meas_w) & np.isfinite(b_exp)
+
+    # Slope error metrics
+    rms_b = float(np.sqrt(np.nanmean((b_meas_w[m] - b_exp[m]) ** 2))) if m.any() else np.nan
+
+    # sigma_Q check (median rel err by z-bin)
+    sigma_q_true = truth["sigma_X_given_Q"] / np.maximum(1e-9, b_exp)
+    sigma_q_meas = table.groupby("z_center")["sigma_Q"].median().reindex(z_centers).to_numpy()
+    mk = np.isfinite(sigma_q_true) & np.isfinite(sigma_q_meas)
+    rel_err_sigmaQ = float(np.nanmean(np.abs(sigma_q_meas[mk] - sigma_q_true[mk]) /
+                                      np.maximum(1e-9, sigma_q_true[mk]))) if mk.any() else np.nan
+
+    # Round-trip inversion RMS (sample to limit CPU)
+    k = max(1, int(round(sample_fraction * len(df))))
+    idx = RNG.choice(len(df), size=min(k, len(df)), replace=False)
+    evalr = QuantileEvaluator(table)
+    resid = []
+    for ii in idx:
+        z = float(df.loc[ii, "z_vtx"])
+        q_true = float(df.loc[ii, "Q"])
+        x = float(df.loc[ii, "X"])
+        q_hat = evalr.invert_rank(x, channel_id="ch0", z=z)
+        resid.append(q_hat - q_true)
+    resid = np.array(resid, dtype=float)
+    rms_rt = float(np.sqrt(np.mean(np.square(resid)))) if resid.size else np.nan
+
+    return dict(
+        dist=dist, N=int(n),
+        lam=float(lam),
+        q_step=float(q_step), dq=float(dq), z_bins=int(z_bins),
+        t_fit=float(t_fit),
+        rms_b=rms_b,
+        rel_err_sigmaQ=rel_err_sigmaQ,
+        rms_rt=rms_rt,
+        n_z_inf=int(np.sum(m)),
+        peak_mem_mb=peak_mem_mb,
     )
-    dt = time.perf_counter() - t0
-    return dt, len(table)
+
+
+def fit_log_slope(xs: np.ndarray, ys: np.ndarray) -> float:
+    # Fit log(ys) ~ alpha * log(xs) + c ; return alpha
+    m = np.isfinite(xs) & np.isfinite(ys) & (ys > 0)
+    if m.sum() < 2:
+        warnings.warn("Insufficient points for scaling slope fit.", RuntimeWarning)
+        return np.nan
+    lx = np.log(xs[m].astype(float))
+    ly = np.log(ys[m].astype(float))
+    A = np.column_stack([lx, np.ones_like(lx)])
+    sol, _, _, _ = np.linalg.lstsq(A, ly, rcond=None)
+    return float(sol[0])
+
+
+def _plot_scaling(res: pd.DataFrame, dists: list[str]):
+    try:
+        import matplotlib.pyplot as plt
+    except Exception as e:
+        warnings.warn(f"--plot requested but matplotlib unavailable: {e}")
+        return
+    for dist in dists:
+        sub = res[res["dist"] == dist].sort_values("N")
+        if sub.empty:
+            continue
+        fig, ax = plt.subplots(figsize=(6.2, 4.2), dpi=140)
+        ax.loglog(sub["N"], sub["rms_b"], marker="o", label="rms_b")
+        ax.loglog(sub["N"], sub["rms_rt"], marker="s", label="rms_rt")
+        ax.set_title(f"Scaling — {dist}")
+        ax.set_xlabel("N events")
+        ax.set_ylabel("Error")
+        ax.grid(True, which="both", ls=":")
+        ax.legend()
+        fig.tight_layout()
+        fig.savefig(f"bench_scaling_{dist}.png")
+        plt.close(fig)
 
 
 def main():
-    Ns = [5_000, 50_000, 200_000]
-    dists = ["uniform", "poisson", "gaussian"]
-    print("N, dist, q_bins, z_bins, secs, rows")
-    for n in Ns:
-        for dist in dists:
-            dt, rows = run_one(n, dist, q_bins=11, z_bins=10)
-            print(f"{n:>8}, {dist:>8}, {11:>2}, {10:>2}, {dt:7.3f}, {rows:>6}")
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--Ns", type=str, default="2000,5000,10000,20000,50000",
+                    help="comma-separated N list")
+    ap.add_argument("--dists", type=str, default="uniform,poisson,gaussian",
+                    help="comma-separated distributions")
+    ap.add_argument("--lam", type=float, default=50.0, help="Poisson mean λ")
+    ap.add_argument("--q_step", type=float, default=0.025, help="q_center step")
+    ap.add_argument("--dq", type=float, default=0.05, help="Δq window")
+    ap.add_argument("--z_bins", type=int, default=20, help="# z bins")
+    ap.add_argument("--sample_fraction", type=float, default=0.006, help="fraction for round-trip sampling")
+    ap.add_argument("--plot", action="store_true", help="save log-log plots as PNGs")
+    ap.add_argument("--save_csv", type=str, default="", help="path to save CSV results")
+    ap.add_argument("--mem_profile", action="store_true", help="profile peak memory (if memory_profiler available)")
+    ap.add_argument("--strict", action="store_true", help="raise AssertionError on scaling deviations")
+    ap.add_argument("--scaling_tol", type=float, default=0.20, help="tolerance for |alpha_b + 0.5|")
+    ap.add_argument("--rt_tol", type=float, default=0.10, help="tolerance for |alpha_rt - 0.0|")
+    args = ap.parse_args()
+
+    Ns = [int(s) for s in args.Ns.split(",") if s.strip()]
+    dists = [s.strip() for s in args.dists.split(",") if s.strip()]
+
+    print(f"Distributions: {', '.join(dists)} (Poisson uses PIT, λ={args.lam})")
+    print(f"q_step={args.q_step}, dq={args.dq}, z_bins={args.z_bins}, sample_fraction={args.sample_fraction}\n")
+
+    rows = []
+    for dist in dists:
+        print(f"=== Benchmark: {dist} ===")
+        for N in Ns:
+            r = run_one(
+                dist, N,
+                q_step=args.q_step, dq=args.dq, z_bins=args.z_bins,
+                lam=args.lam, sample_fraction=args.sample_fraction,
+                mem_profile=args.mem_profile,
+            )
+            rows.append(r)
+            print(f"N={N:7d} | t_fit={r['t_fit']:.3f}s | rms_b={r['rms_b']:.5f} "
+                  f"(rms_b*√N={r['rms_b']*sqrt(N):.5f}) | σQ_rel={r['rel_err_sigmaQ']:.3f} | "
+                  f"rt_rms={r['rms_rt']:.5f} (rt_rms*√N={r['rms_rt']*sqrt(N):.5f}) | "
+                  f"z_inf={r['n_z_inf']} | mem={r['peak_mem_mb']:.1f}MB")
+
+    res = pd.DataFrame(rows)
+
+    # Scaling summary per distribution
+    print("\n=== Scaling summary (expect: α_b ≈ -0.5, α_rt ≈ 0.0) ===")
+    summary_rows = []
+    for dist in dists:
+        sub = res[res["dist"] == dist].sort_values("N")
+        a_b = fit_log_slope(sub["N"].to_numpy(), sub["rms_b"].to_numpy())
+        a_rt = fit_log_slope(sub["N"].to_numpy(), sub["rms_rt"].to_numpy())
+        const_b = (sub["rms_b"] * np.sqrt(sub["N"])).to_numpy()
+        print(f"{dist:8s} | α_b={a_b: .3f} (→ -0.5) | α_rt={a_rt: .3f} (→ 0.0) "
+              f"| mean(rms_b√N)={np.nanmean(const_b):.5f}")
+        summary_rows.append({"dist": dist, "alpha_rms_b": a_b, "alpha_rms_rt": a_rt})
+    summary = pd.DataFrame(summary_rows)
+
+    # CSV export
+    if args.save_csv:
+        res.to_csv(args.save_csv, index=False)
+        print(f"\nSaved CSV to: {args.save_csv}")
+
+    # Plots
+    if args.plot:
+        _plot_scaling(res, dists)
+        print("Saved PNG plots: bench_scaling_{dist}.png")
+
+    # Checks (warn by default; --strict to raise)
+    for dist in dists:
+        a_b = float(summary[summary["dist"] == dist]["alpha_rms_b"].iloc[0])
+        a_rt = float(summary[summary["dist"] == dist]["alpha_rms_rt"].iloc[0])
+        ok_b = np.isfinite(a_b) and (abs(a_b + 0.5) <= args.scaling_tol)
+        ok_rt = np.isfinite(a_rt) and (abs(a_rt - 0.0) <= args.rt_tol)
+        msg = f"SCALING [{dist}] α_b={a_b:.3f} vs -0.5 (tol {args.scaling_tol}) | α_rt={a_rt:.3f} vs 0.0 (tol {args.rt_tol})"
+        if ok_b and ok_rt:
+            print("PASS - " + msg)
+        else:
+            if args.strict:
+                raise AssertionError("FAIL - " + msg)
+            warnings.warn("WARN - " + msg)
 
 
 if __name__ == "__main__":
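
Usage note: the strict scaling check added in this commit can be exercised from the command line with the flags defined in the script's argparse above; an illustrative invocation (output paths are placeholders) might be:

python UTILS/dfextensions/quantile_fit_nd/bench_quantile_fit_nd.py --Ns 2000,5000,10000,20000,50000 --strict --plot --save_csv bench_scaling.csv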
