Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
170 changes: 66 additions & 104 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,13 @@ def __init__(self, datap, case, typean, period):
super().__init__(datap, case, typean, period)

# output directories
self.d_resultsallpmc = self.cfg(f"mc.results.{period}") if period is not None else self.cfg("mc.resultsallp")
self.d_resultsallpdata = (
self.cfg(f"data.results.{period}") if period is not None else self.cfg("data.resultsallp")
)
suffix = f"results.{period}" if period is not None else "resultsallp"
self.d_resultsallpmc = self.cfg(f"mc.{suffix}")
self.d_resultsallpdata = self.cfg(f"data.{suffix}")

# input directories (processor output)
self.d_resultsallpmc_proc = self.d_resultsallpmc
self.d_resultsallpdata_proc = self.d_resultsallpdata
# use a different processor output
if "data_proc" in datap["analysis"][typean]:
self.d_resultsallpdata_proc = (
self.cfg(f"data_proc.results.{period}") if period is not None else self.cfg("data_proc.resultsallp")
)
if "mc_proc" in datap["analysis"][typean]:
self.d_resultsallpmc_proc = (
self.cfg(f"mc_proc.results.{period}") if period is not None else self.cfg("mc_proc.resultsallp")
)
self.d_resultsallpdata_proc = self.cfg(f"data_proc.{suffix}")
self.d_resultsallpmc_proc = self.cfg(f"mc_proc.{suffix}")

# input files
n_filemass_name = datap["files_names"]["histofilename"]
Expand All @@ -93,6 +83,7 @@ def __init__(self, datap, case, typean, period):
self.p_pdfnames = datap["analysis"][self.typean].get("pdf_names")
self.p_param_names = datap["analysis"][self.typean].get("param_names")

# TODO: should come entirely from DB
self.observables = {
"qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"],
"all": [*self.cfg("observables", {})],
Expand All @@ -104,7 +95,6 @@ def __init__(self, datap, case, typean, period):
self.fit_levels = self.cfg("fit_levels", ["mc", "data"])
self.fit_sigma = {}
self.fit_mean = {}
self.fit_func_bkg = {}
self.fit_range = {}
self.hcandeff = {"pr": None, "np": None}
self.hcandeff_gen = {}
Expand All @@ -124,6 +114,7 @@ def __init__(self, datap, case, typean, period):
for param, symbol in zip(
("mean", "sigma", "significance", "chi2"),
("#it{#mu}", "#it{#sigma}", "significance", "#it{#chi}^{2}"),
strict=False,
)
}
for level in self.fit_levels
Expand All @@ -140,10 +131,8 @@ def __init__(self, datap, case, typean, period):
self.file_out_histo = TFile(self.n_fileresult, "recreate")

self.fitter = RooFitter()
self.roo_ws = {}
self.roo_ws_ptjet = {}
self.roows = {}
self.roows_ptjet = {}
self.roo_ws = {} # ROOT workspaces stored at various levels
self.roows = {} # ROOT workspaces at latest level

# region helpers
def _save_canvas(self, canvas, filename):
Expand Down Expand Up @@ -487,17 +476,13 @@ def _fit_mass(self, hist, filename=None):
# pylint: disable=too-many-branches,too-many-statements
def fit(self):
if not self.cfg("hfjet", True):
self.logger.info("Not fitting mass distributions for inclusive jets")
return
self.logger.info("Fitting inclusive mass distributions")
gStyle.SetOptFit(1111)
for level in self.fit_levels:
self.fit_mean[level] = [None] * self.nbins
self.fit_sigma[level] = [None] * self.nbins
self.fit_func_bkg[level] = [None] * self.nbins
self.fit_range[level] = [None] * self.nbins
self.roo_ws[level] = [None] * self.nbins
self.roo_ws_ptjet[level] = [[None] * self.nbins for _ in range(10)]
rfilename = self.n_filemass_mc if "mc" in level else self.n_filemass
fitcfg = None
self.logger.debug("Opening file %s.", rfilename)
Expand Down Expand Up @@ -528,23 +513,22 @@ def fit(self):
if h_invmass.GetEntries() < 100: # TODO: reconsider criterion
self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt)
continue
fit_res, _, func_bkg = self._fit_mass(
fit_res, _, _ = self._fit_mass(
h_invmass, f"fit/h_mass_fitted_{string_range_pthf(range_pthf)}_{level}.png"
)
if fit_res and fit_res.Get() and fit_res.IsValid():
self.fit_mean[level][ipt] = fit_res.Parameter(1)
self.fit_sigma[level][ipt] = fit_res.Parameter(2)
self.fit_func_bkg[level][ipt] = func_bkg
else:
self.logger.error("Fit failed for %s bin %d", level, ipt)
if self.cfg("mass_roofit"):
for entry in self.cfg("mass_roofit", []):
if lvl := entry.get("level"):
if lvl != level:
continue
if ptspec := entry.get("ptrange"):
if ptspec[0] > range_pthf[0] or ptspec[1] < range_pthf[1]:
continue
if (lvl := entry.get("level")) and lvl != level:
continue
if (ptspec := entry.get("ptrange")) and (
ptspec[0] > range_pthf[0] or ptspec[1] < range_pthf[1]
):
continue
fitcfg = entry
break
self.logger.debug("Using fit config for %i: %s", ipt, fitcfg)
Expand All @@ -559,7 +543,7 @@ def fit(self):
if h_invmass.GetEntries() < 100: # TODO: reconsider criterion
self.logger.error("Not enough entries to fit %s iptjet %s ipt %d", level, iptjet, ipt)
continue
roows = self.roows.get(ipt) if iptjet is None else self.roows_ptjet.get((iptjet, ipt))
roows = self.roows.get((iptjet, ipt))
if roows is None and level != self.fit_levels[0]:
self.logger.critical(
"missing previous fit result, cannot fit %s iptjet %s ipt %d", level, iptjet, ipt
Expand Down Expand Up @@ -596,18 +580,17 @@ def fit(self):
# roo_ws.Print()
# TODO: save snapshot per level
# roo_ws.saveSnapshot(level, None)
if iptjet is not None:
self.logger.debug("Setting roows_ptjet for %s iptjet %s ipt %d", level, iptjet, ipt)
self.roows_ptjet[(iptjet, ipt)] = roo_ws
self.roo_ws_ptjet[level][iptjet][ipt] = roo_ws
else:
self.logger.debug("Setting roows for %s iptjet %s ipt %d", level, iptjet, ipt)
self.roows[ipt] = roo_ws
self.roo_ws[level][ipt] = roo_ws
for jptjet in range(get_nbins(h, 1)):
self.roows_ptjet[(jptjet, ipt)] = roo_ws.Clone()
self.roo_ws_ptjet[level][jptjet][ipt] = roo_ws.Clone()
# TODO: take parameter names from DB
self.logger.info("Setting roows_ptjet for %s iptjet %s ipt %d", level, iptjet, ipt)
self.roows[(iptjet, ipt)] = roo_ws.Clone()
self.roo_ws[(level, iptjet, ipt)] = roo_ws.Clone()
if iptjet is None:
if not fitcfg.get("per_ptjet"):
for jptjet in range(get_nbins(h, 1)):
self.logger.info(
"Overwriting roows_ptjet for %s iptjet %s ipt %d", level, jptjet, ipt
)
self.roows[(jptjet, ipt)] = roo_ws.Clone()
self.roo_ws[(level, jptjet, ipt)] = roo_ws.Clone()
if level in ("data", "mc"):
varname_mean = fitcfg.get("var_mean", self.p_param_names["gauss_mean"])
varname_sigma = fitcfg.get("var_sigma", self.p_param_names["gauss_sigma"])
Expand All @@ -626,8 +609,6 @@ def fit(self):
ipt + 1, roo_ws.var(varname_sigma).getError()
)
varname_m = fitcfg.get("var", "m")
if roo_ws.pdf("bkg"):
self.fit_func_bkg[level][ipt] = roo_ws.pdf("bkg").asTF(roo_ws.var(varname_m))
self.fit_range[level][ipt] = (
roo_ws.var(varname_m).getMin("fit"),
roo_ws.var(varname_m).getMax("fit"),
Expand Down Expand Up @@ -660,12 +641,12 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
return None

for entry in self.cfg("sidesub", []):
if level := entry.get("level"):
if level != mcordata:
continue
if ptrange_sel := entry.get("ptrange"):
if ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt + 1]:
continue
if (level := entry.get("level")) and level != mcordata:
continue
if (ptrange_sel := entry.get("ptrange")) and (
ptrange_sel[0] > self.bins_candpt[ipt] or ptrange_sel[1] < self.bins_candpt[ipt + 1]
):
continue
regcfg = entry["regions"]
break
regions = {
Expand Down Expand Up @@ -699,7 +680,7 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):

fh = {}
area = {}
var_m = self.roows[ipt].var("m")
var_m = self.roows[(None, ipt)].var("m")
for region in regions:
# project out the mass regions (first axis)
axes = list(range(get_dim(hist)))[1:]
Expand All @@ -716,56 +697,37 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
[fh["sideband_left"], fh["sideband_right"]], f"h_ptjet{label}_sideband_{ipt}_{mcordata}"
)
ensure_sumw2(fh_sideband)

subtract_sidebands = False
if mcordata == "data" and self.cfg("sidesub_per_ptjet"):
self.logger.info("Subtracting sidebands in pt jet bins")
for iptjet in range(get_nbins(fh_subtracted, 0)):
if rws := self.roo_ws_ptjet[mcordata][iptjet][ipt]:
f = rws.pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m"))
else:
self.logger.error("Could not retrieve roows for %s-%i-%i", mcordata, iptjet, ipt)
continue
if mcordata == "data":
bins_ptjet = list(range(get_nbins(fh_subtracted, 0))) if self.cfg("sidesub_per_ptjet") else [None]
self.logger.info("Scaling sidebands in ptjet-%s bins: %s using %s", label, bins_ptjet, fh_sideband)
hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband
for iptjet in bins_ptjet:
if iptjet:
n = hx.GetBinContent(iptjet)
self.logger.info("Need to scale in ptjet %i: %g", iptjet, n)
if n <= 0:
continue
rws = self.roo_ws.get((mcordata, iptjet, ipt))
if not rws:
self.logger.error("Falling back to incl. roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt)
rws = self.roo_ws.get((mcordata, None, ipt))
if not rws:
self.logger.critical("Could not retrieve roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt)
f = rws.pdf("bkg").asTF(rws.var("m"))
area = {region: f.Integral(*limits[region]) for region in regions}
self.logger.info(
"areas for %s-%s: %g, %g, %g",
mcordata,
ipt,
area["signal"],
area["sideband_left"],
area["sideband_right"],
)
self.logger.info("areas for %s-iptjet%s-ipt%s: %s", mcordata, iptjet, ipt, area)
if (area["sideband_left"] + area["sideband_right"]) > 0.0:
subtract_sidebands = True
areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"])
# TODO: extend to higher dimensions
for ibin in range(get_nbins(fh_subtracted, 1)):
scale_bin(fh_sideband, areaNormFactor, iptjet + 1, ibin + 1)
else:
for region in regions:
f = self.roo_ws[mcordata][ipt].pdf("bkg").asTF(self.roo_ws[mcordata][ipt].var("m"))
area[region] = f.Integral(*limits[region])

self.logger.info(
"areas for %s-%s: %g, %g, %g",
mcordata,
ipt,
area["signal"],
area["sideband_left"],
area["sideband_right"],
)

if (area["sideband_left"] + area["sideband_right"]) > 0.0:
subtract_sidebands = True
areaNormFactor = area["signal"] / (area["sideband_left"] + area["sideband_right"])
fh_sideband.Scale(areaNormFactor)

# TODO: generalize and extend to higher dimensions
if iptjet is None:
fh_sideband.Scale(areaNormFactor)
else:
for ibin in range(get_nbins(fh_subtracted, 1)):
scale_bin(fh_sideband, areaNormFactor, iptjet + 1, ibin + 1)
fh_subtracted.Add(fh_sideband, -1.0)
self._save_hist(fh_sideband, f"sideband/h_ptjet{label}_sideband_{string_range_pthf(range_pthf)}_{mcordata}.png")
if subtract_sidebands:
fh_subtracted.Add(fh_sideband, -1.0)

self._clip_neg(fh_subtracted)

self._save_hist(
fh_subtracted,
f"sideband/h_ptjet{label}_subtracted_notscaled_{string_range_pthf(range_pthf)}_{mcordata}.png",
Expand Down Expand Up @@ -802,17 +764,17 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
self._save_canvas(c, filename)

# TODO: calculate per ptjet bin
roows = self.roows[ipt]
roows = self.roows[(None, ipt)]
roows.var("mean").setVal(self.fit_mean[mcordata][ipt])
roows.var("sigma_g1").setVal(self.fit_sigma[mcordata][ipt])
var_m.setRange("signal", *limits["signal"])
var_m.setRange("sidel", *limits["sideband_left"])
var_m.setRange("sider", *limits["sideband_right"])
# correct for reflections
if self.cfg("corr_refl") and (mcordata == "data" or not self.cfg("closure.filter_reflections")):
pdf_sig = self.roows[ipt].pdf("sig")
pdf_refl = self.roows[ipt].pdf("refl")
pdf_bkg = self.roows[ipt].pdf("bkg")
pdf_sig = self.roows[(None, ipt)].pdf("sig")
pdf_refl = self.roows[(None, ipt)].pdf("refl")
pdf_bkg = self.roows[(None, ipt)].pdf("bkg")
frac_sig = roows.var("frac").getVal() if mcordata == "data" else 1.0
frac_bkg = 1.0 - frac_sig
fac_sig = frac_sig * (1.0 - roows.var("frac_refl").getVal())
Expand Down Expand Up @@ -862,9 +824,9 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
self.h_reflcorr.SetBinContent(ipt + 1, corr)
fh_subtracted.Scale(corr)

pdf_sig = self.roows[ipt].pdf("sig")
pdf_sig = self.roows[(None, ipt)].pdf("sig")
frac_sig = pdf_sig.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal()
if pdf_peak := self.roows[ipt].pdf("peak"):
if pdf_peak := self.roows[(None, ipt)].pdf("peak"):
frac_peak = pdf_peak.createIntegral(var_m, ROOT.RooFit.NormSet(var_m), ROOT.RooFit.Range("signal")).getVal()
self.logger.info(
"correcting %s-%i for fractional signal area: %g (Gaussian: %g)", mcordata, ipt, frac_sig, frac_peak
Expand Down
Loading