Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 74 additions & 59 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ def __init__(self, datap, case, typean, period):
suffix = f"results.{period}" if period is not None else "resultsallp"
self.d_resultsallpmc = self.cfg(f"mc.{suffix}")
self.d_resultsallpdata = self.cfg(f"data.{suffix}")
self.d_resultsallpfd = self.cfg(f"fd.{suffix}")

# input directories (processor output)
self.d_resultsallpdata_proc = self.cfg(f"data_proc.{suffix}")
self.d_resultsallpmc_proc = self.cfg(f"mc_proc.{suffix}")
self.d_resultsallpfd_proc = self.cfg(f"fd_proc.{suffix}")

# input files
n_filemass_name = datap["files_names"]["histofilename"]
self.n_filemass = os.path.join(self.d_resultsallpdata_proc, n_filemass_name)
self.n_filemass_mc = os.path.join(self.d_resultsallpmc_proc, n_filemass_name)
self.n_filemass_fd = os.path.join(self.d_resultsallpfd_proc, n_filemass_name)
self.n_fileeff = datap["files_names"]["efffilename"]
self.n_fileeff = os.path.join(self.d_resultsallpmc_proc, self.n_fileeff)
self.n_fileresp = datap["files_names"]["respfilename"]
Expand All @@ -83,9 +86,8 @@ def __init__(self, datap, case, typean, period):
self.p_pdfnames = datap["analysis"][self.typean].get("pdf_names")
self.p_param_names = datap["analysis"][self.typean].get("param_names")

# TODO: should come entirely from DB
self.observables = {
"qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"],
"qa": [*self.cfg("observables", {})],
"all": [*self.cfg("observables", {})],
}

Expand Down Expand Up @@ -114,7 +116,7 @@ def __init__(self, datap, case, typean, period):
for param, symbol in zip(
("mean", "sigma", "significance", "chi2"),
("#it{#mu}", "#it{#sigma}", "significance", "#it{#chi}^{2}"),
strict=False,
strict=True,
)
}
for level in self.fit_levels
Expand Down Expand Up @@ -209,7 +211,7 @@ def qa(self): # pylint: disable=invalid-name
# region efficiency
# pylint: disable=too-many-statements
def calculate_efficiencies(self):
self.logger.info("Calculating efficiencies")
self.logger.info("Calculating efficiencies from %s", self.n_fileeff)
cats = {"pr", "np"}
with TFile(self.n_fileeff) as rfile:
h_gen = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_gen") for cat in cats}
Expand Down Expand Up @@ -586,9 +588,6 @@ def fit(self):
if iptjet is None:
if not fitcfg.get("per_ptjet"):
for jptjet in range(get_nbins(h, 1)):
self.logger.info(
"Overwriting roows_ptjet for %s iptjet %s ipt %d", level, jptjet, ipt
)
self.roows[(jptjet, ipt)] = roo_ws.Clone()
self.roo_ws[(level, jptjet, ipt)] = roo_ws.Clone()
if level in ("data", "mc"):
Expand Down Expand Up @@ -685,7 +684,6 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
# project out the mass regions (first axis)
axes = list(range(get_dim(hist)))[1:]
fh[region] = project_hist(hist, axes, {0: bins[region]})
self.logger.info("Projecting %s to %s in %s: %g entries", hist, axes, bins[region], fh[region].GetEntries())
self._save_hist(
fh[region], f"sideband/h_ptjet{label}_{region}_{string_range_pthf(range_pthf)}_{mcordata}.png"
)
Expand All @@ -702,11 +700,8 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
self.logger.info("Scaling sidebands in ptjet-%s bins: %s using %s", label, bins_ptjet, fh_sideband)
hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband
for iptjet in bins_ptjet:
if iptjet:
n = hx.GetBinContent(iptjet)
self.logger.info("Need to scale in ptjet %i: %g", iptjet, n)
if n <= 0:
continue
if iptjet and hx.GetBinContent(iptjet) <= 0:
continue
rws = self.roo_ws.get((mcordata, iptjet, ipt))
if not rws:
self.logger.error("Falling back to incl. roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt)
Expand Down Expand Up @@ -1063,57 +1058,77 @@ def _extract_signal(self, hist, var, mcordata, ipt):
# hres.Sumw2() # TODO: check if we should do this here
return hres

# region feeddown
# pylint: disable=too-many-statements
def estimate_feeddown(self):
self.logger.info("Estimating feeddown")
"""Estimate feeddown from legacy Run 2 trees or gen-only simulation"""
match self.cfg("fd_input", "tree"):
case "tree":
self.logger.info("Reading feeddown information from trees")
with TFile(self.cfg("fd_root")) as rfile:
powheg_xsection = rfile.Get("fHistXsection")
powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries()
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor)

df = pd.read_parquet(self.cfg("fd_parquet"))
col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping

# TODO: generalize to higher dimensions
h3_fd_gen_orig = {}
for var in self.observables["all"]:
bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d")
# TODO: generalize or derive from histogram?
bins_obs = {}
if binning := self.cfg(f"observables.{var}.bins_gen_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_gen_fix"):
bins_tmp = bin_array(*binning)
elif binning := self.cfg(f"observables.{var}.bins_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_fix"):
bins_tmp = bin_array(*binning)
else:
self.logger.error("no binning specified for %s, using defaults", var)
bins_tmp = bin_array(10, 0.0, 1.0)
bins_obs[var] = bins_tmp

with TFile(self.cfg("fd_root")) as rfile:
powheg_xsection = rfile.Get("fHistXsection")
powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries()
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor)
colname = col_mapping.get(var, f"{var}_jet")
if f"{colname}" not in df:
if var is not None:
self.logger.error(
"No feeddown information for %s (%s), cannot estimate feeddown", var, colname
)
# print(df.info(), flush=True)
continue

df = pd.read_parquet(self.cfg("fd_parquet"))
col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping
# TODO: derive histogram
h3_fd_gen_orig[var] = create_hist(
"h3_feeddown_gen",
f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}",
bins_ptjet,
self.bins_candpt,
bins_obs[var],
)
fill_hist_fast(h3_fd_gen_orig[var], df[["pt_jet", "pt_cand", f"{colname}"]])

case "sim":
h3_fd_gen_orig = {}
with TFile(self.n_filemass_fd) as rfile:
for var in self.observables["all"]:
self.logger.info("Running feeddown analysis for obs. %s", var)
label = f"-{var}" if var else ""
if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"):
h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {})
h_norm = rfile.Get("histonorm")
powheg_xsection_avg = h_norm.GetBinContent(6) / h_norm.GetBinContent(5)
powheg_xsection_scale_factor = powheg_xsection_avg / h_norm.GetBinContent(5)
self.logger.info("powheg_xsection_scale_factor = %f", powheg_xsection_scale_factor)
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1. / powheg_xsection_scale_factor)

case fd_input:
self.logger.critical("Invalid feeddown input %s", fd_input)

# TODO: generalize to higher dimensions
for var in self.observables["all"]:
bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d")
# TODO: generalize or derive from histogram?
bins_obs = {}
if binning := self.cfg(f"observables.{var}.bins_gen_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_gen_fix"):
bins_tmp = bin_array(*binning)
elif binning := self.cfg(f"observables.{var}.bins_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_fix"):
bins_tmp = bin_array(*binning)
else:
self.logger.error("no binning specified for %s, using defaults", var)
bins_tmp = bin_array(10, 0.0, 1.0)
bins_obs[var] = bins_tmp

colname = col_mapping.get(var, f"{var}_jet")
if f"{colname}" not in df:
if var is not None:
self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname)
# print(df.info(), flush=True)
continue

# TODO: derive histogram
h3_fd_gen_orig = create_hist(
"h3_feeddown_gen",
f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}",
bins_ptjet,
self.bins_candpt,
bins_obs[var],
)
fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]])
self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png")

# new method
h3_fd_gen = h3_fd_gen_orig.Clone()
h3_fd_gen = h3_fd_gen_orig[var].Clone()
ensure_sumw2(h3_fd_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen.png")
# apply np efficiency
Expand Down Expand Up @@ -1158,7 +1173,7 @@ def estimate_feeddown(self):
h_fd_det = project_hist(h3_fd_det, [0, 2], {})

# old method
h3_fd_gen = h3_fd_gen_orig.Clone()
h3_fd_gen = h3_fd_gen_orig[var].Clone()
ensure_sumw2(h3_fd_gen)
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_pr = self.hcandeff["pr"].GetBinContent(ipt + 1)
Expand Down
6 changes: 6 additions & 0 deletions machine_learning_hep/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum

# Enumeration of the analysis input categories: Monte-Carlo, real data,
# and feeddown. Built with the enum functional API; member values are the
# lowercase tags used as configuration keys (e.g. "mc.results", "fd.results").
DataType = Enum("DataType", {"MC": "mc", "DATA": "data", "FD": "fd"})
DataType.__doc__ = "Kind of analysis input: Monte-Carlo, real data, or feeddown."
Loading
Loading