Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 74 additions & 59 deletions machine_learning_hep/analysis/analyzer_jets.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ def __init__(self, datap, case, typean, period):
suffix = f"results.{period}" if period is not None else "resultsallp"
self.d_resultsallpmc = self.cfg(f"mc.{suffix}")
self.d_resultsallpdata = self.cfg(f"data.{suffix}")
self.d_resultsallpfd = self.cfg(f"fd.{suffix}")

# input directories (processor output)
self.d_resultsallpdata_proc = self.cfg(f"data_proc.{suffix}")
self.d_resultsallpmc_proc = self.cfg(f"mc_proc.{suffix}")
self.d_resultsallpfd_proc = self.cfg(f"fd_proc.{suffix}")

# input files
n_filemass_name = datap["files_names"]["histofilename"]
self.n_filemass = os.path.join(self.d_resultsallpdata_proc, n_filemass_name)
self.n_filemass_mc = os.path.join(self.d_resultsallpmc_proc, n_filemass_name)
self.n_filemass_fd = os.path.join(self.d_resultsallpfd_proc, n_filemass_name)
self.n_fileeff = datap["files_names"]["efffilename"]
self.n_fileeff = os.path.join(self.d_resultsallpmc_proc, self.n_fileeff)
self.n_fileresp = datap["files_names"]["respfilename"]
Expand All @@ -83,9 +86,8 @@ def __init__(self, datap, case, typean, period):
self.p_pdfnames = datap["analysis"][self.typean].get("pdf_names")
self.p_param_names = datap["analysis"][self.typean].get("param_names")

# TODO: should come entirely from DB
self.observables = {
"qa": ["zg", "rg", "nsd", "zpar", "dr", "lntheta", "lnkt", "lntheta-lnkt"],
"qa": [*self.cfg("observables", {})],
"all": [*self.cfg("observables", {})],
}

Expand Down Expand Up @@ -114,7 +116,7 @@ def __init__(self, datap, case, typean, period):
for param, symbol in zip(
("mean", "sigma", "significance", "chi2"),
("#it{#mu}", "#it{#sigma}", "significance", "#it{#chi}^{2}"),
strict=False,
strict=True,
)
}
for level in self.fit_levels
Expand Down Expand Up @@ -209,7 +211,7 @@ def qa(self): # pylint: disable=invalid-name
# region efficiency
# pylint: disable=too-many-statements
def calculate_efficiencies(self):
self.logger.info("Calculating efficiencies")
self.logger.info("Calculating efficiencies from %s", self.n_fileeff)
cats = {"pr", "np"}
with TFile(self.n_fileeff) as rfile:
h_gen = {cat: rfile.Get(f"h_ptjet-pthf_{cat}_gen") for cat in cats}
Expand Down Expand Up @@ -586,9 +588,6 @@ def fit(self):
if iptjet is None:
if not fitcfg.get("per_ptjet"):
for jptjet in range(get_nbins(h, 1)):
self.logger.info(
"Overwriting roows_ptjet for %s iptjet %s ipt %d", level, jptjet, ipt
)
self.roows[(jptjet, ipt)] = roo_ws.Clone()
self.roo_ws[(level, jptjet, ipt)] = roo_ws.Clone()
if level in ("data", "mc"):
Expand Down Expand Up @@ -685,7 +684,6 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
# project out the mass regions (first axis)
axes = list(range(get_dim(hist)))[1:]
fh[region] = project_hist(hist, axes, {0: bins[region]})
self.logger.info("Projecting %s to %s in %s: %g entries", hist, axes, bins[region], fh[region].GetEntries())
self._save_hist(
fh[region], f"sideband/h_ptjet{label}_{region}_{string_range_pthf(range_pthf)}_{mcordata}.png"
)
Expand All @@ -702,11 +700,8 @@ def _subtract_sideband(self, hist, var, mcordata, ipt):
self.logger.info("Scaling sidebands in ptjet-%s bins: %s using %s", label, bins_ptjet, fh_sideband)
hx = project_hist(fh_sideband, (0,), {}) if get_dim(fh_sideband) > 1 else fh_sideband
for iptjet in bins_ptjet:
if iptjet:
n = hx.GetBinContent(iptjet)
self.logger.info("Need to scale in ptjet %i: %g", iptjet, n)
if n <= 0:
continue
if iptjet and hx.GetBinContent(iptjet) <= 0:
continue
rws = self.roo_ws.get((mcordata, iptjet, ipt))
if not rws:
self.logger.error("Falling back to incl. roows for %s-iptjet%i-ipt%i", mcordata, iptjet, ipt)
Expand Down Expand Up @@ -1063,57 +1058,77 @@ def _extract_signal(self, hist, var, mcordata, ipt):
# hres.Sumw2() # TODO: check if we should do this here
return hres

# region feeddown
# pylint: disable=too-many-statements
def estimate_feeddown(self):
self.logger.info("Estimating feeddown")
"""Estimate feeddown from legacy Run 2 trees or gen-only simulation"""
match self.cfg("fd_input", "tree"):
case "tree":
self.logger.info("Reading feeddown information from trees")
with TFile(self.cfg("fd_root")) as rfile:
powheg_xsection = rfile.Get("fHistXsection")
powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries()
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor)

df = pd.read_parquet(self.cfg("fd_parquet"))
col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping

# TODO: generalize to higher dimensions
h3_fd_gen_orig = {}
for var in self.observables["all"]:
bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d")
# TODO: generalize or derive from histogram?
bins_obs = {}
if binning := self.cfg(f"observables.{var}.bins_gen_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_gen_fix"):
bins_tmp = bin_array(*binning)
elif binning := self.cfg(f"observables.{var}.bins_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_fix"):
bins_tmp = bin_array(*binning)
else:
self.logger.error("no binning specified for %s, using defaults", var)
bins_tmp = bin_array(10, 0.0, 1.0)
bins_obs[var] = bins_tmp

with TFile(self.cfg("fd_root")) as rfile:
powheg_xsection = rfile.Get("fHistXsection")
powheg_xsection_scale_factor = powheg_xsection.GetBinContent(1) / powheg_xsection.GetEntries()
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1.0 / powheg_xsection_scale_factor)
colname = col_mapping.get(var, f"{var}_jet")
if f"{colname}" not in df:
if var is not None:
self.logger.error(
"No feeddown information for %s (%s), cannot estimate feeddown", var, colname
)
# print(df.info(), flush=True)
continue

df = pd.read_parquet(self.cfg("fd_parquet"))
col_mapping = {"dr": "delta_r_jet", "zpar": "z"} # TODO: check mapping
# TODO: derive histogram
h3_fd_gen_orig[var] = create_hist(
"h3_feeddown_gen",
f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}",
bins_ptjet,
self.bins_candpt,
bins_obs[var],
)
fill_hist_fast(h3_fd_gen_orig[var], df[["pt_jet", "pt_cand", f"{colname}"]])

case "sim":
h3_fd_gen_orig = {}
with TFile(self.n_filemass_fd) as rfile:
for var in self.observables["all"]:
self.logger.info("Running feeddown analysis for obs. %s", var)
label = f"-{var}" if var else ""
if fh := rfile.Get(f"h_mass-ptjet-pthf{label}"):
h3_fd_gen_orig[var] = project_hist(fh, list(range(1, get_dim(fh))), {})
h_norm = rfile.Get("histonorm")
powheg_xsection_avg = h_norm.GetBinContent(6) / h_norm.GetBinContent(5)
powheg_xsection_scale_factor = powheg_xsection_avg / h_norm.GetBinContent(5)
self.logger.info("powheg_xsection_scale_factor = %f", powheg_xsection_scale_factor)
self.logger.info("POWHEG luminosity (mb^{-1}): %g", 1. / powheg_xsection_scale_factor)

case fd_input:
self.logger.critical("Invalid feeddown input %s", fd_input)

# TODO: generalize to higher dimensions
for var in self.observables["all"]:
bins_ptjet = np.asarray(self.cfg("bins_ptjet"), "d")
# TODO: generalize or derive from histogram?
bins_obs = {}
if binning := self.cfg(f"observables.{var}.bins_gen_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_gen_fix"):
bins_tmp = bin_array(*binning)
elif binning := self.cfg(f"observables.{var}.bins_var"):
bins_tmp = np.asarray(binning, "d")
elif binning := self.cfg(f"observables.{var}.bins_fix"):
bins_tmp = bin_array(*binning)
else:
self.logger.error("no binning specified for %s, using defaults", var)
bins_tmp = bin_array(10, 0.0, 1.0)
bins_obs[var] = bins_tmp

colname = col_mapping.get(var, f"{var}_jet")
if f"{colname}" not in df:
if var is not None:
self.logger.error("No feeddown information for %s (%s), cannot estimate feeddown", var, colname)
# print(df.info(), flush=True)
continue

# TODO: derive histogram
h3_fd_gen_orig = create_hist(
"h3_feeddown_gen",
f";p_{{T}}^{{jet}} (GeV/#it{{c}});p_{{T}}^{{HF}} (GeV/#it{{c}});{var}",
bins_ptjet,
self.bins_candpt,
bins_obs[var],
)
fill_hist_fast(h3_fd_gen_orig, df[["pt_jet", "pt_cand", f"{colname}"]])
self._save_hist(project_hist(h3_fd_gen_orig, [0, 2], {}), f"fd/h_ptjet-{var}_feeddown_gen_noeffscaling.png")

# new method
h3_fd_gen = h3_fd_gen_orig.Clone()
h3_fd_gen = h3_fd_gen_orig[var].Clone()
ensure_sumw2(h3_fd_gen)
self._save_hist(project_hist(h3_fd_gen, [0, 2], {}), f"fd/h_ptjet-{var}_fdnew_gen.png")
# apply np efficiency
Expand Down Expand Up @@ -1158,7 +1173,7 @@ def estimate_feeddown(self):
h_fd_det = project_hist(h3_fd_det, [0, 2], {})

# old method
h3_fd_gen = h3_fd_gen_orig.Clone()
h3_fd_gen = h3_fd_gen_orig[var].Clone()
ensure_sumw2(h3_fd_gen)
for ipt in range(get_nbins(h3_fd_gen, 1)):
eff_pr = self.hcandeff["pr"].GetBinContent(ipt + 1)
Expand Down
6 changes: 6 additions & 0 deletions machine_learning_hep/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from enum import Enum

# Enumeration of the analysis input categories: Monte-Carlo, real data,
# and feeddown. Built with the enum functional API; member values are the
# lowercase tags used as configuration keys (e.g. "mc.results", "fd.results").
DataType = Enum("DataType", {"MC": "mc", "DATA": "data", "FD": "fd"})
DataType.__doc__ = "Kind of analysis input: Monte-Carlo, real data, or feeddown."
Loading
Loading