Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions machine_learning_hep/multitrial/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Multitrial systematics with MLHEP

## Generate configurations (MLHEP yml databases) for each trial

File: `run-mlhep-fitter-multitrial.py`<br>
Usage: `python run-mlhep-fitter-multitrial.py database_file in_db_dir out_db_dir mlhep_results_dir_pattern`

Arguments:
- `database_file`: filename of the template database without the .yml extension, e.g., `database_ml_parameters_LcToPKPi`
- `in_db_dir`: path to the directory containing the database, e.g., `data/data_run3`
- `out_db_dir`: path to the directory for output multitrial databases, e.g., `multitrial_db`
- `mlhep_results_dir_pattern`: prefix of output directory name for fit results; for each trial, the trial name is appended to the directory name, and the resulting directory name is written under `Run3analysis/{data,mc}/prefix_dir_res` in the database file

Adjust `DIR_PATH` in the script. It is the path to the base directory where you store directories with MLHEP results.

This script needs to be ran only once to generate databases.

Currently, the trials are hardcoded in the Python script. To add or modify a trial, you need to adjust `BASE_TRIALS` variable and the `process_trial` function.

## Get mass fits for each trial

File: `run-mlhep-fitter-multitrial.sh`<br>
Usage: `./run-mlhep-fitter-multitrial.sh`

The `submission/analyzer.yml` config is used.
The script automates running MLHEP for each trial. Mass histograms are copied before each MLHEP invocation, so as only the quick fit steps needs to be activated in `submission/analyzer.yml`

Adjust the variables before the `for` loop.<br>
The script includes also a call to `run-mlhep-fitter-multitrial.py`, which can be commented out. In this case, make sure to pass the same `OUT_DB_DIR`, `DB_PATTERN`, `DIR_PATTERN` values to the two scripts.

Before running, you need to create the directory structure for each MLHEP output. You can, for example, run the `.sh` script with the `cp` lines commented out. Then, MLHEP creates directories for each trial and fails quietly. Next, run the script with `cp` lines uncommented, and you will get the final output.

## Plot multitrial results

Files: `multitrial.py`, `config_multitrial.json`<br>
Usage: `python3 multitrial.py config_multitrial.json`

Adjust the sample `config_multitrial.json` to your needs.
20 changes: 20 additions & 0 deletions machine_learning_hep/multitrial/config_multitrial.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"file_pattern": "/data8/majak/MLHEP/results-24022025-newtrain-multitrial-prompt*/LHC23pp_pass4/Results/resultsdatatot/yields_LcpKpi_Run3analysis.root",
"_file_pattern": "regex pattern for all multitrial fit files; note the asterisk to match all trial suffixes",
"dir_pattern": "results-24022025-newtrain-multitrial-prompt",
"_dir_pattern": "the base directory prefix from the file pattern above",
"histoname": "hyields0",
"_histoname": "histogram with mass fit",
"sel_histoname": "hchi0",
"_sel_histoname": "histogram for filtering the results",
"selection": "lambda x : x < 5.0",
"_selection": "filter to apply with sel_histoname, e.g., chi < 5",
"pt_bins_min": [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16],
"pt_bins_max": [2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 24],
"central_trial": "",
"_central_trial": "suffix of the directory with the central trial",
"outdir": "/data8/majak/multitrial",
"_outdir": "output directory",
"outfile": "result-prompt-chi5",
"_outfile": "output file pattern"
}
187 changes: 187 additions & 0 deletions machine_learning_hep/multitrial/multitrial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
# pylint: disable=missing-function-docstring, invalid-name
"""
file: multitrial.py
brief: Plot multitrial systematics based on multiple fit trials, one file per trial.
usage: python3 multitrial.py config_multitrial.json
author: Maja Karwowska <mkarwowska@cern.ch>, Warsaw University of Technology
"""
import argparse
import glob
import json
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator, AutoMinorLocator

from ROOT import ( # pylint: disable=import-error,no-name-in-module
TFile,
gROOT,
)


def plot_text_box(ax, text):
ax.text(0.98, 0.97, text,
horizontalalignment="right", verticalalignment="top",
fontsize=40, va="top", transform=ax.transAxes,
bbox={"edgecolor": "black", "fill": False})


def get_yields(cfg):
filenames = sorted(glob.glob(cfg["file_pattern"]),
key=lambda filename: re.split("/", filename)[-2])
yields = {}
yields_err = {}
trials = {}
chis = {}
for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]):
yields[f"{pt_bin_min}_{pt_bin_max}"] = []
yields_err[f"{pt_bin_min}_{pt_bin_max}"] = []
trials[f"{pt_bin_min}_{pt_bin_max}"] = []
chis[f"{pt_bin_min}_{pt_bin_max}"] = []
for filename in filenames:
print(f"Reading {filename}")
with TFile.Open(filename) as fin:
hist = fin.Get(cfg["histoname"])
hist_sel = fin.Get(cfg["sel_histoname"])
if hist.ClassName() != "TH1F":
print(f"No hist in {filename}")
if hist_sel.ClassName() != "TH1F":
print(f"No hist sel in {filename}")
dirname = re.split("/", filename)[4] # [-2] for D2H fitter
trial_name = dirname.replace(cfg["dir_pattern"], "")
for ind, (pt_bin_min, pt_bin_max) in enumerate(zip(cfg["pt_bins_min"],
cfg["pt_bins_max"])):
if eval(cfg["selection"])(hist_sel.GetBinContent(ind + 1)) \
and hist.GetBinContent(ind + 1) > 1.0 : # pylint: disable=eval-used
yields[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinContent(ind + 1))
yields_err[f"{pt_bin_min}_{pt_bin_max}"].append(hist.GetBinError(ind + 1))
trials[f"{pt_bin_min}_{pt_bin_max}"].append(trial_name)
chis[f"{pt_bin_min}_{pt_bin_max}"].append(hist_sel.GetBinContent(ind + 1))
else:
print(f"Rejected: {hist_sel.GetBinContent(ind + 1)} {trial_name} "\
f"pt: {pt_bin_min}, {pt_bin_max}")
if hist.GetBinContent(ind + 1) < 1.0:
print("Yield 0")
return yields, yields_err, trials, chis


def prepare_figure(y_label, ticks):
fig = plt.figure(figsize=(20, 15))
ax = plt.subplot(1, 1, 1)
ax.set_xlabel("Trial #", fontsize=20)
ax.set_ylabel(y_label, fontsize=20)
ax.tick_params(which="both", width=2.5, direction="in")
ax.tick_params(which="major", labelsize=20, length=15)
ax.tick_params(which="minor", length=7)
ax.xaxis.set_major_locator(MultipleLocator(ticks))
ax.xaxis.set_minor_locator(AutoMinorLocator(5))
ax.yaxis.set_minor_locator(AutoMinorLocator(5))
return fig, ax


def set_ax_limits(ax, pt_string, values):
ax.margins(0.01, 0.2)
np_values = np.array(values, dtype="float32")
if ax.get_ylim()[1] - ax.get_ylim()[0] > 30.0 * np.std(np_values):
ax.set_ylim(np.mean(np_values) - 10.0 * np.std(np_values),
np.mean(np_values) + 10.0 * np.std(np_values))
print(f"{pt_string} narrowing down the axis to {ax.get_ylim()}")


def plot_trial_line(ax, central_trial_ind):
axis_lim = ax.get_ylim()
y_axis = np.linspace(*axis_lim, 100)
ax.plot([central_trial_ind] * len(y_axis), y_axis, c="m", ls="--", linewidth=4.0)
ax.set_ylim(*axis_lim)


def plot_yields_trials(yields, yields_err, trials, cfg, pt_string, plot_pt_string,
central_trial_ind, central_yield):
fig, ax = prepare_figure("Raw yield", 100)
x_axis = range(len(trials))
ax.errorbar(x_axis, yields, yerr=yields_err,
fmt="o", c="b", elinewidth=2.5, linewidth=4.0)
set_ax_limits(ax, pt_string, yields)
central_line = np.array([central_yield] * len(x_axis), dtype="float32")
ax.plot(x_axis, central_line, c="orange", ls="--", linewidth=4.0)
central_err = np.array([yields_err[central_trial_ind]] * len(x_axis), dtype="float32")
ax.fill_between(x_axis, central_line - central_err, central_line + central_err,
facecolor="orange", edgecolor="none", alpha=0.3)
plot_trial_line(ax, central_trial_ind)
plot_text_box(ax, plot_pt_string)
fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_yields_trials_{pt_string}.png',
bbox_inches='tight')
plt.close()


def plot_chis(chis, cfg, pt_string, plot_pt_string):
fig, ax = prepare_figure("Chi2/ndf", 100)
x_axis = range(len(chis))
ax.scatter(x_axis, chis, c="b", marker="o")
set_ax_limits(ax, pt_string, chis)
plot_text_box(ax, plot_pt_string)
fig.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_chis_{pt_string}.png',
bbox_inches='tight')
plt.close()


def plot_yields_distr(yields, cfg, pt_string, plot_pt_string, central_trial_ind, central_yield):
plt.figure(figsize=(20, 15))
ax = plt.subplot(1, 1, 1)
ax.set_xlabel("Ratio", fontsize=20)
ax.tick_params(labelsize=20, length=7, width=2.5)
ratios = [yield_ / central_yield for ind, yield_ in enumerate(yields) \
if ind != central_trial_ind]
ax.hist(ratios, color="b", linewidth=4.0)
mean = np.mean(yields)
std_dev = np.std(yields)
diffs = [(yield_ - central_yield) / central_yield \
for yield_ in yields[:central_trial_ind]]
diffs.extend([(yield_ - central_yield) / central_yield \
for yield_ in yields[central_trial_ind+1:]])
rmse = np.sqrt(np.mean(np.array(diffs, dtype="float32")**2))
plot_text_box(ax, f"{plot_pt_string}\n"\
f"mean: {mean:.0f}\n"\
f"std dev: {std_dev:.2f}\n"\
f"RMSE: {rmse:.2f}\n"\
f"#trials: {len(yields)}")
plt.savefig(f'{cfg["outdir"]}/{cfg["outfile"]}_distr_{pt_string}.png', bbox_inches='tight')
plt.close()


def main():
gROOT.SetBatch(True)

parser = argparse.ArgumentParser(description="Arguments to pass")
parser.add_argument("config", help="JSON config file")
args = parser.parse_args()

with open(args.config, encoding="utf8") as fil:
cfg = json.load(fil)

yields, yields_err, trials, chis = get_yields(cfg)

for pt_bin_min, pt_bin_max in zip(cfg["pt_bins_min"], cfg["pt_bins_max"]):
plot_pt_string = f"${pt_bin_min} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {pt_bin_max}$"
pt_string = f"{pt_bin_min}_{pt_bin_max}"

try:
central_trial_ind = trials[pt_string].index(cfg["central_trial"])
central_yield = yields[pt_string][central_trial_ind]

plot_yields_trials(yields[pt_string], yields_err[pt_string], trials[pt_string], cfg,
pt_string, plot_pt_string, central_trial_ind, central_yield)
plot_yields_distr(yields[pt_string], cfg, pt_string, plot_pt_string,
central_trial_ind, central_yield)
plot_chis(chis[pt_string], cfg, pt_string, plot_pt_string)
except: # pylint: disable=bare-except
pass

with open(f'{cfg["outdir"]}/{cfg["outfile"]}_trials_{pt_string}.txt',
"w", encoding="utf-8") as ftext:
for trial in trials[pt_string]:
ftext.write(f"{trial}\n")


if __name__ == "__main__":
main()
Loading
Loading