Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions TrkPID/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# TrkPID

## Introduction

TrkPID is a machine learning algorithm trained to differentiate between conversion electrons and cosmic ray muons using information from both the tracker and the calorimeter.

## Workflow

The python code provided in the file [TrackPIDTrain.py](TrackPIDTrain.py) is used to:
* make a dataset containing conversion electrons and cosmic ray muons
* define the neural network structure
* train the algorithm and save the model weights into an ONNX file named "TrackPID.onnx"
* test the algorithm, providing performance metrics
* generate plots to provide more information on the dataset, how the training went, and how the model performs

Once the model is trained and the weights are saved in an ONNX file, this file can be used by TMVA::SOFIE to generate the inference code that can be used in Offline (for details about this process, check [this documentation](https://github.com/Mu2e/MLTrain/blob/main/TrkQual/README.md#converting-a-model-for-use-in-offline)).

## Version history

This version has been trained using MDC2020au datasets, and tested on MDC2020au and MDC2020aw, generated using Offline v11_00_00 and EventNtuple v06_07_00.
366 changes: 366 additions & 0 deletions TrkPID/TrackPIDTrain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
# TrackPIDTrain.py
# Make dataset, train and test artificial neural network for TrackPID
# This code works with MDC2020au and MDC2020aw datasets generated using Offline v11_00_00, EventNtuple v06_07_00.
# Author: Leo Borrel
# Date: 2025-10-29

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import uproot
import awkward as ak
import pandas as pd
import tensorflow as tf
import json
import tf2onnx
import onnx


# Input ROOT datasets (EventNtuple trees) and the csv files used to cache the trimmed data
CE_dataset_name = "nts.lborrel.CeMLeadingLogOnSpillTriggered.MDC2020au_perfect_v1_3.root" # CE MDC2020au dataset used for training
CE_csv_name = "array_test_CE.csv"
MU1_dataset_name = "nts.lborrel.CosmicCRYSignalAllOnSpillTriggered.MDC2020au_perfect_v1_3_part1.root" # Cosmic MDC2020au dataset used for training (divided in 2 parts because too large)
MU1_csv_name = "array_test_MU_1.csv"
MU2_dataset_name = "nts.lborrel.CosmicCRYSignalAllOnSpillTriggered.MDC2020au_perfect_v1_3_part2.root" # Cosmic MDC2020au dataset used for training (part 2)
MU2_csv_name = "array_test_MU_2.csv"

# Switches to turn on and off depending on which task needs to be performed
# Import the EventNtuple tree from the ROOT file, apply the cuts, and save the trimmed dataset into a csv file (one for conversion electron, one for cosmic muons)
# Use only once when using a new dataset
switch_make_dataset = False
# Train the machine learning model
# If False, it will reuse the model saved in "PID_model.keras" and the training history in "train_history.json"
switch_train = False
# Plot the result and the performance of the trained model
switch_plot = True


def import_evtntuple(filename, list_branches):
    # Read an EventNtuple ROOT file and return the requested branches as an awkward array
    # filename: path to the ROOT file
    # list_branches: list of branch names to load from the "EventNtuple/ntuple" tree

    root_file = uproot.open(filename)
    ntuple = root_file["EventNtuple/ntuple"]
    branches = ntuple.arrays(list_branches, library='ak')

    # debug printouts, kept for convenience
    #print("# of events: ", ak.num(branches, axis=0))
    #print("# of tracks: ", ak.count(branches['trk','trk.status']))

    return branches


def apply_cut(array, array_mc, particle):
    # Apply the selection cuts by iterating over events, tracks and track segments,
    # keeping only the branches useful for training and testing.
    # array: awkward array containing the reco branches
    # array_mc: awkward array containing the monte carlo branches
    # particle: 'e' for conversion electron dataset, 'mu' for cosmic muon dataset
    # Returns a flat pandas DataFrame with one row per selected track segment.
    # Raises ValueError if `particle` is not 'e' or 'mu'.

    if particle == "e":
        mc_pdg = 11         # PDG code of the electron
        label_particle = 1  # signal label
    elif particle == "mu":
        mc_pdg = 13         # PDG code of the muon
        label_particle = 0  # background label
    else:
        # Fail fast: otherwise mc_pdg/label_particle would be undefined and
        # the loop below would die with a confusing NameError
        raise ValueError("particle must be 'e' or 'mu', got '%s'" % particle)

    data_array = []
    for i_evt in range(ak.num(array, axis=0)): # iterate over events
        evt_it = array[i_evt]
        for i_trk in range(ak.num(evt_it['trk','trk.status'], axis=0)): # iterate over tracks
            if evt_it['trk','trk.pdg'][i_trk] != 11: # mask pdg hypothesis (keep electron fit hypothesis only)
                continue
            if array_mc['trkmcsim','pdg'][i_evt,i_trk,0] != mc_pdg: # mask mc pdg (keep the requested true particle)
                continue
            if evt_it['trkcalohit','trkcalohit.active'][i_trk] == 0: # require an associated calo hit
                continue
            if evt_it['trkqual','trkqual.result'][i_trk] < 0.2: # mask TrkQual (minimal track quality)
                continue
            trk_it = evt_it['trksegs'][i_trk]
            for i_trksegs in range(ak.num(trk_it['sid'], axis=0)): # iterate over track segments
                trksegs_it = trk_it[i_trksegs]
                if trksegs_it['sid'] != 1: # mask sid (keep a single surface id)
                    continue
                if trksegs_it['mom','fCoordinates','fZ'] < 0: # mask downstream (keep pz >= 0 segments)
                    continue
                if trksegs_it['mom','mag'] < 80 or trksegs_it['mom','mag'] > 130: # mask momentum (80-130 MeV window)
                    continue
                data_array.append({
                    'trkqual': evt_it['trkqual','trkqual.result'][i_trk],
                    'mom': trksegs_it['mom','mag'], 'time': trksegs_it['time'],
                    'edep': evt_it['trkcalohit','trkcalohit.edep'][i_trk],
                    'dt': evt_it['trkcalohit','trkcalohit.dt'][i_trk],
                    'trkdepth': evt_it['trkcalohit','trkcalohit.trkdepth'][i_trk],
                    'pocaX': evt_it['trkcalohit','trkcalohit.poca.fCoordinates.fX'][i_trk],
                    'pocaY': evt_it['trkcalohit','trkcalohit.poca.fCoordinates.fY'][i_trk],
                    'pocaZ': evt_it['trkcalohit','trkcalohit.poca.fCoordinates.fZ'][i_trk],
                    'pocamomX': evt_it['trkcalohit','trkcalohit.mom.fCoordinates.fX'][i_trk],
                    'pocamomY': evt_it['trkcalohit','trkcalohit.mom.fCoordinates.fY'][i_trk],
                    'pocamomZ': evt_it['trkcalohit','trkcalohit.mom.fCoordinates.fZ'][i_trk],
                    'label': label_particle,
                })

    df_array = pd.DataFrame(data_array)
    print(df_array)
    print(df_array.describe())

    return df_array


def make_dataset(particle, dataset_name, csv_name):
    # Import EventNtuple data, apply the selection cuts, and cache the trimmed
    # dataset (only the branches used for training/testing) in a csv file
    # particle: 'e' for conversion electron dataset ; 'mu' for cosmic muon dataset
    # dataset_name: path to the ROOT file containing the EventNtuple tree
    # csv_name: name of the csv file in which the trimmed dataset is saved

    branches_reco = ['trk','trkqual','trksegs','trksegpars_lh','trkcalohit']
    branches_mc = ['trkmc','trkmcsim','trksegsmc']
    #array_evt = import_evtntuple(dataset_name, ['evtinfo'])
    array = import_evtntuple(dataset_name, branches_reco)
    array_mc = import_evtntuple(dataset_name, branches_mc)

    # make mc time modulo event time (1695 ns — presumably the event window length; TODO confirm)
    array_mc['trksegsmc','time_mod'] = np.mod(array_mc['trksegsmc','time'], 1695)

    # add momentum magnitude branches: |p| = sqrt(px^2 + py^2 + pz^2)
    px = array['trksegs','mom','fCoordinates','fX']
    py = array['trksegs','mom','fCoordinates','fY']
    pz = array['trksegs','mom','fCoordinates','fZ']
    array['trksegs','mom','mag'] = np.sqrt(px**2 + py**2 + pz**2)

    px_mc = array_mc['trksegsmc','mom','fCoordinates','fX']
    py_mc = array_mc['trksegsmc','mom','fCoordinates','fY']
    pz_mc = array_mc['trksegsmc','mom','fCoordinates','fZ']
    array_mc['trksegsmc','mom','mag'] = np.sqrt(px_mc**2 + py_mc**2 + pz_mc**2)

    # make branches for PID
    #array['trkpid.deltaE'] = array['trkcalohit','trkcalohit.edep'] - array['trksegs','mom','mag']
    #array['trkcalohit','trkcalohit.dt'].show()

    df_array = apply_cut(array, array_mc, particle)

    # Save cut array into csv file
    df_array.to_csv(csv_name, index=True)


def train_model(dataframe):
    # Build, train and save the multilayer perceptron used for track PID.
    # dataframe: pandas DataFrame containing the feature columns and the 'label' column
    # Returns the trained keras model; also saves the model as "PID_model.keras",
    # the training history as "train_history.json" and the ONNX export as "TrackPID.onnx".
    # NOTE(review): relies on the module-level `features` / `n_features` variables.

    # Create the multilayer perceptron neural network
    PID_model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(n_features,), batch_size=32),
        tf.keras.layers.Dense(5, activation='relu'),
        tf.keras.layers.Dense(10, activation='relu'),
        tf.keras.layers.Dense(5, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')  # one sigmoid output: score of being a conversion electron
    ])

    # Setup loss, optimizer, metrics, and early stop condition for the model
    model_loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    model_optimizer = tf.keras.optimizers.Adam()
    model_metrics = [tf.keras.metrics.BinaryAccuracy(threshold=0.5), tf.keras.metrics.AUC(from_logits=False)]
    PID_model.compile(loss = model_loss, optimizer = model_optimizer, metrics = model_metrics)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", start_from_epoch = 100, patience = 20, restore_best_weights = True, verbose = 1)

    # summary() prints the architecture itself and returns None,
    # so wrapping it in print() would only add a spurious "None" line
    PID_model.summary()

    n_epochs = 500
    train_history = PID_model.fit(dataframe[features], dataframe['label'], epochs = n_epochs, validation_split=0.1, callbacks=[early_stop])
    train_history = train_history.history # extract the training history (loss as function of epochs)
    # Save model and training history
    PID_model.save("PID_model.keras")
    with open("train_history.json",'w') as history_file:
        json.dump(train_history, history_file)

    # export model in onnx format to be able to use it with SOFIE inference code ; manually enter name and shape of input and output for SOFIE
    PID_model.output_names = ['output']
    # `inp` instead of `input`: avoid shadowing the builtin input()
    onnx_signature = [tf.TensorSpec(inp.shape, dtype=inp.dtype, name=inp.name) for inp in PID_model.inputs]
    onnx_model, _ = tf2onnx.convert.from_keras(PID_model, input_signature=onnx_signature)
    onnx.save(onnx_model, "TrackPID.onnx")

    return PID_model


def make_results(model, dataset, dataset_name, threshold = 0.5):
    # Evaluate the trained model on a dataset, print its performance, and attach
    # the per-row score ('prediction') and thresholded label ('predict_label') columns
    # model: trained keras classifier
    # dataset: pandas DataFrame with the feature columns and the 'label' column
    # dataset_name: name used in the printouts (e.g. "train" or "test")
    # threshold: score above which a track is labeled as a conversion electron
    # Returns (augmented dataset, keras evaluation metrics, confusion matrix)

    results = model.evaluate(dataset[features], dataset['label'])
    print("\n", dataset_name, "loss,", dataset_name, "accuracy,", dataset_name, "AUC:", results, "\n")

    # attach the raw score and the label obtained by cutting on it
    dataset['prediction'] = model.predict(dataset[features])
    dataset['predict_label'] = (dataset['prediction'] > threshold).astype(int)

    # Create confusion matrix (row = true label, column = predicted label)
    confusion_matrix = tf.math.confusion_matrix(dataset['label'], dataset['predict_label'], num_classes=2)
    print("\n Confusion matrix: \n [ True negative (correctly labeled cosmic muons) ; False positive (mislabeled cosmic muons) ] \n [ False negative (mislabeled conversion electrons) ; True positive (correctly labeled conversion electron) ]\n", confusion_matrix)

    (true_negative, false_positive), (false_negative, true_positive) = confusion_matrix.numpy()

    TPR = true_positive / (true_positive + false_negative)  # signal efficiency
    TNR = true_negative / (true_negative + false_positive)  # background rejection

    print("\n", dataset_name, " dataset results:\n")
    print("True Positive Rate (correctly labeled conversion electrons / all conversion electrons): ", 100*TPR, "%")
    print("True Negative Rate (correcly labeled cosmic muons / all cosmic muons): ", 100*TNR, "%")
    print("False Positive Rate (mislabeled conversion electrons / all conversion electrons): ", 100*(1-TPR), "%")
    print("False Negative Rate (mislabeled cosmic muons / all cosmic muons): ", 100*(1-TNR), "%\n")

    return dataset, results, confusion_matrix


## Main code

if switch_make_dataset:
    # Build the trimmed csv datasets from the EventNtuple ROOT files (only needed once per new dataset).
    # Write to the same csv files that are read back just below — the original code referenced an
    # undefined MU_dataset_name and wrote to temporary names that were never read.
    make_dataset('e', CE_dataset_name, CE_csv_name)
    make_dataset('mu', MU1_dataset_name, MU1_csv_name)
    make_dataset('mu', MU2_dataset_name, MU2_csv_name)

df_CE = pd.read_csv(CE_csv_name, index_col=0)
df_MU1 = pd.read_csv(MU1_csv_name, index_col=0)
df_MU2 = pd.read_csv(MU2_csv_name, index_col=0) # if the cosmic dataset is too big, separate it in 2 pieces and add it to the concatenation list in the next line
df = pd.concat([df_CE, df_MU1, df_MU2], axis=0)

# Make input features

df['deltaE'] = df['edep'] - df['mom'] # calo cluster energy minus track momentum
df['rpoca'] = np.sqrt(np.power(df['pocaX'],2) + np.power(df['pocaY'],2)) # transverse distance of the point of closest approach
# cosine of the angle between the transverse POCA position and the transverse momentum
# (reuse df['rpoca'] instead of recomputing the same sqrt)
df['trkdir'] = ( df['pocaX'] * df['pocamomX'] + df['pocaY'] * df['pocamomY'] ) / ( df['rpoca'] * np.sqrt(np.power(df['pocamomX'],2) + np.power(df['pocamomY'],2)) )
features = ['deltaE','rpoca','trkdir','dt']
n_features = len(features)
df_feature = df[features+['label']].copy()


# Shuffle and divide the dataset into a testing dataset with 5000 entries and a training dataset with the rest of the events
df_shuffle = df_feature.sample(frac=1)
df_test = df_shuffle.iloc[:5000,:]
df_train = df_shuffle.iloc[5000:,:]


if switch_train:
    PID_model = train_model(df_train)
else: # Use an already trained model saved in keras format
    PID_model = tf.keras.models.load_model("PID_model.keras")
    with open("train_history.json",'r') as history_file: # open file containing the training history to plot later
        train_history = json.load(history_file)
    n_epochs = len(train_history['loss'])

df_train,results_train,confusion_matrix_train = make_results(PID_model, df_train, "train", 0.5)
df_test,results_test,confusion_matrix_test = make_results(PID_model, df_test, "test", 0.5)

# Create datasets based on MC particle information
df_train_e = df_train[df_train["label"] == 1]
df_train_mu = df_train[df_train["label"] == 0]
df_test_e = df_test[df_test["label"] == 1]
df_test_mu = df_test[df_test["label"] == 0]


## Plots

def plot_dataset(csv_name, particle):
    # Plot basic distributions (momentum, deltaE, deltaT) from a trimmed csv dataset
    # csv_name: csv file produced by make_dataset
    # particle: particle name used in the plot titles

    df = pd.read_csv(csv_name, index_col=0)

    # plot of training features
    fig,ax = plt.subplots(1,1)
    ax.hist(df['mom'], bins=100)
    ax.set_xlabel("reco mom [MeV]")
    ax.set_title("Reconstructed momentum of "+particle)

    fig,ax = plt.subplots(1,1)
    ax.hist(df['edep']-df['mom'], bins=100)
    ax.set_xlabel("deltaE [MeV]")
    ax.set_title("Difference between calorimeter cluster energy and track momentum of "+particle)

    fig,ax = plt.subplots(1,1)
    ax.hist(df['dt'], bins=100)
    ax.set_xlabel("deltaT [ns]") # fixed units: dt is a time difference, not an energy ([ns] assumed — confirm)
    ax.set_title("Difference between calorimeter cluster time and tracker time of "+particle)


def plot_feature(dataset_e, dataset_mu, feature, scale = 'linear'):
    # Overlay the distribution of one feature for the two classes
    # dataset_e: DataFrame for conversion electrons (blue)
    # dataset_mu: DataFrame for cosmic muons (red)
    # feature: column name to histogram
    # scale: y-axis scale ('linear' or 'log')

    lo = min(min(dataset_e[feature]), min(dataset_mu[feature]))
    hi = max(max(dataset_e[feature]), max(dataset_mu[feature]))

    fig, ax = plt.subplots(1, 1)
    # both histograms share the same range and binning so they are directly comparable
    for values, color in ((dataset_e[feature], 'b'), (dataset_mu[feature], 'r')):
        ax.hist(values, color=color, alpha=0.5, range=(lo, hi), bins=100, density=False)

    margin = 0.05 * (hi - lo)
    ax.set_xlabel(feature)
    ax.set_xlim(lo - margin, hi + margin)
    ax.set_yscale(scale)
    ax.set_ylabel("# of events")
    ax.set_title(feature)
    ax.legend(["conversion electrons", "cosmic muons"], loc='best')


def plot_ROC(dataset):
    # Plot the ROC (Receiver Operating Characteristic) curve and a threshold scan
    # dataset: pandas DataFrame with 'label' and 'prediction' columns (as filled by make_results)
    # Returns the dataset with an extra 'predict_label_temp' column (labels at the last scanned threshold)

    n_points = 101
    # Make a list of threshold values not equally distant (follow a power law to have more points close to 0 and 1)
    list_threshold = np.concatenate((0.5 * np.power(np.linspace(0,1,n_points//2+1),4), 1 - 0.5 * np.power(np.linspace(1,0,n_points//2+1),4)), axis=0)
    # both halves end/start at 0.5, so drop the duplicated midpoint to get exactly n_points values
    list_threshold = np.delete(list_threshold, n_points//2+1)

    true_negative = np.zeros(n_points)
    false_positive = np.zeros(n_points)
    false_negative = np.zeros(n_points)
    true_positive = np.zeros(n_points)
    dataset['predict_label_temp'] = dataset['label']

    # fill the confusion-matrix entries at every threshold value
    for i in range(n_points):
        dataset['predict_label_temp'] = (dataset['prediction'] >= list_threshold[i]).astype(int)
        confusion_matrix_temp = tf.math.confusion_matrix(dataset['label'], dataset['predict_label_temp'], num_classes=2)

        true_negative[i], false_positive[i] = confusion_matrix_temp[0].numpy()
        false_negative[i], true_positive[i] = confusion_matrix_temp[1].numpy()

    TPR = true_positive / (true_positive + false_negative) # signal efficiency
    TNR = true_negative / (true_negative + false_positive) # background rejection
    FNR = 1 - TPR
    FPR = 1 - TNR

    purity = true_positive / (true_positive + false_positive)
    significance = true_positive / np.sqrt(true_positive + false_positive)
    # nanargmax: purity/significance are NaN at thresholds where TP + FP == 0
    max_significance_idx = np.nanargmax(significance)

    print("\n Max significance at threshold = ", list_threshold[max_significance_idx])
    print("\n Accuracy at this threshold = ", TPR[max_significance_idx], "\n")

    # ROC curve: signal efficiency vs background rejection
    fig,ax = plt.subplots(1,1)
    ax.plot(TPR, TNR, '-b')
    ax.set_xlabel("Signal efficiency (true positive rate)")
    ax.set_ylabel("Background rejection (true negative rate)")
    ax.set_title("ROC curve")

    # threshold scan: efficiency / rejection / purity on the left axis, significance on the right axis
    fig,ax = plt.subplots(1,1)
    ax.plot(list_threshold, TPR, '-k')
    ax.plot(list_threshold, TNR, '-b')
    ax.plot(list_threshold, purity, '-g')

    ax2 = ax.twinx()
    ax2.plot(list_threshold, significance, '-r')

    ax.set_xlabel("Cut threshold value")
    ax.set_ylabel("Efficiency / Purity")
    ax2.set_ylabel("Significance")
    ax.legend(["Signal efficiency", "Background rejection", "Signal purity"], loc="lower left")
    ax2.legend(["Significance = S/sqrt(S+B)"], loc="lower right")

    return dataset


def plot_history(history_file, result):
    # Plot the training and validation loss vs epoch, plus the final test loss as a single point
    # history_file: path to the json file containing the keras training history
    # result: list of evaluation metrics from model.evaluate; result[0] is the test loss

    with open(history_file, 'r') as json_file:
        history = json.load(json_file)

    train_loss = history["loss"]

    fig, ax = plt.subplots(1, 1)
    ax.plot(train_loss)
    ax.plot(history["val_loss"])
    # mark the test loss at the epoch following the last training epoch
    ax.plot(len(train_loss), result[0], marker='o', linestyle='None')

    ax.set_xlabel("Epoch")
    ax.set_ylabel("Loss")
    ax.set_title("Binary cross entropy loss")
    ax.legend(["Train", "Validation","Test"], loc='best')


# Comment on and off to choose which group of plots to display
if switch_plot:
    # raw dataset distributions (momentum, deltaE, deltaT) for the conversion electrons
    plot_dataset(CE_csv_name,"conversion electrons")
    #plot_dataset(MU1_csv_name,"cosmic muons")
    # input feature distributions, split by true particle type (test dataset)
    for feature in features:
        plot_feature(df_test_e, df_test_mu, feature)
    # model output score distributions (log scale) on train and test datasets
    plot_feature(df_train_e, df_train_mu, "prediction", 'log')
    plot_feature(df_test_e, df_test_mu, "prediction", 'log')
    # ROC curve and threshold scan on the test dataset
    df_test = plot_ROC(df_test)
    # loss history with the final test loss overlaid
    plot_history("train_history.json", results_test)

plt.show()