Commit 174764b committed
Add results with updated library (fragments with zero intensity removed).
1 parent 619338e

33 files changed
Lines changed: 38683 additions & 819 deletions

README.md

Lines changed: 8 additions & 6 deletions
@@ -4,12 +4,14 @@ PeakDecoder is a machine learning-based metabolite identification algorithm for
 
 ### Workflow
 
-* Step-1: data is processed in untargeted mode (using MS-DIAL) to extract all precursor ion features (MS1) and their respective deconvoluted fragment ions (pseudo MS2) based on co-elution and co-mobility.
-* Step-2: a preliminary training set is generated by using the detected and deconvoluted peak-groups as targets and producing their corresponding decoys.
-* Step-3: targeted data extraction is performed (using Skyline) to extract the precursor and fragment ion signals for the training set from all the LC-IM-MS runs and export their XIC metrics.
-* Step-4: an SVM classifier is trained using multiple scores calculated from the XIC metrics of the training set. Before training, filtering for high-quality fragments is applied to keep high-quality peak-groups as targets (i.e., based on various thresholds for metrics of the precursor and at least 3 fragments: S/N, mass error, RT difference to precursor, and FWHM difference to precursor; details in Methods) and their corresponding decoys in the final training set. The model learns to distinguish true and false co-elution and co-mobility, independently of the features’ metabolite identity.
-* Step-5: TDX is performed to extract the signals of the query set of metabolites in the library from all the LC-IM-MS runs and export their XIC metrics.
-* Step-6: the trained model is used to determine the PeakDecoder score of the query set of metabolites and estimate an FDR.
+* <b>Step-1, Feature finding and fragment ion deconvolution:</b> data is processed in untargeted mode (using MS-DIAL) to extract all precursor ion features (MS1) and their respective deconvoluted fragment ions (pseudo MS2) based on co-elution and co-mobility.
+The alignment (Peak ID matrix, msp format) and all peak lists (txt, centroid) should be exported from MS-DIAL.
+* <b>Step-2, Target and decoy generation:</b> a preliminary training set is generated by using the detected and deconvoluted peak-groups as targets and producing their corresponding decoys.
+* <b>Step-3, Targeted data extraction for training:</b> targeted data extraction is performed (using Skyline) to extract the precursor and fragment ion signals for the training set from all the LC-IM-MS runs and export their XIC metrics.
+The Skyline report should include the required XIC metrics: area, height, mass error, FWHM (LC), RT, expected RT, and expected CCS.
+* <b>Step-4, Machine learning training:</b> an SVM classifier is trained using multiple scores calculated from the XIC metrics of the training set. Before training, filtering for high-quality fragments is applied to keep high-quality peak-groups as targets (i.e., based on various thresholds for metrics of the precursor and at least 3 fragments: S/N, mass error, RT difference to precursor, and FWHM difference to precursor) and their corresponding decoys in the final training set. The model learns to distinguish true and false co-elution and co-mobility, independently of the features’ metabolite identity.
+* <b>Step-5, Targeted data extraction for inference:</b> TDX is performed to extract the signals of the query set of metabolites in the library from all the LC-IM-MS runs and export their XIC metrics.
+* <b>Step-6, Machine learning inference:</b> the trained model is used to determine the PeakDecoder score of the query set of metabolites and estimate a false discovery rate (FDR). Results can be filtered using the PeakDecoder score corresponding to the estimated FDR threshold, taken from a table of (FDR, PeakDecoder score) pairs automatically generated after training (file PeakDecoder-FDR-thresholds_[dataset].csv).
 
 
 ### Data
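Step-6 filters results with a score cutoff looked up from the generated (FDR, PeakDecoder score) table. A minimal sketch of that lookup, in Python for illustration only (the pipeline itself is in R; `score_cutoff` is a hypothetical helper, and the example pairs reuse the 1% FDR = 0.96 and 3% FDR = 0.9 values that appear in the scripts' comments):

```python
# Illustrative sketch, NOT the repository's code: choose the PeakDecoder
# score cutoff for a desired FDR, then keep results at or above it.

def score_cutoff(fdr_score_pairs, target_fdr):
    """Return the lowest score whose estimated FDR is <= target_fdr."""
    eligible = [score for fdr, score in fdr_score_pairs if fdr <= target_fdr]
    if not eligible:
        raise ValueError("no score achieves the requested FDR")
    return min(eligible)

# Example (FDR, score) pairs; real values come from
# PeakDecoder-FDR-thresholds_[dataset].csv after training.
pairs = [(0.05, 0.70), (0.03, 0.90), (0.01, 0.96)]
cutoff = score_cutoff(pairs, target_fdr=0.01)

# Hypothetical scored query results filtered at the cutoff:
results = [("mol-A", 0.97), ("mol-B", 0.80)]
passing = [name for name, s in results if s >= cutoff]
```

Lower FDR targets map to stricter (higher) score cutoffs, so tightening the FDR shrinks the passing set.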

ScoringInference.R

Lines changed: 20 additions & 9 deletions
@@ -25,16 +25,18 @@ cutoff.PeakDecoderScore = 0.8 #
 #inputFileSamples = "Skyline-Report_Pput_IMS_MS1-and-MSMS.csv"
 #cutoff.PeakDecoderScore = 0.96 # 1% FDR
 
-#setwd(file.path(basePathData,"data/Rhodo"))
-#inputFileSamples = "Skyline-Report_Rhodo_IMS_MS1-and-MSMS.csv"
-#cutoff.PeakDecoderScore = 0.9 # 3% FDR
+# setwd(file.path(basePathData,"data/Rhodo"))
+# inputFileSamples = "Skyline-Report_Rhodo_IMS_MS1-and-MSMS.csv"
+# #inputFileSamples = "Skyline-Report_Rhodo-trypD5_IMS_MS1-and-MSMS.csv"
+# #cutoff.PeakDecoderScore = 0.9 # 3% FDR
+# cutoff.PeakDecoderScore = 0.96 # 1% FDR
 
 sampleString = basename(getwd())
 
 # Thresholds to consider molecules as identified in at least one sample:
 cutoff.Mz.Error = 18 # precursor mass tolerance, ppm
 cutoff.RT.Error = 0.3 # minutes
-cutoff.CCS.Error = 1 # percent
+cutoff.CCS.Error = 0.8 # percent
 cutoff.SignalToNoise = 2
 
 
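The cutoffs in the hunk above are relative errors (ppm for mass, percent for CCS). A sketch of how such errors are typically computed and compared against these thresholds, in Python for illustration only (not the repository's R code; the function names are hypothetical):

```python
# Illustrative sketch, NOT the repository's code: relative-error checks
# behind cutoff.Mz.Error (ppm) and cutoff.CCS.Error (percent).

CUTOFF_MZ_ERROR_PPM = 18    # precursor mass tolerance, ppm (from the script)
CUTOFF_CCS_ERROR_PCT = 0.8  # percent (the value updated in this commit)

def ppm_error(observed_mz, expected_mz):
    """Mass error in parts per million, relative to the expected m/z."""
    return (observed_mz - expected_mz) / expected_mz * 1e6

def pct_error(observed_ccs, expected_ccs):
    """CCS error as a percentage of the expected CCS."""
    return (observed_ccs - expected_ccs) / expected_ccs * 100

def passes_thresholds(observed_mz, expected_mz, observed_ccs, expected_ccs):
    """True when both absolute errors fall within their cutoffs."""
    return (abs(ppm_error(observed_mz, expected_mz)) <= CUTOFF_MZ_ERROR_PPM and
            abs(pct_error(observed_ccs, expected_ccs)) <= CUTOFF_CCS_ERROR_PCT)
```

Tightening cutoff.CCS.Error from 1% to 0.8%, as this commit does, rejects candidates whose CCS deviates between 0.8% and 1% from the library value.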

@@ -90,6 +92,7 @@ dat$methodCE = "20V"
 dat$methodCE[grepl("_40V", dat$Replicate)] = "40V"
 
 dat = merge(dat, ms2lib, by=c("PrecursorName", "ProductName", "methodCE"), all.x = TRUE)
+dat = dat[!(is.na(dat$LibraryIntensity) & dat$ProductName != "precursor"), ] # remove fragments that were not mapped to the library
 
 # Update intensity ranks:
 dat$PrecursorName.Replicate = paste(dat$PrecursorName, dat$Replicate, sep = '.')
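The added line implements the commit's stated change: after a left join against the fragment library, fragments with no library match (and therefore no intensity) are dropped, while precursor rows are always kept. A plain-Python sketch of the same logic, for illustration only (the library key and example values are hypothetical, not from the repository):

```python
# Illustrative sketch, NOT the repository's R code: left-join against a
# fragment library, then drop unmapped fragments but keep precursor rows.

library = {("malate", "frag_115", "20V"): 1200.0}  # hypothetical intensities

rows = [
    {"PrecursorName": "malate", "ProductName": "frag_115", "methodCE": "20V"},
    {"PrecursorName": "malate", "ProductName": "frag_999", "methodCE": "20V"},  # no library match
    {"PrecursorName": "malate", "ProductName": "precursor", "methodCE": "20V"},
]

# Left join: attach LibraryIntensity where a match exists, else None (NA).
for r in rows:
    key = (r["PrecursorName"], r["ProductName"], r["methodCE"])
    r["LibraryIntensity"] = library.get(key)

# Keep a row unless it is an unmapped fragment (NA intensity, not precursor).
kept = [r for r in rows
        if not (r["LibraryIntensity"] is None and r["ProductName"] != "precursor")]
```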
@@ -191,7 +194,7 @@ for(f in featureFiles)
 
 for(k in indexes)
 {
-  x = feats[which(abs(feats$Precursor.m.z - outTab$Precursor.Mz[k]) <= 0.02 &
+  x = feats[which(abs(feats$Precursor.m.z - outTab$Precursor.Mz[k]) <= 0.3 & # Use a larger tolerance because MS-DIAL does not process saturated peaks well
                   abs(feats$RT..min. - outTab$Retention.Time[k]) <= cutoff.RT.Error), ]
   if(nrow(x) > 0)
   {
@@ -202,6 +205,9 @@ for(f in featureFiles)
 }
 
 # Apply filtering thresholds:
+outTab$Mass.Error.PPM = round(outTab$Mass.Error.PPM, 2)
+outTab$CCS.Error = round(outTab$CCS.Error, 2)
+outTab$RetentionTime.Error = round(outTab$RetentionTime.Error, 2)
 outTab$ConfidenceDescription = "None"
 outTab$ConfidenceDescription[which(abs(outTab$Mass.Error.PPM) <= cutoff.Mz.Error &
                                    abs(outTab$CCS.Error) <= cutoff.CCS.Error &
@@ -210,10 +216,11 @@ outTab$ConfidenceDescription[which(abs(outTab$Mass.Error.PPM) <= cutoff.Mz.Error
 outTab$ConfidenceDescription[which(outTab$PeakDecoderScore >= cutoff.PeakDecoderScore &
                                    outTab$ConfidenceDescription == "RT-CCS")] = "RT-CCS-DIA"
 
-# Exclude "RT-CCS" which have fragments but low combined score:
-outTab$ConfidenceDescription[which((outTab$PeakDecoderScore < cutoff.PeakDecoderScore | is.na(outTab$PeakDecoderScore)) &
-                                   outTab$CountDetectedFragments > 0 &
-                                   outTab$ConfidenceDescription == "RT-CCS")] = "None"
+# Exclude "RT-CCS" replicates which have fragments but a PeakDecoderScore below the threshold (PDS < cutoff.PeakDecoderScore):
+indexesLowPDS = which((outTab$PeakDecoderScore < cutoff.PeakDecoderScore | is.na(outTab$PeakDecoderScore)) &
+                      outTab$CountDetectedFragments > 0 &
+                      outTab$ConfidenceDescription == "RT-CCS")
+outTab$ConfidenceDescription[indexesLowPDS] = "None"
 
 outUniqueBest = outTab[which(outTab$SignalToNoise > cutoff.SignalToNoise), ]
 outUniqueBest = outUniqueBest[with(outUniqueBest, order(PrecursorName, ConfidenceDescription, -PeakDecoderScore, abs(CCS.Error))), ]
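The two hunks above implement a three-tier confidence label: "RT-CCS" when the precursor metrics pass, upgraded to "RT-CCS-DIA" when the PeakDecoder score also passes, and demoted back to "None" when fragments were detected but the score stays below (or is missing at) the cutoff. A compact sketch of that decision, in Python for illustration only (the function and its arguments are hypothetical):

```python
# Illustrative sketch, NOT the repository's R code: the per-replicate
# confidence tier implied by the ConfidenceDescription logic above.

CUTOFF_SCORE = 0.96  # example 1% FDR cutoff from the script's comments

def confidence(passes_rt_ccs, peakdecoder_score, n_fragments):
    """Assign a confidence label for one precursor/replicate."""
    if not passes_rt_ccs:
        return "None"                       # precursor metrics failed
    if peakdecoder_score is not None and peakdecoder_score >= CUTOFF_SCORE:
        return "RT-CCS-DIA"                 # DIA evidence confirms RT-CCS
    if n_fragments > 0:
        return "None"                       # fragments seen, but score too low
    return "RT-CCS"                         # precursor-only evidence
```

Note the asymmetry: a precursor with no detected fragments keeps "RT-CCS", but one whose fragments yield a sub-cutoff score is rejected outright.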
@@ -249,6 +256,10 @@ write.csv(outTab[which(outTab$PrecursorName %in% unique(outUniqueBest$PrecursorN
 write.csv(outTab[which(outTab$PrecursorName %in% unique(outUniqueBest$PrecursorName)), ],
           file = paste("Detected-molecules-all-metrics-replicates_", sampleString, "_", Sys.Date(), ".csv", sep=''), row.names = FALSE)
 
+# Save the all-metrics table for ALL targets:
+write.csv(outTab,
+          file = paste("All-targets-all-metrics-replicates_", sampleString, "_", Sys.Date(), ".csv", sep=''), row.names = FALSE)
+
 
 # ---------------------------------------------------
 manualIDs = read.csv(file = paste("Detected-metabolites-manual_", sampleString, ".csv", sep=''), stringsAsFactors = FALSE)

ScoringTraining.R

Lines changed: 4 additions & 2 deletions
@@ -115,11 +115,13 @@ targets$countBadMetrics[indexes] = targets$countBadMetrics[indexes] + 1
 
 # RT difference to precursor, flag fragment if RT difference is larger than tolerance:
 #indexes = which(abs(targets$DIA.RTdiff) > targets$Precursor.Fwhm * 0.1)
-indexes = which(abs(targets$DIA.RTdiff) > 0.1)
+#indexes = which(abs(targets$DIA.RTdiff) > 0.1)
+indexes = which(abs(targets$DIA.RTdiff) > 0.025)
 targets$countBadMetrics[indexes] = targets$countBadMetrics[indexes] + 1
 
 # FWHM difference to precursor, flag fragment if FWHM is larger than the precursor FWHM:
-indexes = which(abs(targets$DIA.FWHMdiff) > targets$Precursor.Fwhm * 2)
+#indexes = which(abs(targets$DIA.FWHMdiff) > targets$Precursor.Fwhm * 2)
+indexes = which(abs(targets$DIA.FWHMdiff) > targets$Precursor.Fwhm / 2)
 targets$countBadMetrics[indexes] = targets$countBadMetrics[indexes] + 1
 
 hist(targets$countBadMetrics)
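This hunk tightens two of the fragment-quality flags used before training (Step-4): the RT-difference tolerance drops from 0.1 to 0.025 minutes, and a fragment's FWHM may now differ from the precursor's by at most half the precursor FWHM rather than twice it. A sketch of the flag-counting idea, in Python for illustration only (the function is hypothetical; the thresholds mirror this commit's values):

```python
# Illustrative sketch, NOT the repository's R code: each fragment accumulates
# one "bad metric" flag per tolerance it exceeds, mirroring countBadMetrics.

RT_DIFF_TOL_MIN = 0.025  # max RT difference to precursor, minutes (this commit)

def count_bad_metrics(rt_diff, fwhm_diff, precursor_fwhm):
    """Count tolerance violations for one fragment."""
    bad = 0
    if abs(rt_diff) > RT_DIFF_TOL_MIN:       # RT difference to precursor
        bad += 1
    if abs(fwhm_diff) > precursor_fwhm / 2:  # FWHM difference to precursor
        bad += 1
    return bad
```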

TargetDecoyGenerator.R

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ alignmentTable = "xMSDIAL/PeakID_0_20221211731.txt"
 
 #setwd(file.path(basePathData,"data/Rhodo"))
 #alignmentTable = "xMSDIAL/PeakID_0_2022126180.txt"
+alignmentTable = "xMSDIAL/PeakID_1_20221171618.txt"
 
 
 sampleString = basename(getwd())
