EASMS-data-processing/src/add_scores.py at main · StructuralGenomicsConsortium/EASMS-data-processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import pandas as pd
import numpy as np
import scipy.stats as stats

# Replicate intensity columns every input CSV must provide.
_REPLICATE_COLUMNS = ["POS_INT_REP1", "POS_INT_REP2", "POS_INT_REP3"]

# Columns derived from *other* files, in the order they are added to each frame.
_COMPARISON_COLUMNS = (
    "SELECTIVE_VALUE",
    "NTC_VALUE",
    "ENRICHMENT",
    "SELECTIVE_ENRICHMENT",
    "EASMS_ENRICHMENT",
    "MEAN_NONTARGET_VALUES",
    "PVALUE",
)


def _numeric_replicates(df):
    """Return the replicate columns of *df* as floats; unparseable cells become NaN."""
    return pd.DataFrame(
        {col: pd.to_numeric(df[col], errors="coerce") for col in _REPLICATE_COLUMNS}
    ).astype(float)


def _welch_p_value(target_values, other_values):
    """Return the Welch t-test p-value between two NaN-free 1-D float arrays.

    Returns NaN when there is no target replicate or fewer than three
    comparison values, and 1.0 when either sample is (numerically) constant.
    """
    if len(target_values) == 0 or len(other_values) < 3:
        return np.nan
    if np.std(target_values) < 1e-8 or np.std(other_values) < 1e-8:
        return 1.0
    result = stats.ttest_ind(target_values, other_values, equal_var=False)
    return float(result.pvalue)


def compute_and_add_scores(file_paths, output_dir=None):
    """
    Add score columns to each CSV in *file_paths* and save the results.

    Every file must contain COMPOUND_ID, POS_INT_REP1, POS_INT_REP2 and
    POS_INT_REP3. Replicate cells that cannot be parsed as numbers are treated
    as missing. For each file the following columns are written:

    - TARGET_VALUE: mean of the file's own replicates (missing values skipped).
    - SELECTIVE_VALUE / NTC_VALUE: max / min TARGET_VALUE of the same compound
      across all *other* files.
    - ENRICHMENT: TARGET_VALUE / NTC_VALUE.
    - SELECTIVE_ENRICHMENT: TARGET_VALUE / SELECTIVE_VALUE.
    - MEAN_NONTARGET_VALUES: mean TARGET_VALUE of the same compound across all
      other files.
    - EASMS_ENRICHMENT: TARGET_VALUE / MEAN_NONTARGET_VALUES (empty when that
      mean is zero or missing).
    - PVALUE: Welch t-test of the row's replicates against all replicates of
      the same compound in the other files.

    When only one file is given there is nothing to compare against, so only
    TARGET_VALUE is computed and the comparison columns are left empty.

    If output_dir is provided, each scored file is saved there under its
    original basename; otherwise the original file is overwritten in place.

    Args:
        file_paths: Paths of the CSV files to score.
        output_dir: Optional directory for the scored copies.

    Returns:
        list[str]: Paths of the saved (scored) files; empty if no files given.

    Raises:
        ValueError: If a file lacks one of the required columns.
    """
    if not file_paths:
        print("No files provided for score computation.")
        return []

    # Load all CSV files
    dataframes = {path: pd.read_csv(path) for path in file_paths}

    # Ensure necessary columns exist
    required_columns = {"COMPOUND_ID", *_REPLICATE_COLUMNS}
    for filename, df in dataframes.items():
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Missing required columns in {filename}: {required_columns - set(df.columns)}")

    # Step 1: TARGET_VALUE per file, plus a compact numeric view
    # (COMPOUND_ID, coerced replicates, TARGET_VALUE) used for comparisons.
    # Coercion happens BEFORE averaging so text cells cannot break the mean.
    numeric_views = {}
    for path, df in dataframes.items():
        view = _numeric_replicates(df)
        df["TARGET_VALUE"] = view.mean(axis=1, skipna=True)
        view.insert(0, "COMPOUND_ID", df["COMPOUND_ID"])
        view["TARGET_VALUE"] = df["TARGET_VALUE"]
        numeric_views[path] = view

    # Step 2: Process each file individually against all the others
    for current_file, df in dataframes.items():
        print(f"\n Processing: {os.path.basename(current_file)}")

        others = [view for path, view in numeric_views.items() if path != current_file]
        if not others:
            # pd.concat([]) would raise; with no comparison data the scores
            # are simply undefined.
            print(" No other files to compare against; comparison scores left empty.")
            for column in _COMPARISON_COLUMNS:
                df[column] = np.nan
            continue

        merged_other = pd.concat(others, ignore_index=True)
        by_compound = merged_other.groupby("COMPOUND_ID")
        other_targets = by_compound["TARGET_VALUE"]
        compound_ids = df["COMPOUND_ID"]

        # SELECTIVE_VALUE & NTC_VALUE
        df["SELECTIVE_VALUE"] = compound_ids.map(other_targets.max())
        df["NTC_VALUE"] = compound_ids.map(other_targets.min())

        # ENRICHMENT calculations
        df["ENRICHMENT"] = df["TARGET_VALUE"] / df["NTC_VALUE"]
        df["SELECTIVE_ENRICHMENT"] = df["TARGET_VALUE"] / df["SELECTIVE_VALUE"]

        # EASMS_ENRICHMENT and MEAN_NONTARGET_VALUES: the per-compound mean of
        # the other files' row means. A zero mean keeps its value in
        # MEAN_NONTARGET_VALUES but yields an empty enrichment (no inf).
        mean_other = compound_ids.map(other_targets.mean())
        df["EASMS_ENRICHMENT"] = (df["TARGET_VALUE"] / mean_other).where(mean_other != 0)
        df["MEAN_NONTARGET_VALUES"] = mean_other

        # PVALUE: pool every non-missing replicate of each compound once,
        # instead of re-filtering merged_other for every row.
        pooled_other = {}
        for cid, group in by_compound:
            values = group[_REPLICATE_COLUMNS].to_numpy().ravel()
            pooled_other[cid] = values[~np.isnan(values)]

        no_values = np.empty(0, dtype=float)
        target_matrix = numeric_views[current_file][_REPLICATE_COLUMNS].to_numpy()
        p_values = []
        for cid, replicates in zip(compound_ids, target_matrix):
            if pd.isna(cid):
                p_values.append(np.nan)
                continue
            try:
                p_value = _welch_p_value(
                    replicates[~np.isnan(replicates)],
                    pooled_other.get(cid, no_values),
                )
            except (ValueError, FloatingPointError) as e:
                # Best effort: report the failure and leave this p-value empty.
                print(f" Error calculating p-value for {cid}: {e}")
                p_value = np.nan
            p_values.append(p_value)
        df["PVALUE"] = pd.Series(p_values, index=df.index, dtype=float)

    # Step 3: Save the updated files (either in place or into output_dir)
    # NOTE: with output_dir, inputs sharing a basename overwrite each other.
    saved_paths = []
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for file_path, df in dataframes.items():
        if output_dir:
            out_path = os.path.join(output_dir, os.path.basename(file_path))
        else:
            out_path = file_path
        df.to_csv(out_path, index=False)
        saved_paths.append(out_path)
        print(f"Updated and saved: {out_path}")

    print("\nAll files have been processed and saved with computed scores.")
    return saved_paths