-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathadd_scores.py
More file actions
128 lines (105 loc) · 5.22 KB
/
add_scores.py
File metadata and controls
128 lines (105 loc) · 5.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import pandas as pd
import numpy as np
import scipy.stats as stats
def _numeric_replicates(frame, replicate_cols):
    """Return the replicate columns of ``frame`` with every cell coerced to a number.

    Cells that cannot be parsed as numbers become NaN.
    """
    return frame[replicate_cols].apply(pd.to_numeric, errors="coerce")


def _welch_p_value(sample, reference):
    """Return the Welch t-test p-value of ``sample`` against ``reference``.

    Both arguments are 1-D float arrays that contain no NaN.

    Returns NaN when ``sample`` is empty or ``reference`` has fewer than three
    values, and 1.0 when either side has (numerically) zero spread, because
    the t statistic is not meaningful in that case.
    """
    if len(sample) == 0 or len(reference) < 3:
        return np.nan
    if np.std(sample) < 1e-8 or np.std(reference) < 1e-8:
        return 1.0
    _, p_value = stats.ttest_ind(sample, reference, equal_var=False)
    return p_value


def compute_and_add_scores(file_paths, output_dir=None):
    """Add enrichment scores and p-values to a set of CSV files.

    Every file must contain the columns COMPOUND_ID, POS_INT_REP1,
    POS_INT_REP2 and POS_INT_REP3. Each file is scored against the rows of
    all *other* files that share the same COMPOUND_ID. The following columns
    are added (or overwritten):

    * TARGET_VALUE: mean of the three replicates of the row (non-numeric
      cells are ignored).
    * SELECTIVE_VALUE / NTC_VALUE: max / min TARGET_VALUE of the compound in
      the other files.
    * ENRICHMENT: TARGET_VALUE / NTC_VALUE.
    * SELECTIVE_ENRICHMENT: TARGET_VALUE / SELECTIVE_VALUE.
    * EASMS_ENRICHMENT: TARGET_VALUE / MEAN_NONTARGET_VALUES, left empty when
      that mean is missing or zero.
    * MEAN_NONTARGET_VALUES: mean TARGET_VALUE of the compound in the other
      files.
    * PVALUE: Welch t-test of the row's replicates against all replicate
      values of the compound in the other files.

    When only one file is given there is nothing to compare against, so all
    comparison columns are written as empty values.

    Args:
        file_paths: Paths of the CSV files to score.
        output_dir: If given, each scored file is written into this directory
            (created if needed) under its original basename. Otherwise each
            file is overwritten in place.

    Returns:
        list[str]: Paths of the saved (scored) files; an empty list when
        ``file_paths`` is empty.

    Raises:
        ValueError: If a file lacks one of the required columns.
    """
    if not file_paths:
        print("No files provided for score computation.")
        return []

    replicate_cols = ["POS_INT_REP1", "POS_INT_REP2", "POS_INT_REP3"]
    required_columns = {"COMPOUND_ID", *replicate_cols}
    comparison_cols = [
        "SELECTIVE_VALUE",
        "NTC_VALUE",
        "ENRICHMENT",
        "SELECTIVE_ENRICHMENT",
        "EASMS_ENRICHMENT",
        "MEAN_NONTARGET_VALUES",
        "PVALUE",
    ]

    # Load all CSV files
    dataframes = {f: pd.read_csv(f) for f in file_paths}

    # Ensure necessary columns exist
    for filename, df in dataframes.items():
        if not required_columns.issubset(df.columns):
            raise ValueError(f"Missing required columns in {filename}: {required_columns - set(df.columns)}")

    # Compute TARGET_VALUE for each df. Coerce *before* averaging so that a
    # stray non-numeric cell is skipped instead of breaking the row mean.
    for df in dataframes.values():
        df["TARGET_VALUE"] = _numeric_replicates(df, replicate_cols).mean(axis=1, skipna=True)

    # Step 2: Process each file individually
    needed_cols = ["COMPOUND_ID", *replicate_cols, "TARGET_VALUE"]
    for current_file, df in dataframes.items():
        print(f"\n Processing: {os.path.basename(current_file)}")

        # Exclude current df; only the columns used for comparison are kept.
        other_dfs = [d[needed_cols] for f, d in dataframes.items() if f != current_file]
        if not other_dfs:
            # pd.concat refuses an empty list; with no other file every
            # comparison value is simply missing.
            for col in comparison_cols:
                df[col] = np.nan
            continue
        merged_other = pd.concat(other_dfs, ignore_index=True)

        # Per-compound aggregates over the other files, computed once.
        other_target = merged_other.groupby("COMPOUND_ID", sort=False)["TARGET_VALUE"]

        # SELECTIVE_VALUE & NTC_VALUE
        df["SELECTIVE_VALUE"] = df["COMPOUND_ID"].map(other_target.max())
        df["NTC_VALUE"] = df["COMPOUND_ID"].map(other_target.min())

        # ENRICHMENT calculations (a zero divisor is not masked here and
        # yields inf/NaN from the float division).
        df["ENRICHMENT"] = df["TARGET_VALUE"] / df["NTC_VALUE"]
        df["SELECTIVE_ENRICHMENT"] = df["TARGET_VALUE"] / df["SELECTIVE_VALUE"]

        # EASMS_ENRICHMENT and MEAN_NONTARGET_VALUES
        other_mean = df["COMPOUND_ID"].map(other_target.mean())
        usable_mean = other_mean.notna() & (other_mean != 0)
        df["EASMS_ENRICHMENT"] = (df["TARGET_VALUE"] / other_mean).where(usable_mean)
        df["MEAN_NONTARGET_VALUES"] = other_mean

        # PVALUE computation: gather, per compound, every numeric replicate
        # value found in the other files.
        other_numeric = _numeric_replicates(merged_other, replicate_cols)
        other_values = {}
        for cid, rows in other_numeric.groupby(merged_other["COMPOUND_ID"], sort=False):
            flat = rows.to_numpy(dtype=float).ravel()
            other_values[cid] = flat[~np.isnan(flat)]

        no_values = np.array([], dtype=float)
        own_numeric = _numeric_replicates(df, replicate_cols).to_numpy(dtype=float)
        p_values = []
        for cid, replicates in zip(df["COMPOUND_ID"], own_numeric):
            if pd.isna(cid):
                p_values.append(np.nan)
                continue
            sample = replicates[~np.isnan(replicates)]
            reference = other_values.get(cid, no_values)
            try:
                p_values.append(_welch_p_value(sample, reference))
            except Exception as e:
                # Best effort: one failing compound must not abort the file.
                print(f" Error calculating p-value for {cid}: {e}")
                p_values.append(np.nan)
        df["PVALUE"] = p_values

    # Step 3: Save the updated files (either in place or into output_dir)
    saved_paths = []
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    for file_path, df in dataframes.items():
        if output_dir:
            out_path = os.path.join(output_dir, os.path.basename(file_path))
        else:
            out_path = file_path
        df.to_csv(out_path, index=False)
        saved_paths.append(out_path)
        print(f"Updated and saved: {out_path}")

    print("\nAll files have been processed and saved with computed scores.")
    return saved_paths