forked from PremSinghBist/GMProt
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstaphylococcus_aureus_db_builder.py
More file actions
116 lines (88 loc) · 3.82 KB
/
staphylococcus_aureus_db_builder.py
File metadata and controls
116 lines (88 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pandas as pd
from scipy.stats import gmean
NON_STANDARD_AA = set("BJOUX") # Non-standard amino acids
def clean_s_aureus(filtered_csv="data/s_aureus_original.csv",
cleaned_csv="data/s_aureus_cleaned.csv"):
"""
Clean and deduplicate S. aureus dataset.
Operations:
1. Remove sequences containing non-standard amino acids (B,J,O,U,X)
2. Remove negative MIC values
3. Deduplicate sequences by computing geometric mean of MIC ('value') for duplicates
4. Save cleaned dataset
Parameters
----------
filtered_csv : str
Path to the filtered S. aureus CSV file (input)
cleaned_csv : str
Path to save the final cleaned dataset (output)
"""
# Load filtered S. aureus dataset
df = pd.read_csv(filtered_csv)
print(f"Original filtered S. aureus records: {len(df)}")
# Remove sequences with non-standard amino acids
def has_non_standard(seq):
return any(aa in NON_STANDARD_AA for aa in seq.upper())
mask_non_standard = df['sequence'].apply(has_non_standard)
df = df[~mask_non_standard].copy()
print(f"Removed {mask_non_standard.sum()} sequences with non-standard amino acids")
print(f"Remaining records: {len(df)}")
# Remove negative MIC values
mask_negative = df['value'] <= 0
df = df[~mask_negative].copy()
print(f"Removed {mask_negative.sum()} records with negative MIC values")
print(f"Remaining records after negative MIC removal: {len(df)}")
# Deduplicate sequences (geometric mean of MIC)
df['sequence_norm'] = df['sequence'].str.strip().str.upper()
grouped = df.groupby('sequence_norm', as_index=False)
# Compute geometric mean of 'value' for duplicates
df_dedup = grouped.apply(lambda x: x.assign(value=gmean(x['value']))).reset_index(drop=True)
# Keep first occurrence per sequence and drop helper column
df_final = df_dedup.drop_duplicates(subset=['sequence_norm']).drop(columns=['sequence_norm'])
print(f"Final deduplicated dataset length: {len(df_final)}")
# Save cleaned dataset
df_final.to_csv(cleaned_csv, index=False)
print(f"Cleaned S. aureus dataset saved to: {cleaned_csv}")
return df_final
def extract_s_aureus(grampa_csv="data/grampa.csv", save_csv="data/s_aureus_original.csv"):
"""
Extract all rows with bacterium = 'S. aureus' from the GRAMPA dataset
and save to CSV.
Parameters
----------
grampa_csv : str
Path to the input GRAMPA CSV file.
save_csv : str
Path to save the filtered S. aureus dataset.
"""
# Load the dataset
df = pd.read_csv(grampa_csv)
print(f"Original GRAMPA dataset length: {len(df)}")
# Clean up bacterium column
df['bacterium'] = df['bacterium'].str.strip()
# Filter for S. aureus
df_s_aureus = df[df['bacterium'] == 'S. aureus']
print(f"Filtered S. aureus dataset length: {len(df_s_aureus)}")
# Save to CSV
df_s_aureus.to_csv(save_csv, index=False)
print(f"S. aureus dataset saved to: {save_csv}")
return df_s_aureus
def print_records_by_database(cleaned_csv="data/s_aureus_cleaned.csv"):
"""
Print number of S. aureus records per source database.
Parameters
----------
cleaned_csv : str
Path to cleaned S. aureus CSV file
"""
df = pd.read_csv(cleaned_csv)
if "database" not in df.columns:
raise ValueError("Column 'database' not found in the dataset.")
counts = df["database"].value_counts()
print("\nS. aureus records by database:")
for db, count in counts.items():
print(f"{db}: {count}")
if __name__ == "__main__":
# extract_s_aureus(grampa_csv="data/grampa.csv", save_csv="data/s_aureus_original.csv")
# clean_s_aureus(filtered_csv="data/s_aureus_original.csv", cleaned_csv="data/s_aureus_cleaned.csv")
print_records_by_database(cleaned_csv="data/s_aureus_cleaned.csv")