-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfingerprint_extraction.py
More file actions
105 lines (89 loc) · 3.55 KB
/
fingerprint_extraction.py
File metadata and controls
105 lines (89 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 2 17:29:20 2025
@author: shagh
"""
import logging

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from fingerprints import HitGenMACCS, HitGenECFP4, HitGenECFP6, HitGenFCFP4, HitGenFCFP6, HitGenRDK, HitGenAvalon, HitGenTopTor, HitGenAtomPair
def compute_molecular_properties(smiles):
    """
    Compute molecular properties (MW, ALOGP) for a given SMILES.

    Args:
        smiles: SMILES string of the molecule. Non-string values (for
            example ``None`` or the float NaN that marks a missing cell in
            a DataFrame column) are treated as unparseable.

    Returns:
        tuple: ``(mw, alogp)`` as computed by ``Descriptors.MolWt`` and
        ``Descriptors.MolLogP``, or ``(nan, nan)`` when the input is not a
        string or RDKit cannot build a molecule from it.
    """
    # RDKit's SMILES parser expects a string; handing it a missing value
    # raises instead of returning None, so screen those out up front.
    if not isinstance(smiles, str):
        return np.nan, np.nan

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        # Parsing failed: report both properties as missing.
        return np.nan, np.nan

    return Descriptors.MolWt(mol), Descriptors.MolLogP(mol)
def generate_fingerprints(smiles, fps_dict, fp_format="array"):
    """
    Generate every configured fingerprint for a single SMILES string.

    Generation is best-effort: if a fingerprint generator raises, the
    failure is logged as a warning and that fingerprint is replaced by a
    NaN placeholder, so one bad molecule does not abort a whole batch.

    Args:
        smiles (str): The input SMILES.
        fps_dict (dict): Maps fingerprint name -> generator object. Each
            generator must provide ``generate_fps(smis=[...])`` returning an
            array, and a ``_dimension`` attribute used to size the NaN
            placeholder on failure.
        fp_format (str): Storage format for each fingerprint value.
            "array" (default) -> numpy.ndarray (float32).
            "string" -> comma-separated string of values (legacy format,
            what earlier pipeline versions emitted).
            Any value other than "string" is handled as "array".

    Returns:
        dict: Fingerprint names as keys and fingerprint data as values.
    """
    as_string = fp_format == "string"
    fp_data = {}
    for fp_name, fp_class in fps_dict.items():
        try:
            fp_array = fp_class.generate_fps(smis=[smiles]).flatten()
            if as_string:
                fp_data[fp_name] = ','.join(map(str, fp_array))
            else:
                fp_data[fp_name] = fp_array.astype(np.float32)
        except Exception as exc:
            # Keep going, but leave a trace: without this, failures are
            # indistinguishable from legitimately missing data downstream.
            logging.getLogger(__name__).warning(
                "Fingerprint %s failed for SMILES %r: %s", fp_name, smiles, exc
            )
            if as_string:
                fp_data[fp_name] = ','.join(['nan'] * fp_class._dimension)
            else:
                fp_data[fp_name] = np.full(fp_class._dimension, np.nan, dtype=np.float32)
    return fp_data
def extract_fingerprints(df, fp_format="array"):
    """
    Extract molecular fingerprints and molecular properties for a DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing a "SMILES" column.
        fp_format (str): "array" (default) stores each fingerprint as a numpy
            float32 array; "string" stores it as a comma-separated string
            (legacy format from earlier pipeline versions).

    Returns:
        pd.DataFrame: A new DataFrame holding the original columns followed
        by "MW", "ALOGP" and one column per fingerprint ("ECFP4", "ECFP6",
        "FCFP4", "FCFP6", "MACCS", "RDK", "AVALON", "TOPTOR", "ATOMPAIR").
        Rows keep the index of ``df``.

    Raises:
        ValueError: If ``df`` has no "SMILES" column.
    """
    # Ensure the 'SMILES' column exists
    if "SMILES" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'SMILES' column")

    # Define fingerprint classes
    fingerprint_classes = {
        'ECFP4': HitGenECFP4(),
        'ECFP6': HitGenECFP6(),
        'FCFP4': HitGenFCFP4(),
        'FCFP6': HitGenFCFP6(),
        'MACCS': HitGenMACCS(),
        'RDK': HitGenRDK(),
        'AVALON': HitGenAvalon(),
        'TOPTOR': HitGenTopTor(),
        'ATOMPAIR': HitGenAtomPair(),
    }

    # Compute fingerprints and molecular properties, one row per SMILES
    fingerprint_data = []
    molecular_props = []
    for smiles in df["SMILES"]:
        fingerprint_data.append(
            generate_fingerprints(smiles, fingerprint_classes, fp_format=fp_format)
        )
        mw, alogp = compute_molecular_properties(smiles)
        molecular_props.append({"MW": mw, "ALOGP": alogp})

    # Build the new frames on df's own index: concat(axis=1) aligns rows by
    # index label, so a default 0..n-1 index would misalign (and pad with
    # NaN) whenever df was filtered or re-indexed by the caller. The explicit
    # column lists keep the output schema stable even for an empty input.
    molecular_props_df = pd.DataFrame(
        molecular_props, index=df.index, columns=["MW", "ALOGP"]
    )
    fingerprint_df = pd.DataFrame(
        fingerprint_data, index=df.index, columns=list(fingerprint_classes)
    )

    # Concatenate the original DataFrame with molecular properties and fingerprints
    return pd.concat([df, molecular_props_df, fingerprint_df], axis=1)