EASMS-data-processing/src/fingerprint_extraction.py at main · StructuralGenomicsConsortium/EASMS-data-processing · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# -*- coding: utf-8 -*-
"""
Created on Sun Mar  2 17:29:20 2025

@author: shagh
"""

import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from fingerprints import HitGenMACCS, HitGenECFP4, HitGenECFP6, HitGenFCFP4, HitGenFCFP6, HitGenRDK, HitGenAvalon, HitGenTopTor, HitGenAtomPair

def compute_molecular_properties(smiles):
    """
    Computes molecular properties (MW, ALOGP) for a given SMILES.

    Args:
        smiles (str): The input SMILES. Non-string values (e.g. ``None`` or
            the float NaN pandas uses for missing cells) are treated as an
            unparseable molecule instead of being passed to RDKit.

    Returns:
        tuple: ``(mw, alogp)`` where ``mw`` comes from ``Descriptors.MolWt``
        and ``alogp`` from ``Descriptors.MolLogP``. Both are ``np.nan`` when
        the input is not a string or RDKit cannot parse it.
    """
    # Guard first: Chem.MolFromSmiles raises on non-string arguments, which
    # would abort a whole batch because of a single missing cell.
    if not isinstance(smiles, str):
        return np.nan, np.nan

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals an invalid SMILES by returning None.
        return np.nan, np.nan

    return Descriptors.MolWt(mol), Descriptors.MolLogP(mol)

def generate_fingerprints(smiles, fps_dict, fp_format="array"):
    """
    Generates fingerprints for a given SMILES string.

    Each generator in ``fps_dict`` is called as
    ``generate_fps(smis=[smiles])`` and its result is flattened to one
    dimension. If generating or converting a fingerprint raises, the failure
    is logged as a warning and a NaN placeholder of length
    ``generator._dimension`` is stored instead, so a single bad molecule does
    not abort a whole batch.

    Args:
        smiles (str): The input SMILES.
        fps_dict (dict): Dictionary mapping fingerprint names to fingerprint
            generator objects (each exposing ``generate_fps`` and
            ``_dimension``).
        fp_format (str): Storage format for each fingerprint value.
            "array"  (default) -> numpy.ndarray (float32).
            "string"            -> comma-separated string of values (legacy format,
                                   what earlier pipeline versions emitted).
            Any value other than "string" is handled as "array".

    Returns:
        dict: Dictionary with fingerprint names as keys and fingerprint data as values.
    """
    # Imported locally so this function carries its own logging dependency.
    import logging

    as_string = fp_format == "string"
    fp_data = {}
    for fp_name, fp_class in fps_dict.items():
        try:
            fp_array = fp_class.generate_fps(smis=[smiles]).flatten()
            if as_string:
                fp_data[fp_name] = ','.join(map(str, fp_array))
            else:
                fp_data[fp_name] = fp_array.astype(np.float32)
        except Exception as exc:
            # Deliberately best-effort: keep the row, but record why it failed
            # so real bugs are distinguishable from unparseable molecules.
            logging.getLogger(__name__).warning(
                "Fingerprint %s failed for SMILES %r (%s: %s); filling with NaN",
                fp_name, smiles, type(exc).__name__, exc,
            )
            if as_string:
                fp_data[fp_name] = ','.join(['nan'] * fp_class._dimension)
            else:
                fp_data[fp_name] = np.full(fp_class._dimension, np.nan, dtype=np.float32)
    return fp_data

def extract_fingerprints(df, fp_format="array"):
    """
    Extracts molecular fingerprints and molecular properties for a given DataFrame.

    For every value in the "SMILES" column this computes MW and ALOGP plus
    one fingerprint per generator (ECFP4, ECFP6, FCFP4, FCFP6, MACCS, RDK,
    AVALON, TOPTOR, ATOMPAIR) and appends them as new columns. The input
    DataFrame is not modified; a new one is returned.

    Args:
        df (pd.DataFrame): Input DataFrame containing a "SMILES" column. Its
            index is preserved, so filtered or re-ordered frames are supported.
        fp_format (str): "array" (default) stores each fingerprint as a numpy
            float32 array; "string" stores it as a comma-separated string
            (legacy format from earlier pipeline versions).

    Returns:
        pd.DataFrame: The original columns followed by "MW", "ALOGP" and one
        column per fingerprint, row-aligned with the input.

    Raises:
        ValueError: If ``df`` has no "SMILES" column.
    """

    # Ensure the 'SMILES' column exists
    if "SMILES" not in df.columns:
        raise ValueError("Input DataFrame must contain a 'SMILES' column")

    # Define fingerprint classes (instantiated once and reused for every row)
    fingerprint_classes = {
        'ECFP4': HitGenECFP4(),
        'ECFP6': HitGenECFP6(),
        'FCFP4': HitGenFCFP4(),
        'FCFP6': HitGenFCFP6(),
        'MACCS': HitGenMACCS(),
        'RDK': HitGenRDK(),
        'AVALON': HitGenAvalon(),
        'TOPTOR': HitGenTopTor(),
        'ATOMPAIR': HitGenAtomPair(),
    }

    # Compute fingerprints and molecular properties, one record per input row
    fingerprint_rows = []
    property_rows = []
    for smiles in df["SMILES"]:
        fingerprint_rows.append(
            generate_fingerprints(smiles, fingerprint_classes, fp_format=fp_format)
        )
        mw, alogp = compute_molecular_properties(smiles)
        property_rows.append({"MW": mw, "ALOGP": alogp})

    # Build the new frames on the caller's index. pd.concat(axis=1) aligns on
    # index labels, so a default RangeIndex here would misalign rows (or add
    # all-NaN rows) whenever df's index is not 0..n-1. Fixing the column
    # lists also keeps the output schema stable for an empty input.
    molecular_props_df = pd.DataFrame(
        property_rows, index=df.index, columns=["MW", "ALOGP"]
    )
    fingerprint_df = pd.DataFrame(
        fingerprint_rows, index=df.index, columns=list(fingerprint_classes)
    )

    # Concatenate the original DataFrame with molecular properties and fingerprints
    return pd.concat([df, molecular_props_df, fingerprint_df], axis=1)