Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ jobs:
chmod +x scripts/setup-env.sh
./scripts/setup-env.sh

# Build and install the package
- name: Build and install package
run: |
source .venv/bin/activate
pip install -e .

- name: Build Documentation
run: |
source .venv/bin/activate
Expand Down
3 changes: 1 addition & 2 deletions bioneuralnet/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from .feature_selector import FeatureSelector
from .base_feature_selector import BaseFeatureSelector
from .static_visualization import StaticVisualizer
from .dynamic_visualization import DynamicVisualizer

__all__ = ['FeatureSelector', 'BaseFeatureSelector', 'StaticVisualizer', 'DynamicVisualizer']
__all__ = ['FeatureSelector', 'StaticVisualizer', 'DynamicVisualizer']
158 changes: 0 additions & 158 deletions bioneuralnet/analysis/base_feature_selector.py

This file was deleted.

118 changes: 97 additions & 21 deletions bioneuralnet/analysis/feature_selector.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
from typing import Optional
import os

import pandas as pd
from .base_feature_selector import BaseFeatureSelector
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

from ..utils.logger import get_logger


class FeatureSelector(BaseFeatureSelector):
class FeatureSelector:
"""
FeatureSelector Class for Selecting Relevant Multi-Omics Features.

Inherits from BaseFeatureSelector and specializes in selecting the most relevant
multi-omics features based on embeddings generated by GNNs.
This class provides methods for feature selection using statistical and machine learning-based approaches.
It allows selection based on correlation, LASSO regression, or Random Forest feature importances.
"""

def __init__(
Expand All @@ -17,7 +22,6 @@ def __init__(
phenotype_data: pd.Series,
num_features: int = 10,
selection_method: str = 'correlation',
output_dir: Optional[str] = None,
):
"""
Initializes the FeatureSelector instance.
Expand All @@ -27,30 +31,102 @@ def __init__(
phenotype_data (pd.Series): Phenotype data corresponding to the samples.
num_features (int, optional): Number of top features to select. Defaults to 10.
selection_method (str, optional): Feature selection method ('correlation', 'lasso', 'random_forest'). Defaults to 'correlation'.
output_dir (str, optional): Directory to save selected features. If None, creates a unique directory.
#output_dir (str, optional): Directory to save selected features. If None, creates a unique directory.
"""
super().__init__(
num_features=num_features,
selection_method=selection_method,
output_dir=output_dir
)
self.logger = get_logger(__name__)
self.logger.info("Initialized FeatureSelector.")
self.num_features = num_features
self.selection_method = selection_method
self.enhanced_omics_data = enhanced_omics_data
self.phenotype_data = phenotype_data
self.logger.info("Initialized FeatureSelector.")

def run_feature_selection(self) -> pd.DataFrame:

def perform_feature_selection(self) -> pd.DataFrame:
"""
Performs feature selection on the enhanced omics data based on the selected method.

Returns:
pd.DataFrame: DataFrame containing the selected features.

Raises:
ValueError: If an unsupported feature selection method is specified.
"""
self.logger.info(f"Performing feature selection using method: {self.selection_method}")

if self.selection_method == 'correlation':
selected_features = self._correlation_based_selection()
elif self.selection_method == 'lasso':
selected_features = self._lasso_based_selection()
elif self.selection_method == 'random_forest':
selected_features = self._random_forest_based_selection()
else:
self.logger.error(f"Unsupported feature selection method: {self.selection_method}")
raise ValueError(f"Unsupported feature selection method: {self.selection_method}")

return selected_features

def _correlation_based_selection(self) -> pd.DataFrame:
    """
    Selects the top features by ANOVA F-score against the phenotype.

    Features are ranked with ``SelectKBest`` using ``f_classif`` as the
    scoring function. An integer ``num_features`` larger than the number of
    available columns is capped at the column count, so over-asking returns
    every column instead of erroring (or warning) inside ``SelectKBest``.
    The LASSO and Random Forest selectors already behave this way because
    they take ``head(num_features)`` of a ranked Series.

    Returns:
        pd.DataFrame: The columns of ``enhanced_omics_data`` that were
        selected, in their original column order.
    """
    self.logger.info("Performing correlation-based feature selection using ANOVA (f_classif).")
    features = self.enhanced_omics_data

    # Cap only plain integers; any other value (e.g. the string 'all' that
    # SelectKBest accepts) is passed through untouched.
    k = self.num_features
    n_columns = features.shape[1]
    if isinstance(k, int) and k > n_columns:
        k = n_columns

    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(features, self.phenotype_data)

    # get_support() yields a boolean mask aligned with the input columns.
    selected_features = features.columns[selector.get_support()]
    self.logger.info(f"Selected {len(selected_features)} features based on correlation.")
    return features[selected_features]

def _lasso_based_selection(self) -> pd.DataFrame:
"""
Selects top features based on LASSO regression coefficients.

Returns:
pd.DataFrame: DataFrame containing the selected features.
"""
self.logger.info("Performing LASSO-based feature selection.")
# Ensure cv does not exceed number of samples
n_samples = len(self.enhanced_omics_data)
cv_folds = 5
if n_samples < cv_folds:
cv_folds = n_samples
self.logger.warning(f"Reducing cv from 5 to {cv_folds} due to insufficient samples.")
if cv_folds < 2:
raise ValueError(f"Number of splits {cv_folds} must be at least 2 for cross-validation.")

lasso = LassoCV(cv=cv_folds, random_state=0).fit(self.enhanced_omics_data, self.phenotype_data)
coef = pd.Series(lasso.coef_, index=self.enhanced_omics_data.columns)
selected_features = coef.abs().sort_values(ascending=False).head(self.num_features).index
self.logger.info(f"Selected {len(selected_features)} features based on LASSO coefficients.")
return self.enhanced_omics_data[selected_features]

def _random_forest_based_selection(self) -> pd.DataFrame:
    """
    Selects the top features by Random Forest feature importance.

    A ``RandomForestClassifier`` (100 trees, ``random_state=0``) is fitted
    on the enhanced omics data against the phenotype; columns are ranked by
    the fitted ``feature_importances_`` and the first ``num_features`` of
    that ranking are kept.

    Returns:
        pd.DataFrame: DataFrame containing the selected features.
    """
    self.logger.info("Performing Random Forest-based feature selection.")
    features = self.enhanced_omics_data

    forest = RandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(features, self.phenotype_data)

    # Label each importance with its column name, then rank descending.
    ranked = pd.Series(forest.feature_importances_, index=features.columns).sort_values(ascending=False)
    top_columns = ranked.head(self.num_features).index
    self.logger.info(f"Selected {len(top_columns)} features based on Random Forest importances.")
    return features[top_columns]


Steps:
1. Perform feature selection using the specified method.
2. Save the selected features.
def run_feature_selection(self) -> pd.DataFrame:
"""
Executes the feature selection process and saves the results.

Returns:
pd.DataFrame: DataFrame containing the selected multi-omics features.
pd.DataFrame: DataFrame containing the selected features.
"""
self.logger.info("Starting feature selection on enhanced omics data.")
selected_features = self.perform_feature_selection(self.enhanced_omics_data, self.phenotype_data)
self.save_selected_features(selected_features, filename="selected_genetic_features.csv")
selected_features = self.perform_feature_selection()
self.logger.info("Feature selection on enhanced omics data completed successfully.")
return selected_features
Loading
Loading