UCD-BDLab · ramosv · Dec 20, 2024 · Dec 19, 2024 · Dec 19, 2024
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -36,6 +36,12 @@ jobs:
           chmod +x scripts/setup-env.sh
           ./scripts/setup-env.sh
 
+      # Build and install the package
+      - name: Build and install package
+        run: |
+          source .venv/bin/activate
+          pip install -e .
+
       - name: Build Documentation
         run: |
           source .venv/bin/activate

diff --git a/bioneuralnet/analysis/__init__.py b/bioneuralnet/analysis/__init__.py
@@ -1,6 +1,5 @@
 from .feature_selector import FeatureSelector
-from .base_feature_selector import BaseFeatureSelector
 from .static_visualization import StaticVisualizer
 from .dynamic_visualization import DynamicVisualizer
 
-__all__ = ['FeatureSelector', 'BaseFeatureSelector', 'StaticVisualizer', 'DynamicVisualizer']
+__all__ = ['FeatureSelector', 'StaticVisualizer', 'DynamicVisualizer']
diff --git a/bioneuralnet/analysis/base_feature_selector.py b/bioneuralnet/analysis/base_feature_selector.py
diff --git a/bioneuralnet/analysis/feature_selector.py b/bioneuralnet/analysis/feature_selector.py
@@ -1,14 +1,19 @@
-from typing import Optional
+import os
+
 import pandas as pd
-from .base_feature_selector import BaseFeatureSelector
+from sklearn.linear_model import LassoCV
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.feature_selection import SelectKBest, f_classif
+
+from ..utils.logger import get_logger
 
 
-class FeatureSelector(BaseFeatureSelector):
+class FeatureSelector:
     """
     FeatureSelector Class for Selecting Relevant Multi-Omics Features.
 
-    Inherits from BaseFeatureSelector and specializes in selecting the most relevant
-    multi-omics features based on embeddings generated by GNNs.
+    This class provides methods for feature selection using statistical and machine learning-based approaches.
+    It allows selection based on correlation, LASSO regression, or Random Forest feature importances.
     """
 
     def __init__(
@@ -17,7 +22,6 @@ def __init__(
         phenotype_data: pd.Series,
         num_features: int = 10,
         selection_method: str = 'correlation',
-        output_dir: Optional[str] = None,
     ):
         """
         Initializes the FeatureSelector instance.
@@ -27,30 +31,102 @@ def __init__(
             phenotype_data (pd.Series): Phenotype data corresponding to the samples.
             num_features (int, optional): Number of top features to select. Defaults to 10.
             selection_method (str, optional): Feature selection method ('correlation', 'lasso', 'random_forest'). Defaults to 'correlation'.
-            output_dir (str, optional): Directory to save selected features. If None, creates a unique directory.
+            #output_dir (str, optional): Directory to save selected features. If None, creates a unique directory.
         """
-        super().__init__(
-            num_features=num_features,
-            selection_method=selection_method,
-            output_dir=output_dir
-        )
+        self.logger = get_logger(__name__)
+        self.logger.info("Initialized FeatureSelector.")
+        self.num_features = num_features
+        self.selection_method = selection_method
         self.enhanced_omics_data = enhanced_omics_data
         self.phenotype_data = phenotype_data
-        self.logger.info("Initialized FeatureSelector.")
 
-    def run_feature_selection(self) -> pd.DataFrame:
+
+    def perform_feature_selection(self) -> pd.DataFrame:
+        """
+        Runs the selection routine that matches ``self.selection_method``.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the selected features.
+
+        Raises:
+            ValueError: If ``self.selection_method`` is not 'correlation', 'lasso' or 'random_forest'.
+        """
+        method = self.selection_method
+        self.logger.info(f"Performing feature selection using method: {self.selection_method}")
+
+        # Each supported method returns directly; falling through means it is unsupported.
+        if method == 'correlation':
+            return self._correlation_based_selection()
+        if method == 'lasso':
+            return self._lasso_based_selection()
+        if method == 'random_forest':
+            return self._random_forest_based_selection()
+
+        self.logger.error(f"Unsupported feature selection method: {self.selection_method}")
+        raise ValueError(f"Unsupported feature selection method: {self.selection_method}")
+
+    def _correlation_based_selection(self) -> pd.DataFrame:
+        """
+        Selects the top features by ANOVA F-score (f_classif) against the phenotype.
+
+        Returns:
+            pd.DataFrame: DataFrame with at most ``num_features`` selected columns.
+        """
+        self.logger.info("Performing correlation-based feature selection using ANOVA (f_classif).")
+        # Cap k at the column count: SelectKBest rejects larger k (ValueError on older scikit-learn, a warning on newer).
+        selector = SelectKBest(score_func=f_classif, k=min(self.num_features, self.enhanced_omics_data.shape[1]))
+        selector.fit(self.enhanced_omics_data, self.phenotype_data)
+        selected_features = self.enhanced_omics_data.columns[selector.get_support()]
+        self.logger.info(f"Selected {len(selected_features)} features based on correlation.")
+        return self.enhanced_omics_data[selected_features]
+
+    def _lasso_based_selection(self) -> pd.DataFrame:
+        """
+        Ranks features by absolute LassoCV coefficient and keeps the ``num_features`` largest.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the selected features.
+
+        Raises:
+            ValueError: If fewer than 2 cross-validation folds are possible.
+        """
+        self.logger.info("Performing LASSO-based feature selection.")
+        omics = self.enhanced_omics_data
+        cv_folds = min(5, len(omics))  # never ask for more folds than there are samples
+        if cv_folds < 5:
+            self.logger.warning(f"Reducing cv from 5 to {cv_folds} due to insufficient samples.")
+        if cv_folds < 2:
+            raise ValueError(f"Number of splits {cv_folds} must be at least 2 for cross-validation.")
+        lasso = LassoCV(cv=cv_folds, random_state=0).fit(omics, self.phenotype_data)
+        ranked = pd.Series(lasso.coef_, index=omics.columns).abs().sort_values(ascending=False)
+        selected_features = ranked.head(self.num_features).index
+        self.logger.info(f"Selected {len(selected_features)} features based on LASSO coefficients.")
+        return omics[selected_features]
+
+    def _random_forest_based_selection(self) -> pd.DataFrame:
+        """
+        Selects up to ``num_features`` columns with the highest Random Forest feature importances.
+
+        Returns:
+            pd.DataFrame: DataFrame containing the selected features.
         """
-        Executes the feature selection process on the enhanced omics data.
+        self.logger.info("Performing Random Forest-based feature selection.")
+        rf = RandomForestClassifier(n_estimators=100, random_state=0)  # fixed seed for a repeatable ranking
+        rf.fit(self.enhanced_omics_data, self.phenotype_data)
+        importances = pd.Series(rf.feature_importances_, index=self.enhanced_omics_data.columns)
+        selected_features = importances.sort_values(ascending=False).head(self.num_features).index  # largest first
+        self.logger.info(f"Selected {len(selected_features)} features based on Random Forest importances.")
+        return self.enhanced_omics_data[selected_features]
+
 
-        Steps:
-            1. Perform feature selection using the specified method.
-            2. Save the selected features.
+    def run_feature_selection(self) -> pd.DataFrame:
+        """
+        Executes the feature selection process and returns the selected features.
 
         Returns:
-            pd.DataFrame: DataFrame containing the selected multi-omics features.
+            pd.DataFrame: DataFrame containing the selected features.
         """
         self.logger.info("Starting feature selection on enhanced omics data.")
-        selected_features = self.perform_feature_selection(self.enhanced_omics_data, self.phenotype_data)
-        self.save_selected_features(selected_features, filename="selected_genetic_features.csv")
+        selected_features = self.perform_feature_selection()  # uses the data and method stored on the instance
         self.logger.info("Feature selection on enhanced omics data completed successfully.")
         return selected_features