UCD-BDLab · ramosv · Dec 14, 2024 · Dec 12, 2024 · Dec 12, 2024
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ venv/
 ENV/
 env.bak/
 venv.bak/
+.pytest_cache
 
 # Sphinx documentation build
 docs/build/

diff --git a/README.pdf b/README.pdf
diff --git a/bioneuralnet/__init__.py b/bioneuralnet/__init__.py
@@ -49,7 +49,6 @@
 from .network_embedding.gnns import GNNEmbedding
 from .network_embedding.node2vec import Node2VecEmbedding
 from .subject_representation.subject_representation import SubjectRepresentationEmbedding
-from .utils.quick_start import quick_start
 from .utils.data_utils import combine_omics_data
 
 # Define the public API of the package

diff --git a/bioneuralnet/graph_generation/wgcna.py b/bioneuralnet/graph_generation/wgcna.py
@@ -1,25 +1,15 @@
 import os
 import subprocess
 import pandas as pd
-from typing import List, Optional
+from typing import List
 from ..utils.logger import get_logger
 from datetime import datetime
 
 class WGCNA:
     """
     WGCNA Class for Graph Construction using Weighted Gene Co-expression Network Analysis (WGCNA).
 
-    This class handles the execution of WGCNA R scripts, data preprocessing,
-    and loading of the resulting adjacency matrix.
-
-    Attributes:
-        phenotype_file (str): Path to phenotype data CSV file.
-        omics_list (List[str]): List of paths to omics data CSV files.
-        data_types (List[str]): List of omics data types.
-        soft_power (int): Soft-thresholding power.
-        min_module_size (int): Minimum module size.
-        merge_cut_height (float): Module merging threshold.
-        output_dir (str): Directory to save outputs.
+    Similar in approach to the SmCCNet wrapper: output files are left as generated, with no cleanup step.
     """
 
     def __init__(
@@ -30,30 +20,16 @@ def __init__(
         soft_power: int = 6,
         min_module_size: int = 30,
         merge_cut_height: float = 0.25,
-        output_dir: Optional[str] = None,
     ):
-        """
-        Initializes the WGCNA instance with direct parameters.
-
-        Args:
-            phenotype_file (str): Path to phenotype data CSV file.
-            omics_list (List[str]): List of paths to omics data CSV files.
-            data_types (List[str]): List of omics data types (e.g., ["gene", "miRNA"]).
-            soft_power (int, optional): Soft-thresholding power. Defaults to 6.
-            min_module_size (int, optional): Minimum module size. Defaults to 30.
-            merge_cut_height (float, optional): Module merging threshold. Defaults to 0.25.
-            output_dir (str, optional): Directory to save outputs. If None, creates a unique directory.
-        """
         # Assign parameters
         self.phenotype_file = phenotype_file
         self.omics_list = omics_list
         self.data_types = data_types
         self.soft_power = soft_power
         self.min_module_size = min_module_size
         self.merge_cut_height = merge_cut_height
-        self.output_dir = output_dir if output_dir else self._create_output_dir()
 
-        # Initialize logger (global logger)
+        # Initialize logger
         self.logger = get_logger(__name__)
         self.logger.info("Initialized WGCNA with the following parameters:")
         self.logger.info(f"Phenotype File: {self.phenotype_file}")
@@ -62,16 +38,10 @@ def __init__(
         self.logger.info(f"Soft Power: {self.soft_power}")
         self.logger.info(f"Minimum Module Size: {self.min_module_size}")
         self.logger.info(f"Merge Cut Height: {self.merge_cut_height}")
-        self.logger.info(f"Output Directory: {self.output_dir}")
 
     def _create_output_dir(self) -> str:
         """
         Creates a unique output directory for the current WGCNA run.
-
-        The directory is named 'wgcna_output_timestamp' and is created in the current working directory.
-
-        Returns:
-            str: Path to the created output directory.
         """
         base_dir = "wgcna_output"
         timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
@@ -84,23 +54,21 @@ def run(self) -> pd.DataFrame:
         """
         Executes the WGCNA pipeline and returns the global network adjacency matrix.
 
-        This method orchestrates the preprocessing of data, execution of the WGCNA R script,
-        loading of the resulting adjacency matrix, and cleanup of output files.
-
-        Returns:
-            pd.DataFrame: Adjacency matrix representing the global network.
+        Steps:
+        1. Create output directory.
+        2. Preprocess data.
+        3. Run WGCNA R script.
+        4. Load adjacency matrix.
 
-        Raises:
-            FileNotFoundError: If essential files are missing.
-            subprocess.CalledProcessError: If the R script execution fails.
-            Exception: For any other unforeseen errors during execution.
+        Output files are left in place; no cleanup step is performed afterwards.
         """
         try:
             self.logger.info("Starting WGCNA Network Construction")
+            # Create a unique output directory for this run
+            output_dir = self._create_output_dir()
             self.preprocess_data()
-            self.run_wgcna()
-            adjacency_matrix = self.load_global_network()
-            self.cleanup_output()
+            self.run_wgcna(output_dir)
+            adjacency_matrix = self.load_global_network(output_dir)
             self.logger.info("WGCNA executed successfully.")
             return adjacency_matrix
         except Exception as e:
@@ -109,23 +77,13 @@ def run(self) -> pd.DataFrame:
 
     def preprocess_data(self) -> None:
         """
-        Preprocesses the combined omics data by checking for NaN or infinite values.
-
-        Steps:
-            1. Load all omics data from the specified CSV files.
-            2. Ensure that all omics datasets have the same sample IDs and order as the phenotype data.
-            3. Remove samples with any NaN or infinite values across all datasets.
-            4. Save the cleaned omics data back to their respective CSV files.
-
-        Raises:
-            FileNotFoundError: If any omics data file is not found.
-            pd.errors.EmptyDataError: If any omics data file is empty.
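+        Aligns each omics file to the phenotype sample IDs, checks for NaN/infinite values, and writes the aligned data back to the same CSV (similar to SmCCNet).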
         """
         self.logger.info("Preprocessing omics data for NaN or infinite values.")
 
         # Load phenotype data
         try:
-            phenotype_data = pd.read_csv(self.phenotype_file, header=True, stringsAsFactors=False)
+            phenotype_data = pd.read_csv(self.phenotype_file, header=0)
             self.logger.info(f"Phenotype data loaded with shape {phenotype_data.shape}")
         except FileNotFoundError:
             self.logger.error(f"Phenotype data file not found: {self.phenotype_file}")
@@ -138,20 +96,16 @@ def preprocess_data(self) -> None:
             self.logger.error("Number of omics data files does not match number of data types.")
             raise ValueError("Number of omics data files does not match number of data types.")
 
-        # Assuming the first column is sample IDs
         sample_ids_pheno = phenotype_data.iloc[:, 0]
         self.logger.info(f"Number of samples in phenotype data: {len(sample_ids_pheno)}")
 
-        # Initialize a Series to track valid samples
         valid_samples = pd.Series([True] * len(sample_ids_pheno), index=sample_ids_pheno.index)
 
-        # Iterate over each omics dataset
         for omics_file in self.omics_list:
             self.logger.info(f"Processing omics file: {omics_file}")
 
-            # Load omics data
             try:
-                omics_data = pd.read_csv(omics_file, header=True, stringsAsFactors=False)
+                omics_data = pd.read_csv(omics_file, header=0)
                 self.logger.info(f"Omics data loaded with shape {omics_data.shape}")
             except FileNotFoundError:
                 self.logger.error(f"Omics data file not found: {omics_file}")
@@ -160,19 +114,15 @@ def preprocess_data(self) -> None:
                 self.logger.error(f"Omics data file is empty: {omics_file}")
                 raise
 
-            # Assuming the first column is sample IDs
             sample_ids_omics = omics_data.iloc[:, 0]
             omics_values = omics_data.iloc[:, 1:]
 
-            # Align the omics data with the phenotype data using sample IDs
             aligned_data = omics_values.set_index(sample_ids_omics).loc[sample_ids_pheno].reset_index(drop=True)
 
-            # Check for any mismatches after alignment
             if aligned_data.isnull().values.any():
                 self.logger.warning(f"NaN values detected in omics data after alignment for file: {omics_file}")
                 valid_samples &= ~aligned_data.isnull().any(axis=1)
 
-            # Check for infinite values
             if not pd.api.types.is_numeric_dtype(aligned_data.dtypes).all():
                 self.logger.warning(f"Non-numeric values detected in omics data for file: {omics_file}. Attempting to convert.")
                 aligned_data = aligned_data.apply(pd.to_numeric, errors='coerce')
@@ -185,51 +135,38 @@ def preprocess_data(self) -> None:
                 aligned_data.replace([float('inf'), -float('inf')], pd.NA, inplace=True)
                 valid_samples &= ~aligned_data.isnull().any(axis=1)
 
-            # Save the aligned and cleaned omics data back to the CSV file
             omics_data_clean = pd.concat([sample_ids_pheno.reset_index(drop=True), aligned_data], axis=1)
             omics_data_clean.to_csv(omics_file, index=False)
             self.logger.info(f"Cleaned omics data saved to {omics_file}")
 
-        # Determine which samples are valid across all omics datasets
         num_valid_samples = valid_samples.sum()
         self.logger.info(f"Number of valid samples after preprocessing: {num_valid_samples} out of {len(sample_ids_pheno)}")
 
         if num_valid_samples == 0:
             self.logger.error("No valid samples remaining after preprocessing. Aborting WGCNA run.")
             raise ValueError("No valid samples remaining after preprocessing.")
 
-    def run_wgcna(self) -> None:
+    def run_wgcna(self, output_dir: str) -> None:
         """
-        Executes the R script for WGCNA.
-
-        Constructs the command to run the WGCNA R script with appropriate arguments
-        and captures its output.
-
-        Raises:
-            FileNotFoundError: If the WGCNA R script is not found.
-            subprocess.CalledProcessError: If the R script execution fails.
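+        Executes the WGCNA R script. NOTE(review): `output_dir` is not among the visible command arguments; confirm WGCNA.R still writes where `load_global_network` looks.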
         """
-        # Construct argument strings
         omics_files_str = ','.join(self.omics_list)
 
-        # Determine the path to the R script
         script_dir = os.path.dirname(os.path.abspath(__file__))
         r_script = os.path.join(script_dir, "WGCNA.R")
 
         if not os.path.isfile(r_script):
             self.logger.error(f"R script not found: {r_script}")
             raise FileNotFoundError(f"R script not found: {r_script}")
 
-        # Construct the command to execute the R script
         command = [
             "Rscript",
             r_script,
-            self.phenotype_file,           # args[1]: phenotype_file
-            omics_files_str,              # args[2]: omics_files (comma-separated)
-            str(self.soft_power),         # args[3]: soft_power
-            str(self.min_module_size),    # args[4]: min_module_size
-            str(self.merge_cut_height),   # args[5]: merge_cut_height
-            self.output_dir,              # args[6]: output_dir
+            self.phenotype_file,          
+            omics_files_str,             
+            str(self.soft_power),        
+            str(self.min_module_size),    
+            str(self.merge_cut_height),  
         ]
 
         self.logger.debug(f"Executing command: {' '.join(command)}")
@@ -252,24 +189,16 @@ def run_wgcna(self) -> None:
             self.logger.error(f"R script execution failed: {e.stderr}")
             raise
 
-    def load_global_network(self) -> pd.DataFrame:
+    def load_global_network(self, output_dir: str) -> pd.DataFrame:
         """
         Loads the global network adjacency matrix generated by WGCNA.
-
-        Returns:
-            pd.DataFrame: Adjacency matrix of the global network.
-
-        Raises:
-            FileNotFoundError: If the global network CSV file is not found.
-            pd.errors.EmptyDataError: If the CSV file is empty.
         """
-        global_network_csv = os.path.join(self.output_dir, "global_network.csv")
+        global_network_csv = os.path.join(output_dir, "global_network.csv")
 
         if not os.path.isfile(global_network_csv):
             self.logger.error(f"Global network file not found: {global_network_csv}")
             raise FileNotFoundError(f"Global network file not found: {global_network_csv}")
 
-        # Load the adjacency matrix
         try:
             adjacency_matrix = pd.read_csv(global_network_csv, index_col=0)
             self.logger.info("Global network adjacency matrix loaded successfully.")
@@ -278,44 +207,4 @@ def load_global_network(self) -> pd.DataFrame:
             self.logger.error(f"Global network CSV file is empty: {global_network_csv}")
             raise
 
-    def read_adjacency_matrix(self) -> pd.DataFrame:
-        """
-        Reads and returns the global network adjacency matrix.
 
-        Returns:
-            pd.DataFrame: Adjacency matrix of the global network.
-        """
-        return self.load_global_network()
-
-    def cleanup_output(self) -> None:
-        """
-        Cleans up and reorganizes WGCNA output files.
-
-        Moves `.RData` and `.csv` files into a dedicated
-        `wgcna_results` subdirectory within the `wgcna_output` directory for better organization.
-
-        Raises:
-            Exception: If any error occurs during the cleanup process.
-        """
-        import shutil
-
-        try:
-            saving_dir = self.output_dir
-            logger = self.logger
-
-            # Define the target directory for organized outputs
-            results_dir = os.path.join(saving_dir, "wgcna_results")
-            os.makedirs(results_dir, exist_ok=True)
-
-            # Move `.RData` and `.csv` files to `wgcna_results` directory
-            for file_name in os.listdir(saving_dir):
-                if file_name.endswith(".RData") or file_name.endswith(".csv"):
-                    src_file = os.path.join(saving_dir, file_name)
-                    shutil.move(src_file, results_dir)
-                    logger.info(f"Moved {file_name} to {results_dir}")
-
-            logger.info("Cleanup and reorganization completed successfully.")
-
-        except Exception as e:
-            logger.error(f"Error during cleanup: {e}")
-            raise
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -35,7 +35,6 @@ The API Reference provides detailed documentation for each module, class, and fu
     bioneuralnet.subject_representation.SubjectRepresentationEmbedding
     bioneuralnet.utils.file_helpers.find_files
     bioneuralnet.utils.path_utils.validate_paths
-    bioneuralnet.utils.quick_start.quick_start
 ```
 
 **Explanation:**

diff --git a/requirements.txt b/requirements.txt
@@ -3,6 +3,7 @@
 # PyTorch and related libraries
 torch==2.0.0
 torch_geometric
+wheel
 
 # Data handling and processing
 PyYAML>=5.4

diff --git a/scripts/quick-start.py b/scripts/quick-start.py
@@ -41,7 +41,14 @@ def main():
         print("Unsupported OS. Please install R dependencies manually.")
         exit(1)
 
-    print("Quick-start completed successfully!")
+    print("------------------------------------------------")
+    print("BioNeuralNet quick-start completed successfully!")
+    print("------------------------------------------------\n")
+    print("To activate the virtual environment, run:")
+    print("source .venv/bin/activate\n")
+    print("To deactivate the virtual environment, run:")
+    print("deactivate\n")
+
 
 if __name__ == "__main__":
     main()
diff --git a/scripts/requirements-cpu.txt b/scripts/requirements-cpu.txt
@@ -1,5 +1,4 @@
 # CPU-specific dependencies for BioNeuralNet
-
-# PyTorch Geometric dependencies for CPU
-torch_scatter @ https://data.pyg.org/whl/torch-2.0.0+cpu.html
-torch_sparse @ https://data.pyg.org/whl/torch-2.0.0+cpu.html
+-f https://data.pyg.org/whl/torch-2.0.0+cpu.html
+torch-scatter
+torch-sparse
diff --git a/scripts/requirements-dev.txt b/scripts/requirements-dev.txt
@@ -20,5 +20,4 @@ tox
 
 # Build dependencies for BioNeuralNet
 setuptools>=42
-wheel
 twine
diff --git a/scripts/requirements-gpu.txt b/scripts/requirements-gpu.txt
@@ -1,5 +1,6 @@
 # CUDA-specific dependencies for BioNeuralNet
 
-# PyTorch Geometric dependencies for CUDA (e.g. here CUDA 11.7)
-torch_scatter @ https://data.pyg.org/whl/torch-2.0.0+cu117.html
-torch_sparse @ https://data.pyg.org/whl/torch-2.0.0+cu117.html
+# PyTorch Geometric dependencies for CUDA (using CUDA 11.7 here)
+-f https://data.pyg.org/whl/torch-2.0.0+cu117.html
+torch-scatter
+torch-sparse
diff --git a/scripts/setup-R.bat b/scripts/setup-R.bat
@@ -11,9 +11,12 @@ if %ERRORLEVEL% NEQ 0 (
     echo R is already installed.
 )
 
-:: Install R packages
-echo Installing R packages: dplyr, SmCCNet, WGCNA...
-Rscript -e "install.packages(c('dplyr', 'SmCCNet', 'WGCNA'), repos='http://cran.r-project.org')"
+:: Install CRAN and Bioconductor packages
+echo Installing R packages: dplyr, SmCCNet, WGCNA, and dependencies...
+Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); install.packages(c('dplyr', 'SmCCNet'))"
+Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')"
+Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'))"
+Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); install.packages('WGCNA')"
 
 echo R dependencies setup completed!
 pause
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,5 +20,4 @@ tox @@
     # Build dependencies for BioNeuralNet
     setuptools>=42
-    wheel
     twine