Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ venv/
ENV/
env.bak/
venv.bak/
.pytest_cache

# Sphinx documentation build
docs/build/
Expand Down
Binary file added README.pdf
Binary file not shown.
1 change: 0 additions & 1 deletion bioneuralnet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@
from .network_embedding.gnns import GNNEmbedding
from .network_embedding.node2vec import Node2VecEmbedding
from .subject_representation.subject_representation import SubjectRepresentationEmbedding
from .utils.quick_start import quick_start
from .utils.data_utils import combine_omics_data

# Define the public API of the package
Expand Down
161 changes: 25 additions & 136 deletions bioneuralnet/graph_generation/wgcna.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,15 @@
import os
import subprocess
import pandas as pd
from typing import List, Optional
from typing import List
from ..utils.logger import get_logger
from datetime import datetime

class WGCNA:
"""
WGCNA Class for Graph Construction using Weighted Gene Co-expression Network Analysis (WGCNA).

This class handles the execution of WGCNA R scripts, data preprocessing,
and loading of the resulting adjacency matrix.

Attributes:
phenotype_file (str): Path to phenotype data CSV file.
omics_list (List[str]): List of paths to omics data CSV files.
data_types (List[str]): List of omics data types.
soft_power (int): Soft-thresholding power.
min_module_size (int): Minimum module size.
merge_cut_height (float): Module merging threshold.
output_dir (str): Directory to save outputs.
Similar in approach to SmCCNet: no cleanup step if not required.
"""

def __init__(
Expand All @@ -30,30 +20,16 @@ def __init__(
soft_power: int = 6,
min_module_size: int = 30,
merge_cut_height: float = 0.25,
output_dir: Optional[str] = None,
):
"""
Initializes the WGCNA instance with direct parameters.

Args:
phenotype_file (str): Path to phenotype data CSV file.
omics_list (List[str]): List of paths to omics data CSV files.
data_types (List[str]): List of omics data types (e.g., ["gene", "miRNA"]).
soft_power (int, optional): Soft-thresholding power. Defaults to 6.
min_module_size (int, optional): Minimum module size. Defaults to 30.
merge_cut_height (float, optional): Module merging threshold. Defaults to 0.25.
output_dir (str, optional): Directory to save outputs. If None, creates a unique directory.
"""
# Assign parameters
self.phenotype_file = phenotype_file
self.omics_list = omics_list
self.data_types = data_types
self.soft_power = soft_power
self.min_module_size = min_module_size
self.merge_cut_height = merge_cut_height
self.output_dir = output_dir if output_dir else self._create_output_dir()

# Initialize logger (global logger)
# Initialize logger
self.logger = get_logger(__name__)
self.logger.info("Initialized WGCNA with the following parameters:")
self.logger.info(f"Phenotype File: {self.phenotype_file}")
Expand All @@ -62,16 +38,10 @@ def __init__(
self.logger.info(f"Soft Power: {self.soft_power}")
self.logger.info(f"Minimum Module Size: {self.min_module_size}")
self.logger.info(f"Merge Cut Height: {self.merge_cut_height}")
self.logger.info(f"Output Directory: {self.output_dir}")

def _create_output_dir(self) -> str:
"""
Creates a unique output directory for the current WGCNA run.

The directory is named 'wgcna_output_timestamp' and is created in the current working directory.

Returns:
str: Path to the created output directory.
"""
base_dir = "wgcna_output"
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
Expand All @@ -84,23 +54,21 @@ def run(self) -> pd.DataFrame:
"""
Executes the WGCNA pipeline and returns the global network adjacency matrix.

This method orchestrates the preprocessing of data, execution of the WGCNA R script,
loading of the resulting adjacency matrix, and cleanup of output files.

Returns:
pd.DataFrame: Adjacency matrix representing the global network.
Steps:
1. Create output directory.
2. Preprocess data.
3. Run WGCNA R script.
4. Load adjacency matrix.

Raises:
FileNotFoundError: If essential files are missing.
subprocess.CalledProcessError: If the R script execution fails.
Exception: For any other unforeseen errors during execution.
No cleanup step if not needed.
"""
try:
self.logger.info("Starting WGCNA Network Construction")
# Create a unique output directory for this run
output_dir = self._create_output_dir()
self.preprocess_data()
self.run_wgcna()
adjacency_matrix = self.load_global_network()
self.cleanup_output()
self.run_wgcna(output_dir)
adjacency_matrix = self.load_global_network(output_dir)
self.logger.info("WGCNA executed successfully.")
return adjacency_matrix
except Exception as e:
Expand All @@ -109,23 +77,13 @@ def run(self) -> pd.DataFrame:

def preprocess_data(self) -> None:
"""
Preprocesses the combined omics data by checking for NaN or infinite values.

Steps:
1. Load all omics data from the specified CSV files.
2. Ensure that all omics datasets have the same sample IDs and order as the phenotype data.
3. Remove samples with any NaN or infinite values across all datasets.
4. Save the cleaned omics data back to their respective CSV files.

Raises:
FileNotFoundError: If any omics data file is not found.
pd.errors.EmptyDataError: If any omics data file is empty.
Preprocesses the omics data similarly to SmCCNet.
"""
self.logger.info("Preprocessing omics data for NaN or infinite values.")

# Load phenotype data
try:
phenotype_data = pd.read_csv(self.phenotype_file, header=True, stringsAsFactors=False)
phenotype_data = pd.read_csv(self.phenotype_file, header=0)
self.logger.info(f"Phenotype data loaded with shape {phenotype_data.shape}")
except FileNotFoundError:
self.logger.error(f"Phenotype data file not found: {self.phenotype_file}")
Expand All @@ -138,20 +96,16 @@ def preprocess_data(self) -> None:
self.logger.error("Number of omics data files does not match number of data types.")
raise ValueError("Number of omics data files does not match number of data types.")

# Assuming the first column is sample IDs
sample_ids_pheno = phenotype_data.iloc[:, 0]
self.logger.info(f"Number of samples in phenotype data: {len(sample_ids_pheno)}")

# Initialize a Series to track valid samples
valid_samples = pd.Series([True] * len(sample_ids_pheno), index=sample_ids_pheno.index)

# Iterate over each omics dataset
for omics_file in self.omics_list:
self.logger.info(f"Processing omics file: {omics_file}")

# Load omics data
try:
omics_data = pd.read_csv(omics_file, header=True, stringsAsFactors=False)
omics_data = pd.read_csv(omics_file, header=0)
self.logger.info(f"Omics data loaded with shape {omics_data.shape}")
except FileNotFoundError:
self.logger.error(f"Omics data file not found: {omics_file}")
Expand All @@ -160,19 +114,15 @@ def preprocess_data(self) -> None:
self.logger.error(f"Omics data file is empty: {omics_file}")
raise

# Assuming the first column is sample IDs
sample_ids_omics = omics_data.iloc[:, 0]
omics_values = omics_data.iloc[:, 1:]

# Align the omics data with the phenotype data using sample IDs
aligned_data = omics_values.set_index(sample_ids_omics).loc[sample_ids_pheno].reset_index(drop=True)

# Check for any mismatches after alignment
if aligned_data.isnull().values.any():
self.logger.warning(f"NaN values detected in omics data after alignment for file: {omics_file}")
valid_samples &= ~aligned_data.isnull().any(axis=1)

# Check for infinite values
if not pd.api.types.is_numeric_dtype(aligned_data.dtypes).all():
self.logger.warning(f"Non-numeric values detected in omics data for file: {omics_file}. Attempting to convert.")
aligned_data = aligned_data.apply(pd.to_numeric, errors='coerce')
Expand All @@ -185,51 +135,38 @@ def preprocess_data(self) -> None:
aligned_data.replace([float('inf'), -float('inf')], pd.NA, inplace=True)
valid_samples &= ~aligned_data.isnull().any(axis=1)

# Save the aligned and cleaned omics data back to the CSV file
omics_data_clean = pd.concat([sample_ids_pheno.reset_index(drop=True), aligned_data], axis=1)
omics_data_clean.to_csv(omics_file, index=False)
self.logger.info(f"Cleaned omics data saved to {omics_file}")

# Determine which samples are valid across all omics datasets
num_valid_samples = valid_samples.sum()
self.logger.info(f"Number of valid samples after preprocessing: {num_valid_samples} out of {len(sample_ids_pheno)}")

if num_valid_samples == 0:
self.logger.error("No valid samples remaining after preprocessing. Aborting WGCNA run.")
raise ValueError("No valid samples remaining after preprocessing.")

def run_wgcna(self) -> None:
def run_wgcna(self, output_dir: str) -> None:
"""
Executes the R script for WGCNA.

Constructs the command to run the WGCNA R script with appropriate arguments
and captures its output.

Raises:
FileNotFoundError: If the WGCNA R script is not found.
subprocess.CalledProcessError: If the R script execution fails.
Executes the WGCNA R script with required arguments.
"""
# Construct argument strings
omics_files_str = ','.join(self.omics_list)

# Determine the path to the R script
script_dir = os.path.dirname(os.path.abspath(__file__))
r_script = os.path.join(script_dir, "WGCNA.R")

if not os.path.isfile(r_script):
self.logger.error(f"R script not found: {r_script}")
raise FileNotFoundError(f"R script not found: {r_script}")

# Construct the command to execute the R script
command = [
"Rscript",
r_script,
self.phenotype_file, # args[1]: phenotype_file
omics_files_str, # args[2]: omics_files (comma-separated)
str(self.soft_power), # args[3]: soft_power
str(self.min_module_size), # args[4]: min_module_size
str(self.merge_cut_height), # args[5]: merge_cut_height
self.output_dir, # args[6]: output_dir
self.phenotype_file,
omics_files_str,
str(self.soft_power),
str(self.min_module_size),
str(self.merge_cut_height),
]

self.logger.debug(f"Executing command: {' '.join(command)}")
Expand All @@ -252,24 +189,16 @@ def run_wgcna(self) -> None:
self.logger.error(f"R script execution failed: {e.stderr}")
raise

def load_global_network(self) -> pd.DataFrame:
def load_global_network(self, output_dir: str) -> pd.DataFrame:
"""
Loads the global network adjacency matrix generated by WGCNA.

Returns:
pd.DataFrame: Adjacency matrix of the global network.

Raises:
FileNotFoundError: If the global network CSV file is not found.
pd.errors.EmptyDataError: If the CSV file is empty.
"""
global_network_csv = os.path.join(self.output_dir, "global_network.csv")
global_network_csv = os.path.join(output_dir, "global_network.csv")

if not os.path.isfile(global_network_csv):
self.logger.error(f"Global network file not found: {global_network_csv}")
raise FileNotFoundError(f"Global network file not found: {global_network_csv}")

# Load the adjacency matrix
try:
adjacency_matrix = pd.read_csv(global_network_csv, index_col=0)
self.logger.info("Global network adjacency matrix loaded successfully.")
Expand All @@ -278,44 +207,4 @@ def load_global_network(self) -> pd.DataFrame:
self.logger.error(f"Global network CSV file is empty: {global_network_csv}")
raise

def read_adjacency_matrix(self) -> pd.DataFrame:
"""
Reads and returns the global network adjacency matrix.

Returns:
pd.DataFrame: Adjacency matrix of the global network.
"""
return self.load_global_network()

def cleanup_output(self) -> None:
"""
Cleans up and reorganizes WGCNA output files.

Moves `.RData` and `.csv` files into a dedicated
`wgcna_results` subdirectory within the `wgcna_output` directory for better organization.

Raises:
Exception: If any error occurs during the cleanup process.
"""
import shutil

try:
saving_dir = self.output_dir
logger = self.logger

# Define the target directory for organized outputs
results_dir = os.path.join(saving_dir, "wgcna_results")
os.makedirs(results_dir, exist_ok=True)

# Move `.RData` and `.csv` files to `wgcna_results` directory
for file_name in os.listdir(saving_dir):
if file_name.endswith(".RData") or file_name.endswith(".csv"):
src_file = os.path.join(saving_dir, file_name)
shutil.move(src_file, results_dir)
logger.info(f"Moved {file_name} to {results_dir}")

logger.info("Cleanup and reorganization completed successfully.")

except Exception as e:
logger.error(f"Error during cleanup: {e}")
raise
1 change: 0 additions & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ The API Reference provides detailed documentation for each module, class, and fu
bioneuralnet.subject_representation.SubjectRepresentationEmbedding
bioneuralnet.utils.file_helpers.find_files
bioneuralnet.utils.path_utils.validate_paths
bioneuralnet.utils.quick_start.quick_start
```

**Explanation:**
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# PyTorch and related libraries
torch==2.0.0
torch_geometric
wheel

# Data handling and processing
PyYAML>=5.4
Expand Down
9 changes: 8 additions & 1 deletion scripts/quick-start.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,14 @@ def main():
print("Unsupported OS. Please install R dependencies manually.")
exit(1)

print("Quick-start completed successfully!")
print("------------------------------------------------")
print("BioNeuralNet quick-start completed successfully!")
print("------------------------------------------------\n")
print("To activate the virtual environment, run:")
print("source .venv/bin/activate\n")
print("To deactivate the virtual environment, run:")
print("deactivate\n")


if __name__ == "__main__":
main()
7 changes: 3 additions & 4 deletions scripts/requirements-cpu.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
# CPU-specific dependencies for BioNeuralNet

# PyTorch Geometric dependencies for CPU
torch_scatter @ https://data.pyg.org/whl/torch-2.0.0+cpu.html
torch_sparse @ https://data.pyg.org/whl/torch-2.0.0+cpu.html
-f https://data.pyg.org/whl/torch-2.0.0+cpu.html
torch-scatter
torch-sparse
1 change: 0 additions & 1 deletion scripts/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,4 @@ tox

# Build dependencies for BioNeuralNet
setuptools>=42
wheel
twine
7 changes: 4 additions & 3 deletions scripts/requirements-gpu.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# CUDA-specific dependencies for BioNeuralNet

# PyTorch Geometric dependencies for CUDA (e.g. here CUDA 11.7)
torch_scatter @ https://data.pyg.org/whl/torch-2.0.0+cu117.html
torch_sparse @ https://data.pyg.org/whl/torch-2.0.0+cu117.html
# PyTorch Geometric dependencies for CUDA (using CUDA 11.7 here)
-f https://data.pyg.org/whl/torch-2.0.0+cu117.html
torch-scatter
torch-sparse
9 changes: 6 additions & 3 deletions scripts/setup-R.bat
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ if %ERRORLEVEL% NEQ 0 (
echo R is already installed.
)

:: Install R packages
echo Installing R packages: dplyr, SmCCNet, WGCNA...
Rscript -e "install.packages(c('dplyr', 'SmCCNet', 'WGCNA'), repos='http://cran.r-project.org')"
:: Install CRAN and Bioconductor packages
echo Installing R packages: dplyr, SmCCNet, WGCNA, and dependencies...
Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); install.packages(c('dplyr', 'SmCCNet'))"
Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); if (!requireNamespace('BiocManager', quietly = TRUE)) install.packages('BiocManager')"
Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); BiocManager::install(c('impute', 'preprocessCore', 'GO.db', 'AnnotationDbi'))"
Rscript -e "options(repos = c(CRAN = 'https://cran.r-project.org')); install.packages('WGCNA')"

echo R dependencies setup completed!
pause
Loading
Loading