8 changes: 7 additions & 1 deletion .gitignore
@@ -31,6 +31,7 @@ s_embedding_testing**
tcga_brca
FireHose_data


/bioneuralnet/external_tools/cptac_wrapper.py
TCGA_BRCA_DATA
Smccnet_output/
@@ -75,6 +76,10 @@ MOGONET/
!bioneuralnet/datasets/tcga_brca/
!bioneuralnet/datasets/tcga_brca/**/*.csv

!bioneuralnet/datasets/networks/
!bioneuralnet/datasets/networks/**/*.csv
feature_testing

# Sphinx documentation build
docs/build/

@@ -99,7 +104,8 @@ bioneuralnet-env/
# OS generated files
.DS_Store
Thumbs.db

bioneuralnet/utils/kg_apis/
bioneuralnet/datasets/temp/
# Coverage reports
htmlcov/
.coverage
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -66,7 +66,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/).
- **Updated Tutorials and Documentation**: New end-to-end Jupyter notebook example.
- **Updated Tests**: All tests have been updated and new ones have been added.

## [1.0.1] - 2025-04-24
## [1.0.1] to [1.0.4] - 2025-04-24

- **BUG**: Fixed a bug related to missing RData files
- **New realease**: A new release will include documentation for the other updates. (1.0.3 or 1.0.2)
- **New realease**: A new release will include documentation for the other updates. (1.1.0)

Copilot AI Apr 24, 2025


Typo 'realease' should be corrected to 'release'.

Suggested change
- **New realease**: A new release will include documentation for the other updates. (1.1.0)
- **New release**: A new release will include documentation for the other updates. (1.1.0)

92 changes: 92 additions & 0 deletions Cancer_example.ipynb
@@ -4,8 +4,100 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"# BioNeuralNet Cancer Example 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bioneuralnet.datasets.dataset_loader import DatasetLoader\n",
"\n",
"brca = DatasetLoader(\"TCGA_BRCA\")\n",
"print(brca.shape)\n",
"print(brca)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from bioneuralnet.utils.preprocess import select_top_k_variance, top_anova_f_features\n",
"\n",
"# 1) Load your data\n",
"brca_meth_df = brca.data[\"BRCA_Meth\"]\n",
"brca_rna_df = brca.data[\"BRCA_RNA\"]\n",
"pam50_df = brca.data[\"BRCA_PAM50\"]\n",
"\n",
"# 2) Encode PAM50 → numeric codes\n",
"mapping = {\n",
" \"Normal\": 0,\n",
" \"Basal\": 1,\n",
" \"Her2\": 2,\n",
" \"LumA\": 3,\n",
" \"LumB\": 4,\n",
"}\n",
"pam50_series = pam50_df[\"PAM50\"].map(mapping)\n",
"\n",
"# sanity check\n",
"print(pam50_series.value_counts()) # note the () at the end!\n",
"\n",
"# 3) Select top-k by variance\n",
"top_k = 2000\n",
"meth_var = select_top_k_variance(brca_meth_df, k=top_k)\n",
"rna_var = select_top_k_variance(brca_rna_df, k=top_k)\n",
"print(\"Variance‐based:\")\n",
"print(\" Meth shape:\", meth_var.shape) \n",
"print(\" RNA shape:\", rna_var.shape)\n",
"\n",
"# 4) Select top-k by ANOVA F-test\n",
"# Note: the function signature is (X, y, max_features=…)\n",
"meth_anova = top_anova_f_features(\n",
" brca_meth_df,\n",
" pam50_series,\n",
" max_features=top_k\n",
")\n",
"rna_anova = top_anova_f_features(\n",
" brca_rna_df,\n",
" pam50_series,\n",
" max_features=top_k\n",
")\n",
"print(\"ANOVA‐based:\")\n",
"print(\" Meth shape:\", meth_anova.shape)\n",
"print(\" RNA shape:\", rna_anova.shape)\n",
"\n",
"# 5) (Optional) save to CSV\n",
"meth_var.to_csv(\"brca_meth_top2000_var.csv\", index_label=\"sample_id\")\n",
"rna_var.to_csv(\"brca_rna_top2000_var.csv\", index_label=\"sample_id\")\n",
"meth_anova.to_csv(\"brca_meth_top2000_anova.csv\", index_label=\"sample_id\")\n",
"rna_anova.to_csv(\"brca_rna_top2000_anova.csv\", index_label=\"sample_id\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1) Compute intersections\n",
"common_meth = meth_var.columns.intersection(meth_anova.columns)\n",
"common_rna = rna_var.columns.intersection(rna_anova.columns)\n",
"\n",
"# 2) Print counts and percentages\n",
"print(f\"Methylation overlap: {len(common_meth)} / {top_k} features \"\n",
" f\"({len(common_meth)/top_k:.1%})\")\n",
"print(f\"RNA overlap: {len(common_rna)} / {top_k} features \"\n",
" f\"({len(common_rna)/top_k:.1%})\")\n",
"\n",
"# 3) (Optional) peek at the first few common features\n",
"print(\"\\nFirst 10 common methylation features:\", list(common_meth[:10]))\n",
"print(\"First 10 common RNA features: \", list(common_rna[:10]))"
]
}
],
"metadata": {
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -11,6 +11,9 @@ recursive-include R *.r
recursive-include bioneuralnet/external_tools *.R *.r
recursive-include bioneuralnet/utils *.R *.r

recursive-include bioneuralnet/datasets/monet *.csv
recursive-include bioneuralnet/datasets/example1 *.csv

# Include documentation source files
recursive-include docs *

2 changes: 1 addition & 1 deletion README.md
@@ -8,7 +8,7 @@

[![Documentation](https://img.shields.io/badge/docs-read%20the%20docs-blue.svg)](https://bioneuralnet.readthedocs.io/en/latest/)

## Welcome to BioNeuralNet 1.0
## Welcome to BioNeuralNet 1.0.4

![BioNeuralNet Logo](assets/LOGO_WB.png)

4 changes: 3 additions & 1 deletion bioneuralnet/__init__.py
@@ -29,7 +29,7 @@
- `datasets`: Contains example (synthetic) datasets for testing and demonstration purposes.
"""

__version__ = "1.0.1"
__version__ = "1.0.4"

from .network_embedding import GNNEmbedding
from .subject_representation import GraphEmbedding
@@ -68,6 +68,7 @@
from .utils import get_logger

from .datasets import DatasetLoader
from .datasets import NetworkLoader
from .external_tools import SmCCNet
from .external_tools import WGCNA
from .external_tools import Node2Vec
@@ -83,6 +84,7 @@
"omics_correlation",
"cluster_correlation",
"louvain_to_adjacency",
"NetworkLoader",
"evaluate_rf",
"network_filter",
"rdata_to_df",
3 changes: 2 additions & 1 deletion bioneuralnet/datasets/__init__.py
@@ -1,3 +1,4 @@
from .dataset_loader import DatasetLoader
from .network_loader import NetworkLoader

__all__ = ["DatasetLoader"]
__all__ = ["DatasetLoader", "NetworkLoader"]
56 changes: 25 additions & 31 deletions bioneuralnet/datasets/dataset_loader.py
@@ -2,44 +2,36 @@
import pandas as pd

class DatasetLoader:
def __init__(self, dataset_name: str):
def __init__(self, dataset_name: str, feature_method: str = "var"):
"""
Args:
dataset_name (str): "example1", "monet", or "tcga_brca"
feature_method (str): for "tcga_brca" only, one of:
- "var" (variance filter, default)
- "ae" (autoencoder selection)
- "anova" (ANOVA F-test selection)
- "rf" (RandomForest importance selection)
"""
self.dataset_name = dataset_name.strip().lower()
self.feature_method = feature_method.strip().lower()
self.base_dir = Path(__file__).parent
self.data: dict[str, pd.DataFrame] = {}

self._load_data()

def _load_and_concat(self, folder: Path, stem: str) -> pd.DataFrame:
p1 = folder / f"{stem}_part1.csv"
p2 = folder / f"{stem}_part2.csv"
if p1.exists() and p2.exists():
df1 = pd.read_csv(p1, index_col=0)
df2 = pd.read_csv(p2, index_col=0)
return pd.concat([df1, df2], axis=0)

single = folder / f"{stem}.csv"
if not single.exists():
raise FileNotFoundError(f"File '{single.name}' not found in '{folder}'.")

return pd.read_csv(single, index_col=0)

def _load_data(self):
"""
Internal loader that fills self.data immediately.
Internal loader for the dataset.
"""
folder = self.base_dir / self.dataset_name
if not folder.is_dir():
raise FileNotFoundError(f"Dataset folder '{folder}' not found.")

if self.dataset_name == "example1":
self.data = {
"X1": pd.read_csv(folder / "X1.csv", index_col=0),
"X2": pd.read_csv(folder / "X2.csv", index_col=0),
"Y": pd.read_csv(folder / "Y.csv", index_col=0),
"X1": pd.read_csv(folder / "X1.csv", index_col=0),
"X2": pd.read_csv(folder / "X2.csv", index_col=0),
"Y": pd.read_csv(folder / "Y.csv", index_col=0),
"clinical_data": pd.read_csv(folder / "clinical_data.csv", index_col=0),
}

@@ -53,25 +45,27 @@ def _load_data(self):
}

elif self.dataset_name == "tcga_brca":
self.data = {
"BRCA_miRNA": pd.read_csv(folder / "BRCA_miRNA.csv", index_col=0),
"BRCA_Meth": self._load_and_concat(folder, "BRCA_Meth"),
"BRCA_RNA": self._load_and_concat(folder, "BRCA_RNA"),
"BRCA_PAM50": pd.read_csv(folder / "BRCA_PAM50.csv", index_col=0),
"BRCA_Clinical": pd.read_csv(folder / "BRCA_Clinical.csv", index_col=0),
}
valid = {"var", "ae", "anova", "rf"}
if self.feature_method not in valid:
raise ValueError(f"For tcga_brca, feature_method must be one of {valid}, but got {self.feature_method}")

self.data["brca_mirna"] = pd.read_csv(folder / "brca_mirna.csv", index_col=0)
self.data["brca_pam50"] = pd.read_csv(folder / "brca_pam50.csv", index_col=0)
self.data["brca_clinical"] = pd.read_csv(folder / "brca_clinical.csv", index_col=0)

meth_file = f"brca_meth_{self.feature_method}.csv"
rna_file = f"brca_rna_{self.feature_method}.csv"
self.data["brca_meth"] = pd.read_csv(folder / meth_file, index_col=0)
self.data["brca_rna"] = pd.read_csv(folder / rna_file, index_col=0)
else:
raise ValueError(f"Dataset '{self.dataset_name}' is not recognized.")

@property
def shape(self) -> dict[str, tuple[int,int]]:
def shape(self) -> dict[str, tuple[int, int]]:
"""
dict of table_name to (n_rows, n_cols), already loaded in __init__.
dict of table_name to (n_rows, n_cols)
"""
result = {}
for name, df in self.data.items():
result[name] = df.shape

return result
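
As a quick reference, here is a minimal usage sketch for the updated loader. It assumes the bundled tcga_brca tables ship with the package; the constructor arguments and lowercase table keys follow the loader code above.

from bioneuralnet.datasets import DatasetLoader

# load the TCGA BRCA bundle with ANOVA-selected features
brca = DatasetLoader("tcga_brca", feature_method="anova")
print(brca.shape)  # dict of table_name -> (n_rows, n_cols)

rna = brca.data["brca_rna"]  # ANOVA-selected RNA table
meth = brca.data["brca_meth"]  # ANOVA-selected methylation table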
79 changes: 79 additions & 0 deletions bioneuralnet/datasets/network_loader.py
@@ -0,0 +1,79 @@
from pathlib import Path
import pandas as pd
from typing import List

class NetworkLoader:
"""
Class to load bundled networks from the networks folder.
Current options are:
- brca_smccnet_ae
- brca_smccnet_rf
- brca_smccnet_var

Networks must live in subfolders each containing:
- GlobalNetwork.csv
- size_<n>_net_<i>.csv
"""
def __init__(self):
self.base_dir = Path(__file__).parent / "networks"
if not self.base_dir.is_dir():
raise FileNotFoundError(f"Bundled networks folder not found at: {self.base_dir}")

self.methods = [p.name for p in self.base_dir.iterdir() if p.is_dir()]

def available_methods(self) -> List[str]:
"""Return list of bundled network-method names"""
return self.methods

def load_global_network(self, method: str) -> pd.DataFrame:
"""
Load the GlobalNetwork.csv for the given method.
"""

folder = self.base_dir / method
path = folder / "GlobalNetwork.csv"

if not path.is_file():
raise FileNotFoundError(f"GlobalNetwork.csv not found for method {method}")

return pd.read_csv(path, index_col=0)

def load_clusters(self, method: str) -> List[pd.DataFrame]:
"""
Load all size_*_net_*.csv cluster files for the given method,
sorted by (size, index), and return them as DataFrames
"""
folder = self.base_dir / method
if not folder.is_dir():
raise FileNotFoundError(f"Method folder '{method}' not found under {self.base_dir}")

raw = list(folder.glob("size_*_net_*.csv"))
# sort by (size, index), both parsed from the "size_<n>_net_<i>" stem
sorted_paths = sorted(raw, key=lambda p: (int(p.stem.split("_")[1]), int(p.stem.split("_")[-1])))

clusters: List[pd.DataFrame] = []
for path in sorted_paths:
clusters.append(pd.read_csv(path, index_col=0))

return clusters
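
A minimal usage sketch for the new NetworkLoader follows; it assumes the three bundled brca_smccnet_* folders listed in the class docstring are installed with the package.

from bioneuralnet.datasets import NetworkLoader

loader = NetworkLoader()
print(loader.available_methods())  # e.g. ['brca_smccnet_ae', 'brca_smccnet_rf', 'brca_smccnet_var']

# global adjacency matrix and its clusters, sorted by (size, index)
global_net = loader.load_global_network("brca_smccnet_var")
clusters = loader.load_clusters("brca_smccnet_var")
print(global_net.shape, len(clusters))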
2,504 changes: 2,504 additions & 0 deletions bioneuralnet/datasets/networks/brca_smccnet_ae/GlobalNetwork.csv

Large diffs are not rendered by default.
