NagaComBio · NagaComBio · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -38,12 +38,29 @@ jobs:
           http-user-agent: ${{ matrix.config.http-user-agent }}
           use-public-rspm: true
 
-      - uses: r-lib/actions/setup-r-dependencies@v2
-        with:
-          extra-packages: rcmdcheck
+      - name: Install all dependencies manually
+        run: |
+          # Install Canopy dependencies from CRAN
+          install.packages(c('ape', 'fields', 'pheatmap', 'scatterplot3d'), repos='https://cloud.r-project.org')
+          # Install Canopy from CRAN archive
+          install.packages('http://cran.r-project.org/src/contrib/Archive/Canopy/Canopy_1.3.0.tar.gz', repos=NULL, type='source')
+          # Install remaining package dependencies including knitr for vignettes
+          install.packages(c('rcmdcheck', 'testthat', 'dplyr', 'ggplot2', 'magrittr', 'purrr', 'gridExtra', 'grid', 'readr', 'knitr'), repos='https://cloud.r-project.org')
+        shell: Rscript {0}
 
       - uses: r-lib/actions/check-r-package@v2
 
+      - name: Run tests
+        run: |
+          testthat::test_dir("tests/testthat")
+        shell: Rscript {0}
+
+      - uses: r-lib/actions/check-r-package@v2
+
+      - name: Run tests
+        run: |
+          Rscript -e "testthat::test_dir('tests/testthat')"
+
       - name: Show testthat output
         if: always()
         run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,125 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+TiNDA (Tumor in Normal Detection Analysis) is an R package that rescues somatic variants misclassified as germline due to tumor DNA contamination in patient blood/control samples. It uses Canopy's EM-cluster algorithm to partition variants into clusters and classify them as somatic rescue, CHIP, or germline based on VAF patterns.
+
+## Development Commands
+
+```bash
+# Run R CMD check
+R CMD check .
+
+# Install package locally
+R CMD INSTALL .
+
+# Build package
+R CMD build .
+
+# Run package tests
+R -e "devtools::test()"
+
+# Load package for development
+devtools::load_all()
+
+# Document with roxygen2
+devtools::document()
+
+# Run specific test file
+R -e "testthat::test_file('tests/testthat/test-filename.R')"
+
+# Check coverage
+covr::package_coverage()
+```
+
+## Architecture
+
+### Core Components
+
+**R/TiNDA.R** - Main package logic
+- `TiNDA()` function: Primary entry point that takes tumor/control variant data and classifies variants
+- Uses Canopy's `canopy.cluster()` for EM-based clustering
+- Key classification parameters: `max_control_af` (0.25), `min_tumor_af` (0.01), `min_clst_members` (0.85)
+- Classification flow: Canopy clustering → potential somatic/CHIP cluster selection → germline exclusion → optional homozygous rescue
+
+**R/data.R** - Data simulation utilities
+- `generate_depth()`: Helper for generating variant depths with normal distribution
+- `simulate_variants()`: Simulates germline, somatic, and CHIP variants with configurable VAFs
+- `generate_test_data()`: Generates test data across all chromosomes
+
+**R/plot.R** - Visualization functions
+- `canopy_clst_plot()`: Shows Canopy cluster assignments in VAF space
+- `tinda_clst_plot()`: Shows TiNDA final classification (Somatic_Rescue, Germline, CHIP)
+- `tinda_linear_plot()`: Linear genome browser-style plot across chromosomes
+- `tinda_summary_plot()`: Combined view with all plots and summary table
+
+**R/write_data.R** - Output utilities
+- `write_data()`: Exports TiNDA results to TSV
+
+### Data Flow
+
+1. Input: Data frame with CHR, POS, Control_ALT_DP, Control_DP, Tumor_ALT_DP, Tumor_DP
+2. VAF calculation: Control_AF = Control_ALT_DP/Control_DP, Tumor_AF = Tumor_ALT_DP/Tumor_DP
+3. Canopy EM clustering with 10 clusters
+4. Cluster classification based on Area of Interest (AOI) criteria
+5. Optional CHIP detection (controlled_af: 0.02-0.35, tumor_af < 0.25)
+6. Homozygous TiN rescue at top-left corner
+7. Output: Classified variants with TiN_Class column
+
+### Key Parameters
+
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| max_control_af | 0.25 | Max control VAF for somatic rescue |
+| min_tumor_af | 0.01 | Min tumor VAF for somatic rescue |
+| min_clst_members | 0.85 | Min cluster members (proportion) in AOI |
+| num_run | 1 | EM runs per cluster count |
+| find_chip | TRUE | Enable CHIP detection |
+| max_control_af_chip | 0.40 | Max control VAF for CHIP |
+| max_tumor_af_chip | 0.25 | Max tumor VAF for CHIP |
+
+### Dependencies
+
+- Canopy (>= 1.3.0): EM clustering
+- ggplot2: Visualization
+- dplyr, purrr: Data manipulation
+- readr: TSV I/O
+
+### Supported Reference Genomes
+
+- hg19: data/hg19_length.rda
+- hg38: data/hg38_length.rda
+
+## Workflow Example
+
+```r
+library(TiNDA)
+data(hg19_length)
+
+# Generate test data
+test_df <- generate_test_data(hg19_length, num_variants = 500)
+
+# Run TiNDA
+tinda_obj <- TiNDA(test_df, sample_name = "sample_1", data_source = "WGS")
+
+# Visualize
+canopy_clst_plot(tinda_obj)
+tinda_clst_plot(tinda_obj)
+tinda_linear_plot(tinda_obj)
+tinda_summary_plot(tinda_obj)
+
+# Export results
+write_data(tinda_obj, "results.tsv")
+```
+
+## Testing
+
+No unit tests currently exist in the repository. Test files should be placed in `tests/testthat/`.
+
+## Notes
+
+- Input data should be pre-filtered rare germline variants (not common SNPs or artifacts)
+- For large datasets, consider using only exonic variants
+- Column names in input must match exactly: CHR, POS, Control_ALT_DP, Control_DP, Tumor_ALT_DP, Tumor_DP
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,8 @@
 export(TiNDA)
 export(canopy_clst_plot)
 export(generate_test_data)
+export(get_tinda_params)
+export(run_pipeline)
 export(tinda_clst_plot)
 export(tinda_linear_plot)
 export(tinda_summary_plot)
@@ -23,3 +25,5 @@ importFrom(stats,median)
 importFrom(stats,quantile)
 importFrom(stats,rnorm)
 importFrom(utils,data)
+S3method(print,TiNDA)
+S3method(summary,TiNDA)
diff --git a/R/TiNDA.R b/R/TiNDA.R
@@ -10,12 +10,12 @@
 #' @param tbl A data frame object with rare germline variants with the raw coverage information.
 #' @param sample_name Name of the sample for the title, Default: 'pid_1'
 #' @param data_source WGS or WES, Default: 'WGS'
-#' @param max_control_af Maximum control variant allele frequency (VAF), Default: 0.45
+#' @param max_control_af Maximum control variant allele frequency (VAF), Default: 0.25
 #' @param min_tumor_af Minimum tumor variant allele frequency (VAF), Default: 0.01
 #' @param min_clst_members Minimum number of members in the cluster to be below the max_control_af and above the min_tumor_af, Default: 0.85
 #' @param num_run For canopy cluster function, "number of EM runs for estimation for each specific number of clusters (to avoid EM being stuck in local optima)", Default: 1
 #' @param min_control_af_chip Minimum control variant allele frequency (VAF) for CHIP cluster, Default: 0.02
-#' @param max_control_af_chip Maximum control variant allele frequency (VAF) for CHIP cluster, Default: 0.35
+#' @param max_control_af_chip Maximum control variant allele frequency (VAF) for CHIP cluster, Default: 0.40
 #' @param max_tumor_af_chip Maximum tumor variant allele frequency (VAF) for CHIP cluster, Default: 0.25
 #' @param find_chip Find CHIP clusters, Default: TRUE
 #' @param verbose Print the potential clusters, Default: FALSE
@@ -24,7 +24,7 @@
 #' @examples
 #' data(hg19_length)
 #' vcf_like_df = TiNDA::generate_test_data(hg19_length)
-#' tinda_test_object <- TiNDA(vcf_like_df, sample_name = "sample_3", data_type = "WGS")
+#' tinda_test_object <- TiNDA(vcf_like_df, sample_name = "sample_3", data_source = "WGS")
 #' 
 #' @importFrom stats median quantile rnorm
 #' @importFrom methods is
@@ -35,7 +35,7 @@
 #' @import purrr
 #' 
 #' @export
-TiNDA <- function(tbl, 
+TiNDA <- function(tbl,
                   sample_name = "pid_1",
                   data_source = "WGS",
                   max_control_af = 0.25,
@@ -47,59 +47,78 @@ TiNDA <- function(tbl,
                   num_run = 1,
                   find_chip = TRUE,
                   verbose = FALSE, ...) {
-
-  if(data_source == "WGS") {
-    mu.init <- cbind(c(0.5, 0.95, 0.50, 0.50, 0.02, 0.02, 0.02, 0.25, 0.15, 0.35), 
-                     c(0.5, 0.95, 0.15, 0.85, 0.20, 0.50, 0.85, 0.02, 0.02, 0.02))
-    numberCluster <- 10
-  } else if(data_source == "WES") {
-    mu.init <- cbind(c(0.5, 0.95, 0.50, 0.50, 0.02, 0.02, 0.02, 0.25, 0.15, 0.35), 
-                     c(0.5, 0.95, 0.15, 0.85, 0.20, 0.50, 0.85, 0.02, 0.02, 0.02))
-    numberCluster <- 10
+
+  # Input validation
+  if (!is.data.frame(tbl)) {
+    stop("tbl must be a data frame")
   }
-
-  # Test tbl 
+
+  if (!data_source %in% c("WGS", "WES")) {
+    stop("data_source must be either 'WGS' or 'WES'")
+  }
+
   expected_col_names <- c('CHR', 'POS', 'Control_ALT_DP', 'Control_DP',
                           'Tumor_ALT_DP', 'Tumor_DP')
-  if (!purrr::is_empty(setdiff(expected_col_names, colnames(tbl)))){
-    cat("Expected column names didn't appear in the input table\n")
-    cat("Expected:", expected_col_names, "\n")
-    stop()
+  missing_cols <- setdiff(expected_col_names, colnames(tbl))
+  if (length(missing_cols) > 0) {
+    stop("Missing required columns: ", paste(missing_cols, collapse = ", "))
   }
-
-
-  cat("Found ", dim(tbl)[1], " variants from the input data\n")
-  new_tbl <- tbl %>% 
+
+  if (nrow(tbl) == 0) {
+    stop("Input data frame is empty")
+  }
+
+  if (any(tbl$Control_DP == 0)) {
+    warning("Found variants with zero control depth - these will be removed")
+  }
+  if (any(tbl$Tumor_DP == 0)) {
+    warning("Found variants with zero tumor depth - these will be removed")
+  }
+
+  # Set clustering parameters based on data source
+  mu.init <- cbind(c(0.5, 0.95, 0.50, 0.50, 0.02, 0.02, 0.02, 0.25, 0.15, 0.35),
+                   c(0.5, 0.95, 0.15, 0.85, 0.20, 0.50, 0.85, 0.02, 0.02, 0.02))
+  numberCluster <- 10
+
+  cat("Found ", nrow(tbl), " variants from the input data\n")
+
+  # Filter invalid depth values
+  new_tbl <- tbl %>%
     filter(.data$Control_ALT_DP < .data$Control_DP,
-           .data$Tumor_ALT_DP < .data$Tumor_DP) -> new_tbl
-
-  if(dim(tbl)[1] == dim(new_tbl)[1]){
-    rm(new_tbl)
-  } else {
-    num_removed <- dim(tbl)[1] - dim(new_tbl)[1]
-    cat("Removed ", num_removed)
-    cat(" variants with ALT read depth more than total read depth\n")
-    tbl = new_tbl
+           .data$Tumor_ALT_DP < .data$Tumor_DP)
+
+  if (nrow(tbl) != nrow(new_tbl)) {
+    num_removed <- nrow(tbl) - nrow(new_tbl)
+    cat("Removed ", num_removed,
+        " variants with ALT read depth more than total read depth\n")
+    tbl <- new_tbl
   }
-  # Variant AF
-  tbl %>% 
-    mutate(Control_AF = .data$Control_ALT_DP/ .data$Control_DP,
-           Tumor_AF = .data$Tumor_ALT_DP/ .data$Tumor_DP) -> tbl
-
-  # Running Canopy -------------------------------------------------------------
-  R <-as.matrix(tbl[,c('Control_ALT_DP', 'Tumor_ALT_DP')])
-  X <-as.matrix(tbl[,c('Control_DP', 'Tumor_DP')])
-
-  # Canopy run and assigning centers
-  try(
-      canopy.clust <- canopy.cluster(R, X, 
-                                     num_cluster = numberCluster, 
-                                     num_run = num_run, Mu.init = mu.init)
-    )
-  if(is.null(canopy.clust)){
-    stop("Failed canopy run\n", call. = FALSE)
+
+  # Calculate variant allele frequencies
+  tbl <- tbl %>%
+    mutate(Control_AF = .data$Control_ALT_DP / .data$Control_DP,
+           Tumor_AF = .data$Tumor_ALT_DP / .data$Tumor_DP)
+
+  # Running Canopy
+  R <- as.matrix(tbl[, c('Control_ALT_DP', 'Tumor_ALT_DP')])
+  X <- as.matrix(tbl[, c('Control_DP', 'Tumor_DP')])
+
+  # Canopy run with proper error handling
+  canopy.clust <- tryCatch(
+    canopy.cluster(R, X,
+                   num_cluster = numberCluster,
+                   num_run = num_run,
+                   Mu.init = mu.init),
+    error = function(e) {
+      stop("Canopy clustering failed: ", e$message, call. = FALSE)
+    }
+  )
+
+  if (is.null(canopy.clust$sna_cluster)) {
+    stop("Canopy clustering failed: no clusters were assigned\n", call. = FALSE)
   }
-  tbl$canopyCluster<-canopy.clust$sna_cluster
+
+  tbl$canopyCluster <- canopy.clust$sna_cluster
 
   # Select the potential TiN clusters ------------------------------------------
   potential_somatic_clst_per <- tbl %>%  
@@ -225,13 +244,32 @@ TiNDA <- function(tbl,
     ) -> tbl
 }
 
-  tinda_object <- list(data=tbl, 
-                       min_tumor_af = min_tumor_af,
-                       max_control_af = max_control_af,
-                       number_cluster = numberCluster,
-                       sample_name = sample_name)
-
-  class(tinda_object) <-'TiNDA'
-
+  # Create classification summary
+  class_summary <- table(tbl$TiN_Class)
+
+  tinda_object <- list(
+    data = tbl,
+    sample_name = sample_name,
+    data_source = data_source,
+    max_control_af = max_control_af,
+    min_tumor_af = min_tumor_af,
+    number_cluster = numberCluster,
+    min_clst_members = min_clst_members,
+    find_chip = find_chip,
+    classification_summary = class_summary,
+    parameters = list(
+      max_control_af = max_control_af,
+      min_tumor_af = min_tumor_af,
+      min_clst_members = min_clst_members,
+      min_control_af_chip = min_control_af_chip,
+      max_control_af_chip = max_control_af_chip,
+      max_tumor_af_chip = max_tumor_af_chip,
+      num_run = num_run,
+      find_chip = find_chip
+    )
+  )
+
+  class(tinda_object) <- 'TiNDA'
+
   return(tinda_object)
 }