PMCC-BioinformaticsCore · qoiopipq · Oct 22, 2025 · Oct 13, 2025 · Oct 20, 2025 · Oct 21, 2025
diff --git a/NAMESPACE b/NAMESPACE
@@ -35,6 +35,7 @@ export(plot_qc_metrics_heatmap)
 export(plot_rle)
 export(plot_volcano)
 export(read_metadata)
+export(select_robust_controls)
 export(subsample_genes)
 export(summarise_de)
 export(validate_metadata)
@@ -66,6 +67,9 @@ importFrom(dplyr,reframe)
 importFrom(dplyr,rename)
 importFrom(dplyr,select)
 importFrom(dplyr,where)
+importFrom(edgeR,DGEList)
+importFrom(edgeR,calcNormFactors)
+importFrom(edgeR,cpm)
 importFrom(ggiraph,geom_point_interactive)
 importFrom(ggiraph,girafe)
 importFrom(ggplot2,aes)
@@ -124,6 +128,7 @@ importFrom(stringr,str_trim)
 importFrom(tibble,column_to_rownames)
 importFrom(tibble,rownames_to_column)
 importFrom(tidyr,drop_na)
+importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
 importFrom(tidyr,unnest)
 importFrom(utils,capture.output)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
@@ -157,9 +157,9 @@ check_zeroinflation <- function(data = NULL,
       group            = unique(df$group),
       n_genes          = nrow(df),
       n_wells          = sum(combined_id == unique(df$group)),
-      median_p0_obs    = median(df$p0_obs),
-      median_p0_nb     = median(df$p0_nb),
-      median_ZI        = median(df$ZI),
+      mean_p0_obs    = mean(df$p0_obs),
+      mean_p0_nb     = mean(df$p0_nb),
+      mean_ZI        = mean(df$ZI),
       observed_zeros_num       = sum(df$obs_zeros_num),
       expected_zeros_num  = sum(df$expected_zeros_num) 
     )

diff --git a/R/compute_single_de.R b/R/compute_single_de.R
@@ -157,7 +157,8 @@ compute_single_de <- function(data = NULL,
     combined_id <- data$combined_id
     dds <- DESeqDataSetFromMatrix(countData = data@assays$RNA$counts,
                                   colData = pheno_data,
-                                  design = ~ condition)
+                                  # design = ~ condition)
+                                  design = if (length(batch) == 1) ~ condition else ~ condition + batch)
     dds <- DESeq(dds)
     res <- results(dds, contrast = c("condition", treatment_samples, control_samples))
     top_table <- as.data.frame(res) %>%

diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R
@@ -0,0 +1,175 @@
+#' Select high-quality control replicates via TMMwsp log-CPM correlation
+#'
+#' @description
+#' For a given control group (e.g., DMSO) on a specific plate/batch, this function
+#' ranks samples by their average correlation (Fisher z–averaged) to all *other*
+#' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally)
+#' plots per-sample expression distributions and sample–sample correlation heatmaps.
+#'
+#' @param data A tidyseurat object containing an RNA assay with a **counts** layer.
+#' @param samples the control/treatment label to keep in column samples
+#'   (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered.
+#' @param orig_ident Character scalar: the plate/batch identifier to keep
+#'   (e.g., `"VH02012942"`). Only cells/samples from this batch are considered.
+#' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to
+#'   normalization (default `1`).
+#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least
+#'   this many samples to be retained (default `16`).
+#' @param corr_method Correlation type used for ranking; one of
+#'   `c("spearman","pearson")` (default `"spearman"`).
+#' @param top_n Integer; the number of top-ranked samples to report in `topN`.
+#'   Ties at the cutoff are kept (default `5`).
+#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman
+#'   correlation heatmaps (default `TRUE`).
+#'
+#' @details
+#' Workflow:
+#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch).  
+#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`.  
+#' 3) Normalize with **TMMwsp** and compute log2-CPM.  
+#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples
+#'    (according to `corr_method`).  
+#' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally)
+#'    plots for QC.
+#'
+#' Column names of the counts matrix are rewritten to `"<orig.ident>_<Well_ID>"`
+#' for easier visual inspection in plots.
+#'
+#' @return A list with elements:
+#' \item{subset_obj}{The Seurat object subset used for analysis.}
+#' \item{dge}{The filtered `edgeR::DGEList`.}
+#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).}
+#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).}
+#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.}
+#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.}
+#' \item{ranking_method}{The correlation method used for ranking.}
+#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed
+#'   correlations (higher = better), sorted decreasing.}
+#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).}
+#'
+#' @examples
+#' data(mini_mac)
+#' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini")
+#'
+#' 
+#'
+#' @importFrom edgeR DGEList calcNormFactors cpm
+#' @importFrom tibble rownames_to_column
+#' @importFrom tidyr pivot_longer
+#' @export
+
+select_robust_controls <- function(
+    data,
+    samples,                 # e.g. "CB_43_EP73_0"
+    orig_ident,                  # e.g. "VH02012942"
+    cpm_filter    = 1,           # CPM threshold for gene filtering
+    min_samps     = 16,          # number of samples a gene must be expressed in
+    corr_method   = c("spearman","pearson"),
+    top_n         = 5,
+    make_plots    = TRUE
+){
+  validate_inputs <- function(data, samples, orig_ident) {
+    if (!inherits(data, "Seurat")) {
+      stop("argument 'data' must be a Seurat or TidySeurat object.")
+    }
+
+    # check samples and orig_ident columns
+    if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) {
+      stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.")
+    }
+    # check samples in samples column
+    if (is.null(samples)){
+      stop("Please provide a value for 'samples'.")
+    } else if (!all(samples %in% unique(data$combined_id))) {
+      stop("Some values in 'samples' are not found in the 'combined_id' column of 'data'.")
+    }
+    # check orig.ident in the orig.ident column
+    if (is.null(orig_ident)){
+      stop("Please provide a value for 'orig_ident'.")
+    } else if (!orig_ident %in% unique(data$orig.ident)) {
+      stop("The value of 'orig_ident' is not found in the 'orig.ident' column of 'data'.")
+    }
+    return(list(data = data, samples = samples, orig_ident = orig_ident))
+  }
+  validated <- validate_inputs(data = data, samples = samples, orig_ident = orig_ident)
+  data <- validated$data
+  group_by <- validated$orig_ident
+  samples <- validated$samples
+
+  corr_method <- match.arg(corr_method)
+  sel_cells <- colnames(data)[data$combined_id == samples &
+                                data$orig.ident  == orig_ident]
+  if (length(sel_cells) == 0L) {
+    stop("No cells/samples match the specified 'combined_id' and 'orig_ident'.")
+  }
+  subgroup <- subset(data, cells = sel_cells)
+  # Counts and human-friendly column names
+  counts_d <- Seurat::GetAssayData(subgroup, assay = "RNA", layer = "counts")
+  well_colnames <- paste0(subgroup$orig.ident, "_", subgroup$Well_ID)
+  names(well_colnames) <- rownames(subgroup@meta.data)
+  colnames(counts_d) <- well_colnames[colnames(counts_d)]
+  # edgeR container + gene filtering
+  y <- edgeR::DGEList(counts_d, group = subgroup$orig.ident)
+  keep <- rowSums(edgeR::cpm(y) > cpm_filter) >= min_samps
+  y <- y[keep, , keep.lib.sizes = FALSE]
+  y <- edgeR::calcNormFactors(y, method = "TMMwsp")
+  log_cpm_tmm <- edgeR::cpm(y, log = TRUE, normalized.lib.sizes = TRUE)
+  # Long data for boxplot 
+  df_long <- as.data.frame(log_cpm_tmm) |>
+    tibble::rownames_to_column(var = "gene") |>
+    tidyr::pivot_longer(-gene, names_to = "sample", values_to = "log_cpm")
+  if (make_plots) {
+    if (!requireNamespace("ggplot2", quietly = TRUE)) {
+      warning("Package 'ggplot2' not available; skipping boxplot.")
+    } else {
+      print(
+        ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
+          ggplot2::geom_boxplot(outlier.size = 0.5) +
+          ggplot2::labs(x = "Sample", y = "log2 CPM",
+                        title = "Boxplot of log2-CPM (TMMwsp)") +
+          ggplot2::theme_classic() +
+          ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))
+      )
+    }
+  }
+  # Correlation matrices
+  cors_pear  <- stats::cor(log_cpm_tmm, method = "pearson")
+  cors_spear <- stats::cor(log_cpm_tmm, method = "spearman")
+  if (make_plots) {
+    if (!requireNamespace("pheatmap", quietly = TRUE)) {
+      warning("Package 'pheatmap' not available; skipping heatmaps.")
+    } else {
+      pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
+      pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
+    }
+  }
+  # Ranking by mean Fisher-z correlation to all *other* samples
+  R <- stats::cor(log_cpm_tmm, method = corr_method, use = "pairwise.complete.obs")
+  diag(R) <- NA_real_
+  # Clip to (-1,1), Fisher z-transform, average, back-transform
+  Z <- atanh(pmin(pmax(R, -0.999999), 0.999999))
+  score_z <- rowMeans(Z, na.rm = TRUE)
+  score_r <- tanh(score_z)
+  # Top-N names and scores (keep ties at the cutoff)
+  ord   <- order(score_r, decreasing = TRUE, na.last = NA)
+  srt   <- score_r[ord]
+  if (length(srt) == 0L) {
+    stop("No samples available after filtering; adjust 'cpm_filter'/'min_samps'.")
+  }
+  k      <- min(top_n, length(srt))
+  cutoff <- srt[k]
+  keepN  <- srt >= cutoff
+  topN   <- srt[keepN]
+  # Return everything useful
+  list(
+    subset_obj              = subgroup,
+    dge                     = y,
+    log_cpm_tmm             = log_cpm_tmm,
+    boxplot_df              = df_long,
+    cor_pearson             = cors_pear,
+    cor_spearman            = cors_spear,
+    ranking_method          = corr_method,
+    scores_mean_to_others   = sort(score_r, decreasing = TRUE),
+    topN                    = topN
+  )
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -31,8 +31,8 @@ navbar:
           href: articles/integration_across_modalities.html
         - text: "Working with Bioconductor classes"
           href: articles/macpie_bioc.html
-        - text: "Chck zero-inflation"
-          href: articles/check_zero_inflation.html
+        - text: "Assessing zero-inflation"
+          href: articles/assessing_zero_inflation.html
     - text: "Reference"
       href: reference/index.html  # Links to your function reference docs.
   right:

diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd
diff --git a/tests/testthat/test-select_robust_controls.R b/tests/testthat/test-select_robust_controls.R
@@ -0,0 +1,7 @@
+test_that("multiplication works", {
+  # Load test dataset from tests/testthat/testdata
+  testdata <- get_testdata()
+  expect_error(select_robust_controls(testdata, samples=NULL, orig_ident="PLATE"))
+  res <- select_robust_controls(testdata, samples="DMSO_0", orig_ident="testdata", make_plots = FALSE)
+  expect_true(is.list(res))
+})