PMCC-BioinformaticsCore · qoiopipq · Oct 9, 2025 · Oct 1, 2025 · Oct 6, 2025 · Oct 7, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -75,7 +75,8 @@ Suggests:
     scuttle,
     methods,
     SeuratObject,
-    SummarizedExperiment
+    SummarizedExperiment,
+    seqgendiff
 Config/testthat/edition: 3
 Imports:
     dplyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(aggregate_by_de)
+export(check_zeroinflation)
 export(compute_chem_descriptors)
 export(compute_de_umap)
 export(compute_hyper_enrich_bg)
@@ -34,6 +35,7 @@ export(plot_qc_metrics_heatmap)
 export(plot_rle)
 export(plot_volcano)
 export(read_metadata)
+export(subsample_genes)
 export(summarise_de)
 export(validate_metadata)
 import(DESeq2)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
@@ -0,0 +1,179 @@
+#' Quick group-aware zero-inflation check (Negative Binomial baseline via edgeR)
+#'
+#' @description
+#' Computes a **sample group-aware Zero-Inflation (ZI) index** for each gene using a
+#' negative-binomial (NB) baseline fitted with **edgeR**. For each group
+#' (e.g., drug condition), the function:
+#' 1) estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
+#' 2) builds NB-expected zero probabilities from TMMwsp-scaled means, and
+#' 3) returns per-gene ZI (observed zeros minus NB-expected zeros) and
+#'    per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. 
+#'
+#' This is intended as a **fast screening diagnostic** to decide whether
+#' standard NB GLM methods (edgeR/DESeq2) are adequate or whether a
+#' zero-aware workflow (e.g., ZINB-WaVE) might be warranted.
+#'
+#' This function **relies on edgeR** to estimate dispersion. The current
+#' implementation requires **≥2 groups** in the design so that edgeR can
+#' stabilize gene-wise dispersions across groups. If you only have a single
+#' group and still want a design-aware baseline for expected zeros, fit a
+#' Gamma–Poisson/NB GLM and compute the
+#' expected zero probabilities from its fitted means and over-dispersion.
+#'
+#' @param data Seurat object.
+#' @param group_by Character, column in `data@meta.data` that defines groups
+#'   (default: `"combined_id"`).
+#' @param samples Character vector of group labels/patterns to include. If
+#'   `NULL` or if none match, all groups in `group_by` are used.
+#' @param batch Optional batch indicator; if length 1, an intercept-free design
+#'   is used with group dummies.
+#' @param cutoffs Numeric vector of user-supply ZI thresholds for summary statistics
+#'
+#' @returns A list with:
+#' * `gene_metrics_by_group`: long data frame (group × gene) with `p0_obs`,
+#'   `p0_nb`, `ZI`, and counts.
+#' * `summary_by_group`: one row per group with medians and % ZI thresholds,
+#'   plus observed/expected zero **counts** for the group.
+#'
+#' @note
+#' - This is a **screening** tool; it is not a replacement for fitting a full
+#'   GLM with your actual design. If strong covariates exist, a GLM baseline
+#'   (e.g., `glmGamPoi::glm_gp`) will yield more faithful expected-zero rates.
+#' - For single-group experiments, consider either adding a reference group or
+#'   switching to a GLM-based baseline that does not require multiple groups.
+#' @export
+#' @examples
+#' data(mini_mac)
+#' check_zeroinflation(mini_mac, group_by = "combined_id",
+#'                      samples = c("DMSO_0","Staurosporine_10"))
+
+
+check_zeroinflation <- function(data = NULL,
+                                     group_by = NULL,
+                                     samples = NULL,
+                                     batch = 1,
+                                     cutoffs = c(0.1, 0.20)
+){
+  validate_inputs <- function(data, group_by, samples, cutoffs) {
+    if (!inherits(data, "Seurat")) {
+      stop("argument 'data' must be a Seurat or TidySeurat object.")
+    }
+    group_by <- if (is.null(group_by)) "combined_id" else group_by
+
+    # check samples in combined_id column
+    meta_groups <- as.character(data@meta.data[[group_by]])
+    matched_groups <-  samples %in% meta_groups
+    if (is.null(samples)){
+      # all samples included
+      samples <- unique(data@meta.data[[group_by]])
+      cat("All samples will be included in the combined_id column.")
+    } else if  (length(samples) == 1 || length(which(matched_groups==TRUE)) < 2)  {
+      # need at least two groups for edgeR dispersion estimation
+      stop("Two treatment groups are needed to calculate dispersion using edgeR.")
+    } 
+    # check cutoffs
+    if (any(cutoffs <= 0) || any(cutoffs >= 1)) {
+      stop("cutoffs must be between 0 and 1.")
+    }
+    return(list(data = data, group_by = group_by, samples = samples, cutoffs = cutoffs))
+  }
+  validated <- validate_inputs(data, group_by, samples,cutoffs)
+  data <- validated$data
+  group_by <- validated$group_by
+  samples <- validated$samples
+  cutoffs <- validated$cutoffs
+  mac_data <- subset(data, subset = combined_id %in% samples)
+  count_matrix <- GetAssayData(mac_data, assay = "RNA", layer = "counts")
+  count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE)
+  obs_zero <- Matrix::rowMeans(count_matrix == 0)
+  # Negative binomial expected zeros
+  # using edgeR for dispersion estimation
+  dge <- edgeR::DGEList(counts = count_matrix)
+  dge <- edgeR::calcNormFactors(dge, method = "TMMwsp")
+  #design matrix
+  combined_id <- mac_data$combined_id
+  #make up batch parameter
+  model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
+    model.matrix(~0 + combined_id + batch)
+  # tagwise dispersion
+  dge <- edgeR::estimateDisp(dge, design = model_matrix)  
+  phi <- dge$tagwise.dispersion  # NB variance: mu + phi * mu^2  (phi >= 0)
+  # Build per-sample NB mean mu_gj using TMMwsp-scaled library sizes
+  # Effective library sizes
+  eff_lib <- dge$samples$lib.size * dge$samples$norm.factors
+  per_group_gene_metrics <- lapply(samples, function(g){
+    idx <- which(combined_id == g)
+    n_wells <- length(idx)
+    # sub count matrix for group g
+    count_matrix_g <- count_matrix[, idx, drop=FALSE]
+    # Observed zeros within group g
+    p0_obs_g <- Matrix::rowMeans(count_matrix_g==0)
+    # count zeros per gene within group g
+    # sum later for summary
+    obs_zero_num_g <- Matrix::rowSums(count_matrix_g==0)
+    # Group-specific q_{g,g} using only wells in group g
+    eff_lib_g <- eff_lib[idx]
+    total_eff_lib_g <- sum(eff_lib_g)
+    total_counts_per_gene_g <- Matrix::rowSums(count_matrix_g)
+    q_g_g <- as.numeric(total_counts_per_gene_g) / total_eff_lib_g
+    # NB-expected zeros within group g (average over wells in g)
+    eps <- 1e-12
+    phi_safe <- pmax(phi, eps)
+    inv_phi <- 1 / phi_safe
+    # Fast loop over wells in g, no GxJ materialization
+    p0_nb_sum_g <- numeric(nrow(count_matrix))
+    for (j in seq_along(idx)) {
+      Lj <- eff_lib_g[j]
+      mu_gj <- q_g_g * Lj
+      p0_nb_sum_g <- p0_nb_sum_g + (1 + phi_safe * mu_gj)^(-inv_phi)
+    }
+    p0_nb_g <- p0_nb_sum_g / length(idx)
+    # Poisson fallback where phi ~ 0
+    poi_idx <- which(phi < 1e-8)
+    if (length(poi_idx)) {
+      mu_bar_g <- q_g_g * mean(eff_lib_g)
+      p0_nb_g[poi_idx] <- exp(-mu_bar_g[poi_idx])
+    }
+    # ZI within group g
+    zi_g <- p0_obs_g - p0_nb_g
+    data.frame(
+      group = g,
+      gene  = rownames(count_matrix),
+      mean_count_group = total_counts_per_gene_g / length(idx),
+      dispersion = phi,
+      p0_obs = p0_obs_g,
+      obs_zeros_num = obs_zero_num_g,
+      p0_nb  = p0_nb_g,
+      expected_zeros_num = p0_nb_g*n_wells,
+      ZI     = zi_g,
+      stringsAsFactors = FALSE
+    )
+  })
+  gene_metrics_by_group <- do.call(rbind, per_group_gene_metrics)
+  #if there are more than one cutoffs, calculate pct_ZI_gt_ for each cutoff
+  # Per-group summaries (one row per group)
+  summary_by_group <- do.call(rbind, lapply(split(gene_metrics_by_group, gene_metrics_by_group$group), function(df){
+    list_a <- list(
+      group            = unique(df$group),
+      n_genes          = nrow(df),
+      n_wells          = sum(combined_id == unique(df$group)),
+      median_p0_obs    = median(df$p0_obs),
+      median_p0_nb     = median(df$p0_nb),
+      median_ZI        = median(df$ZI),
+      observed_zeros_num       = sum(df$obs_zeros_num),
+      expected_zeros_num  = sum(df$expected_zeros_num) 
+    )
+    list_b <- lapply(cutoffs, function(cutoff){
+      pct_name <- paste0("pct_ZI_gt_", cutoff)
+      pct_value <- mean(df$ZI > cutoff)
+      setNames(list(pct_value), pct_name)
+    })
+    as.data.frame(c(list_a, list_b))
+  })) 
+  # Return just the selected groups' indices instead of plate-level
+  list(
+    gene_metrics_by_group = gene_metrics_by_group,     # long format: group × gene
+    summary_by_group      = summary_by_group
+  )
+
+}
diff --git a/R/subsample_genes.R b/R/subsample_genes.R
@@ -0,0 +1,45 @@
+#' Subsample genes (fast helper function for zero-inflation checks)
+#' 
+#' @description
+#' Quickly subsample a specified number of **genes** from a Seurat object and
+#' return a smaller Seurat object containing the selected features and all
+#' original wells/samples. This is a lightweight convenience wrapper around
+#' **`seqgendiff::select_counts()`** and is intended for creating a small
+#' working object to run **`check_zeroinflation()`** (or similar
+#' diagnostics) rapidly.
+#' 
+#' @param data   A Seurat object (v4 or v5) with counts in assay "RNA".
+#' @param ngene  Integer. Number of genes to keep (must be <= total genes).
+#' @param gselect Gene-selection strategy as used by
+#'   **`seqgendiff::select_counts()`**. Defaults to `"random"`.
+#' @param seed   Integer random seed for reproducibility.
+#' @return A Seurat object containing the subsampled genes and all original wells/samples.
+#' @export
+#' @examples
+#' data(mini_mac)
+#' subsample_genes(mini_mac, ngene = 50, gselect = "random", seed = 1 )
+#' 
+subsample_genes <- function(data, 
+                            ngene = 100, 
+                            gselect = "random", 
+                            seed = 1){
+  if (utils::packageVersion("Seurat") < "5.0.0"
+  ) {
+    count_matrix <- GetAssayData(data, assay = "RNA", slot = "counts")
+  } else {
+    count_matrix <- GetAssayData(data, assay = "RNA", layer = "counts")
+  }
+  count_matrix <- as.matrix(count_matrix)
+  if (!inherits(data, "Seurat")) {
+    stop("argument 'data' must be a Seurat or TidySeurat object.")
+  }
+  if (ngene > nrow(count_matrix)){
+    stop("ngene should be less than the total number of genes in the dataset.")
+  }
+  set.seed(seed)
+  sub_matrix <- seqgendiff::select_counts(count_matrix, ngene = ngene, gselect = gselect)
+  subsample_genes <- rownames(sub_matrix)
+  data_sub <- data[subsample_genes, ]
+  return(data_sub)
+
+}
diff --git a/_pkgdown.yaml b/_pkgdown.yaml
@@ -31,6 +31,8 @@ navbar:
           href: articles/integration_across_modalities.html
         - text: "Working with Bioconductor classes"
           href: articles/macpie_bioc.html
+        - text: "Chck zero-inflation"
+          href: articles/check_zero_inflation.html
     - text: "Reference"
       href: reference/index.html  # Links to your function reference docs.
   right:

diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd
diff --git a/man/subsample_genes.Rd b/man/subsample_genes.Rd
diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R
@@ -0,0 +1,18 @@
+test_that("chech_zeroinflation works", {
+  # Load test dataset from tests/testthat/testdata
+  testdata <- get_testdata()
+  #only match one group should fail the minimum requirement of two groups
+  expect_error(check_zeroinflation(data = testdata, group_by = "combined_id",
+                                    samples = c("DrugA_10", "DMSO_0"),
+                                    cutoffs = c(0.1, 0.2)))
+  #cutoffs out of range
+  expect_error(check_zeroinflation(data = testdata, group_by = "combined_id",
+                                   samples = c("Luminespib_10", "DMSO_0"),
+                                    cutoffs = 1.2))
+  # expect a list as output
+  res <- check_zeroinflation(data = testdata, group_by = "combined_id",
+                             samples = c("Luminespib_10", "DMSO_0"),
+                             cutoffs = c(0.1, 0.2))
+  expect_type(res, "list")
+}) 
+