PMCC-BioinformaticsCore · qoiopipq · Oct 22, 2025 · Oct 22, 2025
diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R
@@ -2,57 +2,59 @@
 #'
 #' @description
 #' For a given control group (e.g., DMSO) on a specific plate/batch, this function
-#' ranks samples by their average correlation (Fisher z–averaged) to all *other*
+#' ranks samples by their average correlation (Fisher z-averaged) to all *other*
 #' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally)
-#' plots per-sample expression distributions and sample–sample correlation heatmaps.
+#' plots per-sample expression distributions and sample-sample correlation heatmaps.
 #'
 #' @param data A tidyseurat object containing an RNA assay with a **counts** layer.
 #' @param samples the control/treatment label to keep in column samples
-#'   (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered.
+#'   (e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.
 #' @param orig_ident Character scalar: the plate/batch identifier to keep
-#'   (e.g., `"VH02012942"`). Only cells/samples from this batch are considered.
+#'   (e.g., "VH02012942"). Only cells/samples from this batch are considered.
 #' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to
-#'   normalization (default `1`).
-#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least
-#'   this many samples to be retained (default `16`).
+#'   normalization (default 1).
+#' @param min_samps Integer; a gene must be expressed (CPM > cpm_filter) in at least
+#'   this many samples to be retained (default 16).
 #' @param corr_method Correlation type used for ranking; one of
-#'   `c("spearman","pearson")` (default `"spearman"`).
-#' @param top_n Integer; the number of top-ranked samples to report in `topN`.
-#'   Ties at the cutoff are kept (default `5`).
-#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman
-#'   correlation heatmaps (default `TRUE`).
+#'   c("spearman","pearson") (default "spearman").
+#' @param top_n Integer; the number of top-ranked samples to report in topN.
+#'   Ties at the cutoff are kept (default 5).
+#' @param make_plots Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman
+#'   correlation heatmaps (default TRUE).
 #'
 #' @details
 #' Workflow:
-#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch).  
-#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`.  
-#' 3) Normalize with **TMMwsp** and compute log2-CPM.  
-#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples
-#'    (according to `corr_method`).  
+#' 1) Subset to the specified samples and orig_ident (plate/batch).
+#' 2) Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps.
+#' 3) Normalize with TMMwsp and compute log2-CPM.
+#' 4) Rank samples by mean Fisher z transformed correlation to all other samples
+#'    (according to corr_method).
 #' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally)
 #'    plots for QC.
 #'
-#' Column names of the counts matrix are rewritten to `"<orig.ident>_<Well_ID>"`
+#' Column names of the counts matrix are rewritten to "<orig.ident>_<Well_ID>"
 #' for easier visual inspection in plots.
 #'
 #' @return A list with elements:
-#' \item{subset_obj}{The Seurat object subset used for analysis.}
-#' \item{dge}{The filtered `edgeR::DGEList`.}
-#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).}
-#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).}
-#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.}
-#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.}
-#' \item{ranking_method}{The correlation method used for ranking.}
-#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed
-#'   correlations (higher = better), sorted decreasing.}
-#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).}
-#'
+#' * subset_obj: The Seurat object subset used for analysis.
+#' * dge: The filtered edgeR::DGEList
+#' * log_cpm_tmm: Matrix of TMMwsp log2-CPM.
+#' * boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm).
+#' * cor_pearson: Sample-sample Pearson correlation matrix.
+#' * cor_spearman: Sample-sample Spearman correlation matrix.
+#' * ranking_method: The correlation method used for ranking.
+#' * scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed
+#'   correlations (higher = better), sorted decreasing.
+#' * topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept).
+
+#' 
+#' 
 #' @examples
 #' data(mini_mac)
 #' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini")
 #'
 #' 
-#'
+#' @importFrom rlang .data
 #' @importFrom edgeR DGEList calcNormFactors cpm
 #' @importFrom tibble rownames_to_column
 #' @importFrom tidyr pivot_longer
@@ -72,7 +74,7 @@ select_robust_controls <- function(
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
-
+    
     # check samples and orig_ident columns
     if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) {
       stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.")
@@ -123,7 +125,7 @@ select_robust_controls <- function(
       warning("Package 'ggplot2' not available; skipping boxplot.")
     } else {
       print(
-        ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
+        ggplot2::ggplot(df_long, ggplot2::aes(x = .data$sample, y = .data$log_cpm)) +
           ggplot2::geom_boxplot(outlier.size = 0.5) +
           ggplot2::labs(x = "Sample", y = "log2 CPM",
                         title = "Boxplot of log2-CPM (TMMwsp)") +
@@ -139,8 +141,8 @@ select_robust_controls <- function(
     if (!requireNamespace("pheatmap", quietly = TRUE)) {
       warning("Package 'pheatmap' not available; skipping heatmaps.")
     } else {
-      pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
-      pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
+      pheatmap::pheatmap(cors_pear,  main = "Pearson correlation")
+      pheatmap::pheatmap(cors_spear, main = "Spearman correlation")
     }
   }
   # Ranking by mean Fisher-z correlation to all *other* samples

diff --git a/R/utils.R b/R/utils.R
@@ -1,3 +1,4 @@
 utils::globalVariables(c("diff_expressed", "gene_labels", ".cell",
                          "UMAPde_1", "UMAPde_2",
-                         "padj", "target", "target_id"))
+                         "padj", "target", "target_id",
+                         "log_cpm"))
diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd