UCSF-DSCOLAB · dtm2451 · Feb 14, 2024 · Mar 20, 2024 · Mar 20, 2024 · Mar 20, 2024
diff --git a/README.md b/README.md
@@ -36,4 +36,5 @@ you would like to add:
 | single_cell/density_plotter.R | When `source()`'d, defines an R function that plots density of clusters across the umap space | Dan | Yes |
 | single_cell/annotation_import.R | When `source()`'d, defines an R function for pulling annotations into Seurat or SCE objects from a csv. An [example 'annots_file'](single_cell/annotation_import_example.csv) and [txt version of the function documentation](single_cell/annotation_import.txt) is also included. | Dan | Yes | 
 | single_cell/CITEseq/subcluster/function.R | When `source()`'d, defines an R function for subclustering Seurat CITEseq data. A script.R is also included to provide example usage. | Dan | No |
+| single_cell/RNAseq/process/function.R | When `source()`'d, defines an R function for (re-)processing Seurat RNAseq data. A script.R is also included to provide example usage. | Dan | No |
 | count_cores | a command line executable that allows a user to 1) self-monitor their active cores on `krummellab` nodes (default) or 2) use optional flags to query all DSCoLab active jobs to test for core monopoly | Rebecca | Yes |
diff --git a/single_cell/CITEseq/subcluster/function.R → single_cell/CITEseq/process/function.R b/single_cell/CITEseq/subcluster/function.R → single_cell/CITEseq/process/function.R
@@ -15,7 +15,7 @@
 #      - Aside from "S.Score" and "G2M.Score" which will be (re-)created internally via CellCycleScoring on the RNA assay, any metadata intended to be regressed out in scaling steps and given to 'rna_vars_to_regress' and/or 'adt_vars_to_regress' should already exist in the object.
 #      - When using Seurat's rpca-based integration approach for ADT batch correction, a metadata holding library identities must exist and should be a factor with levels ordered by processing/sequencing batch. (See 'integration_references' input details for why.)
 # 2. Runs DietSeurat to clean old dimensionality reductions and possible SCT assays
-# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca, and harmony batch correction on the RNA assay
+# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca
 # 4. Uses RunHarmony to batch correct the RNA-side.
 # 5a. If using Seurat's rpca-based integration for ADT batch correction: Runs scaling on PCA in a per-library fashion and uses Seurat's 'IntegrateData' methodology for batch corretion of ADT data, pulling a PCA run on the integrated data as the batch corrected dimensionality reduction assay here.
 # 5b. If using harmony for ADT batch correction: Runs scaling, pca, and harmony batch correction.
@@ -40,6 +40,7 @@
 #  - Assays:
 #    - RNA
 #    - ADT - even if no batch correction is performed on the ADT-side, the 'counts' and 'data' slots of this assay are retained.
+#    - any additional assays given to 'assays_keep', *with only 'counts' and 'data' slots retained.*
 #    - integrated.ADT - default / *only when 'adt_batch_correction_method = "rpca-integration"'*. Contains the rpca-integrated ADT data matrices. Except for QC, there's little need to look at this assay. Continue using the standard 'ADT' assay for protein expresssion analysis and visualization 
 #  - Dimensionality reduction names:
 #    - pca: RNA-side PCA, pre-batch correction
@@ -48,7 +49,7 @@
 #    - adt.pca: ADT-side PCA, pre-batch correction, *only when 'adt_batch_correction_method = "harmony"'*
 #    - adt.harmony: ADT-side harmonized PCA, *only when 'adt_batch_correction_method = "harmony"'*
 #    - umap: WNN-based umap, *only when 'adt_batch_correction_method' is NOT "none"*
-#    - umap_rna: WNN-based umap, *only when 'rna_only_umap_and_clustering = TRUE'*
+#    - umap_rna: RNA-based umap, *only when 'rna_only_umap_and_clustering = TRUE'*
 #  - Clustering Prefixes:
 #    - wsnn_: WNN-based clustering, *only when 'adt_batch_correction_method' is NOT "none"*
 #    - RNA_snn_: RNA-only clustering, *only when 'rna_only_umap_and_clustering = TRUE'*
@@ -82,6 +83,7 @@ process_normalized_citeseq_data <- function(
   adt_pca_dims = 1:20, # Number vector. The ADT PCs to utilize in WNN, and subsequent clustering and UMAP calculations.
   clustering_resolutions = seq(0.1, 2.0, 0.1), # Number vector used for the resolution parameter of Seurat::FindClusters() calls.
   rna_only_umap_and_clustering = TRUE, # Boolean. Controls whether or not to calculate a umap and clusterings based only on the RNA assay (*This is in addition to WNN-based umap and clustering unless 'adt_batch_correction_method' was set to FALSE).
+  assays_keep = c("RNA", "ADT"), # String vector giving names of all assays whose raw and normalized counts you wish to retain.  Can be a larger set than exists, but if you have data in another assay that you will want to access later, it must be named here or re-added later.
   warn_if_no_rds = TRUE # Logical. Controls whether or not a message-level warning will be given from the start if the re-processed object will only be returned as output, and not first saved to a file.
   ) {
 
@@ -103,10 +105,15 @@ process_normalized_citeseq_data <- function(
   if (!rna_only_umap_and_clustering && adt_batch_correction_method=="none") {
     stop(warn_start, "No reclustering requested. Either set 'rna_only_umap_and_clustering' to TRUE or change 'adt_batch_correction_method' from 'none'.")
   }
+  if (!all(c("RNA", "ADT") %in% assays_keep)) {
+    warning(warn_start, "Ensuring both 'RNA' and 'ADT' given to 'assays_keep'. Without these assays, this function cannot run.")
+    assays_keep <- unique(c(assays_keep, "RNA", "ADT"))
+  }
 
   print_message(log_prefix, "Starting processing")
   DefaultAssay(object) <- "RNA"
-  object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = c("RNA", "ADT"), dimreducs = NULL, graphs = NULL, misc = FALSE)
+  assays_keep <- assays_keep[assays_keep %in% Assays(object)]
+  object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = assays_keep, dimreducs = NULL, graphs = NULL, misc = FALSE)
 
   print_message(log_prefix, "RNA: Running Cell Cycle Scoring")
   object <- CellCycleScoring(object, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, nbin = 12)
@@ -115,21 +122,20 @@ process_normalized_citeseq_data <- function(
   object <- ScaleData(object, vars.to.regress = rna_vars_to_regress, verbose = FALSE)
   object <- RunPCA(object, reduction.name = "pca", verbose = FALSE)
   print_message(log_prefix, "RNA: Batch Correcting PCA with Harmony")
-  object <- RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)
+  object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)
 
   if (adt_batch_correction_method == "harmony") {
     print_message(log_prefix, "ADT: Selecting non-isotypes as \"HVGs\", regressing metrics and scaling, and PCA")
     VariableFeatures(object, assay = "ADT") <- adt_features
-    object <- ScaleData(object, vars.to.regress = adt_vars_to_regress, verbose = FALSE, assay = "ADT")
-    object <- RunPCA(object, reduction.name = "adt.pca", verbose = FALSE, assay = "ADT")
+    object <- ScaleData(object, vars.to.regress = adt_vars_to_regress, verbose = FALSE, assay = "ADT", features = adt_features)
+    object <- RunPCA(object, reduction.name = "adt.pca", verbose = FALSE, assay = "ADT", features = adt_features)
     print_message(log_prefix, "ADT: Batch Correcting PCA with Harmony")
-    object <- RunHarmony(object, group.by.vars = harmony_group_by, reduction = "adt.pca", reduction.save = "adt.harmony", verbose = FALSE)
+    object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "adt.pca", reduction.save = "adt.harmony", verbose = FALSE)
     adt_reduction <- "adt.harmony"
   } else if (adt_batch_correction_method == "rpca-integration") {
     print_message(log_prefix, "ADT: Selecting non-isotypes, then subsetting to per-library objects")
     DefaultAssay(object) <- "ADT"
     diet_object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = "ADT", dimreducs = NULL, graphs = NULL, misc = FALSE)
-    DefaultAssay(object) <- "RNA"
     libs <- levels(as.factor(diet_object[[integration_split_by, drop = TRUE]]))
     sobjs.list <- lapply(libs, function(lib) {
       diet_object[, diet_object[[integration_split_by, drop = TRUE]]==lib]
@@ -139,8 +145,8 @@ process_normalized_citeseq_data <- function(
       cat("\t", libs[x], "\n", sep = "")
       x <- sobjs.list[[x]]
       VariableFeatures(object, assay = "ADT") <- adt_features
-      x <- ScaleData(x, verbose = FALSE)
-      x <- RunPCA(x, verbose = FALSE)
+      x <- ScaleData(x, verbose = FALSE, assay = "ADT", features = adt_features)
+      x <- RunPCA(x, verbose = FALSE, assay = "ADT", features = adt_features)
     })
     names(sobjs.list) <- libs
     print_message(log_prefix, "ADT: Finding integration anchors")
@@ -156,8 +162,8 @@ process_normalized_citeseq_data <- function(
     DefaultAssay(adt_int) <- "integrated.ADT"
     VariableFeatures(adt_int, assay = "integrated.ADT") <- adt_features
     print_message(log_prefix, "ADT: Scaling and PCA of Integrated ADT data")
-    adt_int <- ScaleData(adt_int, vars.to.regress = adt_vars_to_regress, verbose = FALSE)
-    adt_int <- RunPCA(adt_int, reduction.name = "int.adt.pca", reduction.key = "integratedADTpca_", verbose = FALSE)
+    adt_int <- ScaleData(adt_int, vars.to.regress = adt_vars_to_regress, verbose = FALSE, features = adt_features)
+    adt_int <- RunPCA(adt_int, reduction.name = "int.adt.pca", reduction.key = "integratedADTpca_", verbose = FALSE, features = adt_features)
     print_message(log_prefix, "ADT: Pulling integrated ADT assay and PCA to in-progress object")
     stopifnot(all(colnames(adt_int) %in% colnames(object)))
     stopifnot(all(colnames(object) %in% colnames(adt_int)))

diff --git a/single_cell/CITEseq/subcluster/script.R → single_cell/CITEseq/process/script.R b/single_cell/CITEseq/subcluster/script.R → single_cell/CITEseq/process/script.R
@@ -1,11 +1,11 @@
 ##### This script provides example usage of the 'process_normalized_citeseq_data' function from the adjacent script.
 ### This script is intended to be copied elsewhere and edited for direct use
 ### It generates lots of timestamped log messages to always know what's going on.
-### Here I'm subsetting on broad annotations that aren't yet in the object, so will be loading those in, and then subsetting to all the Tcell calls
+### In this example, I'm subsetting on broad annotations that aren't yet in the object, so will be loading those in, subsetting to all the Tcell calls, then running the processing function.
 
 # Replace this path with the path to your own data
 full_object_rds_file <- "/path/to/full_data.Rds"
-source("/path/to/single_cell/CITEseq/subcluster/function.R")
+source("/path/to/single_cell/CITEseq/process/function.R")
 # I recommend doing a DRYRUN for double-checking proper subseting and reference setup.
 # Set this to FALSE to run processing!
 DRYRUN <- TRUE
@@ -57,7 +57,7 @@ ref_libs <- grep("SCG1|SCG5", levels(all_data$orig.ident))
 print_message("References for ADT integration:\n\t", paste0(paste0(ref_libs, ": ", levels(all_data$orig.ident)[ref_libs]), collapse = "\n\t"))
 
 if (DRYRUN) {
-  print_message("DRYRUN=TRUE. Check the above logs for correctness. If all loks good re-run with DRYRUN set to FALSE.")
+  print_message("DRYRUN=TRUE. Check the above logs for correctness. If all looks good re-run with DRYRUN set to FALSE.")
 } else {
   print_message("DRYRUN=FALSE. Moving on with to processing the subset data:")
   process_normalized_citeseq_data(

diff --git a/single_cell/RNAseq/process/function.R b/single_cell/RNAseq/process/function.R
@@ -0,0 +1,115 @@
+##### This script initializes a function called 'process_normalized_rnaseq_data' which aims to achieve full "pre-processing" / "clustering" / "sub-clustering" of a Seurat object containing RNAseq data
+# To use:
+# - 'source("path/to/this/file")'
+# - Subset / merge / prepare the data you plan to process.
+# - Adjust inputs as needed.
+# - Invoke the 'process_normalized_rnaseq_data()' function!
+
+### Primary steps of the function:
+# 1. Takes in a Seurat object which should contain only the set of cells you wish to cluster or "sub"cluster, then runs pre-processing steps from right after the data normalization stage.
+#    **Object / Processing Plan assumptions**:
+#      - Gene expression data in "RNA" assay. (Additional assays can exist, but will be lost unless given to 'assays_keep', or re-added after the function.)
+#      - Desired batch correction methodology.
+#      - Aside from "S.Score" and "G2M.Score" which will be (re-)created internally via CellCycleScoring on the RNA assay, any metadata intended to be regressed out in scaling steps and given to 'rna_vars_to_regress' should already exist in the object.
+# 2. Runs DietSeurat to clean old dimensionality reductions and possible SCT assays
+# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca
+# 4. Uses RunHarmony to batch correct.
+# 7. Runs umap and clustering based on the RNA assay only
+# 8. Returns this newly pre-processed object.
+
+### R Package Requirements:
+# - Seurat v4 or later, originally tested in 4.1.1
+# - harmony
+
+### Inputs:
+# Documented within the function definition below!
+
+### Outputs:
+# Two output methods can be used:
+#  - The re-processed Seurat object can be saved to a .Rds file by giving a filepath to the "rds_path" input.
+#  - The re-processed Seurat object can also be output directly as the result of the function call when "output" is set to TRUE, which is the default.
+# Components of the output Seurat object:
+#  - All elements of the input object except for metadata & RNA and ADT assay counts and normalized counts matrices are cleared at the start. Everything else is re-created in the function:
+#  - Assays:
+#    - RNA
+#    - any additional assays given to 'assays_keep', default = ADT, *with only 'counts' and 'data' slots retained.*
+#  - Dimensionality reduction names:
+#    - pca: PCA, pre-batch correction
+#    - harmony: harmonized PCA
+#    - umap (or alternative name given to 'umap_name'): umap built from harmonized PCA
+#    - umap_no_harmony(or '{umap_name}_no_harmony'): umap built pre-batch correction PCA
+#  - Clustering Prefixes:
+#    - RNA_snn_: RNA-based clustering, built from nearest neighbors determined on harmonized PCA
+#    - Note that clustering is stored as metadata, and the DietSeurat cleanup step does not affect metadata. Thus, *previous clustering metadata will remain unless overwritten via the prefix above.*
+
+suppressPackageStartupMessages({
+  library(Seurat)
+  library(harmony)
+})
+
+process_normalized_rnaseq_data <- function(
+  ### Primary Inputs
+  object, # The Seurat object to target, which should already be subset to your cells of interest
+  log_prefix = "", # String which will be added to the beginning of any timestamped log lines. E.g.: "T cells: ". Useful when multiple subsets will be processed with the same log file.
+  rds_path = NULL, # String or NULL. File path naming where to output the processed Seurat as an Rds file.  If left NULL, the object will not be written to a file.
+  output = TRUE, # Boolean. Whether to return the Seurat object, or not, at the end.
+  ### Batch Correction Control
+  harmony_group_by = "orig.ident", # String (or String vector) naming your batch metadata (and any other metadata) intended to be given to the harmony::RunHarmony()'s 'group.by.vars' input.
+  ### Secondary Inputs
+  vars_to_regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score"), # Value is passed to Seurat::ScaleData()'s 'vars.to.regress' input in RNA assay re-scaling.
+  pca_dims = 1:30, # Number vector. The RNA PCs to utilize in WNN (and optional RNAonly) clustering and UMAP calculations.
+  clustering_resolutions = seq(0.1, 2.0, 0.1), # Number vector used for the resolution parameter of Seurat::FindClusters() calls.
+  assays_keep = c("RNA", "ADT"), # String vector giving names of all assays whose raw and normalized counts you wish to retain.  Can be a larger set than exists, but if you have data in another assay that you will want to access later, it must be named here or re-added later.
+  umap_name = "umap", # String giving the name to use for the umap dimensionalty reduction.
+  warn_if_no_rds = TRUE # Logical. Controls whether or not a message-level warning will be given from the start if the re-processed object will only be returned as output, and not first saved to a file.
+  ) {
+
+  print_message <- function(...) {
+    cat("[ ", format(Sys.time()), " ] ", ..., "\n", sep = "")
+  }
+
+  warn_start <- ifelse(log_prefix=="", "", paste0("For prefix '", log_prefix, ": "))
+  if (is.null(rds_path)) {
+    if (!output) {
+      stop(warn_start, "No 'rds_path' given && 'output' set to FALSE. Nothing would be returned!")
+    }
+    if (warn_if_no_rds) {
+      message(warn_start, "No 'rds_path' given. Object will be returned as output but not saved to disk within this function. (Set 'warn_if_no_rds = FALSE' to turn off this message.)")
+    }
+  }
+
+  print_message(log_prefix, "Starting processing")
+  DefaultAssay(object) <- "RNA"
+  assays_keep <- assays_keep[assays_keep %in% Assays(object)]
+  object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = assays_keep, dimreducs = NULL, graphs = NULL, misc = FALSE)
+
+  print_message(log_prefix, "Running Cell Cycle Scoring")
+  object <- CellCycleScoring(object, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, nbin = 12)
+  print_message(log_prefix, "Selecting HVGs, regressing metrics and scaling, and PCA")
+  object <- FindVariableFeatures(object, verbose = FALSE)
+  object <- ScaleData(object, vars.to.regress = vars_to_regress, verbose = FALSE)
+  object <- RunPCA(object, reduction.name = "pca", verbose = FALSE)
+  print_message(log_prefix, "Batch Correcting PCA with Harmony")
+  object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)
+
+  print_message(log_prefix, "Nearest Neighbors")
+  object <- FindNeighbors(object, reduction = "harmony", dims = pca_dims, verbose = FALSE)
+  print_message(log_prefix, "UMAP")
+  object <- RunUMAP(object, reduction = "harmony", dims = pca_dims, reduction.name = umap_name, reduction.key = "UMAP_", verbose = FALSE)
+  object <- RunUMAP(object, reduction = "pca", dims = pca_dims, reduction.name = paste0(umap_name, "_no_harmony"), reduction.key = "UMAPnoHarmony_", verbose = FALSE)
+  print_message(log_prefix, "Clustering")
+  object <- FindClusters(object, algorithm = 2, resolution = clustering_resolutions, graph.name = "RNA_snn", verbose = FALSE)
+
+  print_message(log_prefix, "Object Processing Complete!")
+
+  if (!is.null(rds_path)) {
+    print_message(log_prefix, "Saving to '", rds_path, "' ...")
+    saveRDS(object, file = rds_path)
+    print_message(log_prefix, "Saved")
+  }
+
+  if (output) {
+    print_message(log_prefix, "Processed Object:")
+    object
+  }
+}