Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ you would like to add:
| single_cell/density_plotter.R | When `source()`'d, defines an R function that plots density of clusters across the umap space | Dan | Yes |
| single_cell/annotation_import.R | When `source()`'d, defines an R function for pulling annotations into Seurat or SCE objects from a csv. An [example 'annots_file'](single_cell/annotation_import_example.csv) and [txt version of the function documentation](single_cell/annotation_import.txt) is also included. | Dan | Yes |
| single_cell/CITEseq/subcluster/function.R | When `source()`'d, defines an R function for subclustering Seurat CITEseq data. A script.R is also included to provide example usage. | Dan | No |
| single_cell/RNAseq/process/function.R | When `source()`'d, defines an R function for (re-)processing Seurat RNAseq data. A script.R is also included to provide example usage. | Dan | No |
| count_cores | a command line executable that allows a user to 1) self-monitor their active cores on `krummellab` nodes (default) or 2) use optional flags to query all DSCoLab active jobs to test for core monopoly | Rebecca | Yes |
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# - Aside from "S.Score" and "G2M.Score" which will be (re-)created internally via CellCycleScoring on the RNA assay, any metadata intended to be regressed out in scaling steps and given to 'rna_vars_to_regress' and/or 'adt_vars_to_regress' should already exist in the object.
# - When using Seurat's rpca-based integration approach for ADT batch correction, a metadata holding library identities must exist and should be a factor with levels ordered by processing/sequencing batch. (See 'integration_references' input details for why.)
# 2. Runs DietSeurat to clean old dimensionality reductions and possible SCT assays
# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca, and harmony batch correction on the RNA assay
# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca
# 4. Uses RunHarmony to batch correct the RNA-side.
# 5a. If using Seurat's rpca-based integration for ADT batch correction: Runs scaling on PCA in a per-library fashion and uses Seurat's 'IntegrateData' methodology for batch corretion of ADT data, pulling a PCA run on the integrated data as the batch corrected dimensionality reduction assay here.
# 5b. If using harmony for ADT batch correction: Runs scaling, pca, and harmony batch correction.
Expand All @@ -40,6 +40,7 @@
# - Assays:
# - RNA
# - ADT - even if no batch correction is performed on the ADT-side, the 'counts' and 'data' slots of this assay are retained.
# - any additional assays given to 'assays_keep', *with only 'counts' and 'data' slots retained.*
# - integrated.ADT - default / *only when 'adt_batch_correction_method = "rpca-integration"'*. Contains the rpca-integrated ADT data matrices. Except for QC, there's little need to look at this assay. Continue using the standard 'ADT' assay for protein expresssion analysis and visualization
# - Dimensionality reduction names:
# - pca: RNA-side PCA, pre-batch correction
Expand All @@ -48,7 +49,7 @@
# - adt.pca: ADT-side PCA, pre-batch correction, *only when 'adt_batch_correction_method = "harmony"'*
# - adt.harmony: ADT-side harmonized PCA, *only when 'adt_batch_correction_method = "harmony"'*
# - umap: WNN-based umap, *only when 'adt_batch_correction_method' is NOT "none"*
# - umap_rna: WNN-based umap, *only when 'rna_only_umap_and_clustering = TRUE'*
# - umap_rna: RNA-based umap, *only when 'rna_only_umap_and_clustering = TRUE'*
# - Clustering Prefixes:
# - wsnn_: WNN-based clustering, *only when 'adt_batch_correction_method' is NOT "none"*
# - RNA_snn_: RNA-only clustering, *only when 'rna_only_umap_and_clustering = TRUE'*
Expand Down Expand Up @@ -82,6 +83,7 @@ process_normalized_citeseq_data <- function(
adt_pca_dims = 1:20, # Number vector. The ADT PCs to utilize in WNN, and subsequent clustering and UMAP calculations.
clustering_resolutions = seq(0.1, 2.0, 0.1), # Number vector used for the resolution parameter of Seurat::FindClusters() calls.
rna_only_umap_and_clustering = TRUE, # Boolean. Controls whether or not to calculate a umap and clusterings based only on the RNA assay (*This is in addition to WNN-based umap and clustering unless 'adt_batch_correction_method' was set to FALSE).
assays_keep = c("RNA", "ADT"), # String vector giving names of all assays whose raw and normalized counts you wish to retain. Can be a larger set than exists, but if you have data in another assay that you will want to access later, it must be named here or re-added later.
warn_if_no_rds = TRUE # Logical. Controls whether or not a message-level warning will be given from the start if the re-processed object will only be returned as output, and not first saved to a file.
) {

Expand All @@ -103,10 +105,15 @@ process_normalized_citeseq_data <- function(
if (!rna_only_umap_and_clustering && adt_batch_correction_method=="none") {
stop(warn_start, "No reclustering requested. Either set 'rna_only_umap_and_clustering' to TRUE or change 'adt_batch_correction_method' from 'none'.")
}
if (!all(c("RNA", "ADT") %in% assays_keep)) {
warning(warn_start, "Ensuring both 'RNA' and 'ADT' given to 'assays_keep'. Without these assays, this function cannot run.")
assays_keep <- unique(c(assays_keep, "RNA", "ADT"))
}

print_message(log_prefix, "Starting processing")
DefaultAssay(object) <- "RNA"
object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = c("RNA", "ADT"), dimreducs = NULL, graphs = NULL, misc = FALSE)
assays_keep <- assays_keep[assays_keep %in% Assays(object)]
object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = assays_keep, dimreducs = NULL, graphs = NULL, misc = FALSE)

print_message(log_prefix, "RNA: Running Cell Cycle Scoring")
object <- CellCycleScoring(object, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, nbin = 12)
Expand All @@ -115,21 +122,20 @@ process_normalized_citeseq_data <- function(
object <- ScaleData(object, vars.to.regress = rna_vars_to_regress, verbose = FALSE)
object <- RunPCA(object, reduction.name = "pca", verbose = FALSE)
print_message(log_prefix, "RNA: Batch Correcting PCA with Harmony")
object <- RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)
object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)

if (adt_batch_correction_method == "harmony") {
print_message(log_prefix, "ADT: Selecting non-isotypes as \"HVGs\", regressing metrics and scaling, and PCA")
VariableFeatures(object, assay = "ADT") <- adt_features
object <- ScaleData(object, vars.to.regress = adt_vars_to_regress, verbose = FALSE, assay = "ADT")
object <- RunPCA(object, reduction.name = "adt.pca", verbose = FALSE, assay = "ADT")
object <- ScaleData(object, vars.to.regress = adt_vars_to_regress, verbose = FALSE, assay = "ADT", features = adt_features)
object <- RunPCA(object, reduction.name = "adt.pca", verbose = FALSE, assay = "ADT", features = adt_features)
print_message(log_prefix, "ADT: Batch Correcting PCA with Harmony")
object <- RunHarmony(object, group.by.vars = harmony_group_by, reduction = "adt.pca", reduction.save = "adt.harmony", verbose = FALSE)
object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "adt.pca", reduction.save = "adt.harmony", verbose = FALSE)
adt_reduction <- "adt.harmony"
} else if (adt_batch_correction_method == "rpca-integration") {
print_message(log_prefix, "ADT: Selecting non-isotypes, then subsetting to per-library objects")
DefaultAssay(object) <- "ADT"
diet_object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = "ADT", dimreducs = NULL, graphs = NULL, misc = FALSE)
DefaultAssay(object) <- "RNA"
libs <- levels(as.factor(diet_object[[integration_split_by, drop = TRUE]]))
sobjs.list <- lapply(libs, function(lib) {
diet_object[, diet_object[[integration_split_by, drop = TRUE]]==lib]
Expand All @@ -139,8 +145,8 @@ process_normalized_citeseq_data <- function(
cat("\t", libs[x], "\n", sep = "")
x <- sobjs.list[[x]]
VariableFeatures(object, assay = "ADT") <- adt_features
x <- ScaleData(x, verbose = FALSE)
x <- RunPCA(x, verbose = FALSE)
x <- ScaleData(x, verbose = FALSE, assay = "ADT", features = adt_features)
x <- RunPCA(x, verbose = FALSE, assay = "ADT", features = adt_features)
})
names(sobjs.list) <- libs
print_message(log_prefix, "ADT: Finding integration anchors")
Expand All @@ -156,8 +162,8 @@ process_normalized_citeseq_data <- function(
DefaultAssay(adt_int) <- "integrated.ADT"
VariableFeatures(adt_int, assay = "integrated.ADT") <- adt_features
print_message(log_prefix, "ADT: Scaling and PCA of Integrated ADT data")
adt_int <- ScaleData(adt_int, vars.to.regress = adt_vars_to_regress, verbose = FALSE)
adt_int <- RunPCA(adt_int, reduction.name = "int.adt.pca", reduction.key = "integratedADTpca_", verbose = FALSE)
adt_int <- ScaleData(adt_int, vars.to.regress = adt_vars_to_regress, verbose = FALSE, features = adt_features)
adt_int <- RunPCA(adt_int, reduction.name = "int.adt.pca", reduction.key = "integratedADTpca_", verbose = FALSE, features = adt_features)
print_message(log_prefix, "ADT: Pulling integrated ADT assay and PCA to in-progress object")
stopifnot(all(colnames(adt_int) %in% colnames(object)))
stopifnot(all(colnames(object) %in% colnames(adt_int)))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
##### This script provides example usage of the 'process_normalized_citeseq_data' function from the adjacent script.
### This script is intended to be copied elsewhere and edited for direct use
### It generates lots of timestamped log messages to always know what's going on.
### Here I'm subsetting on broad annotations that aren't yet in the object, so will be loading those in, and then subsetting to all the Tcell calls
### In this example, I'm subsetting on broad annotations that aren't yet in the object, so will be loading those in, subsetting to all the Tcell calls, then running the processing function.

# Replace this path with the path to your own data
full_object_rds_file <- "/path/to/full_data.Rds"
source("/path/to/single_cell/CITEseq/subcluster/function.R")
source("/path/to/single_cell/CITEseq/process/function.R")
# I recommend doing a DRYRUN for double-checking proper subseting and reference setup.
# Set this to FALSE to run processing!
DRYRUN <- TRUE
Expand Down Expand Up @@ -57,7 +57,7 @@ ref_libs <- grep("SCG1|SCG5", levels(all_data$orig.ident))
print_message("References for ADT integration:\n\t", paste0(paste0(ref_libs, ": ", levels(all_data$orig.ident)[ref_libs]), collapse = "\n\t"))

if (DRYRUN) {
print_message("DRYRUN=TRUE. Check the above logs for correctness. If all loks good re-run with DRYRUN set to FALSE.")
print_message("DRYRUN=TRUE. Check the above logs for correctness. If all looks good re-run with DRYRUN set to FALSE.")
} else {
print_message("DRYRUN=FALSE. Moving on with to processing the subset data:")
process_normalized_citeseq_data(
Expand Down
115 changes: 115 additions & 0 deletions single_cell/RNAseq/process/function.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
##### This script initializes a function called 'process_normalized_rnaseq_data' which aims to achieve full "pre-processing" / "clustering" / "sub-clustering" of a Seurat object containing RNAseq data
# To use:
# - 'source("path/to/this/file")'
# - Subset / merge / prepare the data you plan to process.
# - Adjust inputs as needed.
# - Invoke the 'process_normalized_rnaseq_data()' function!

### Primary steps of the function:
# 1. Takes in a Seurat object which should contain only the set of cells you wish to cluster or "sub"cluster, then runs pre-processing steps from right after the data normalization stage.
# **Object / Processing Plan assumptions**:
# - Gene expression data in "RNA" assay. (Additional assays can exist, but will be lost unless given to 'assays_keep', or re-added after the function.)
# - Desired batch correction methodology.
# - Aside from "S.Score" and "G2M.Score" which will be (re-)created internally via CellCycleScoring on the RNA assay, any metadata intended to be regressed out in scaling steps and given to 'rna_vars_to_regress' should already exist in the object.
# 2. Runs DietSeurat to clean old dimensionality reductions and possible SCT assays
# 3. Runs CellCycleScoring, highly variable gene selection, scaling, pca
# 4. Uses RunHarmony to batch correct.
# 7. Runs umap and clustering based on the RNA assay only
# 8. Returns this newly pre-processed object.

### R Package Requirements:
# - Seurat v4 or later, originally tested in 4.1.1
# - harmony

### Inputs:
# Documented within the function definition below!

### Outputs:
# Two output methods can be used:
# - The re-processed Seurat object can be saved to a .Rds file by giving a filepath to the "rds_path" input.
# - The re-processed Seurat object can also be output directly as the result of the function call when "output" is set to TRUE, which is the default.
# Components of the output Seurat object:
# - All elements of the input object except for metadata & RNA and ADT assay counts and normalized counts matrices are cleared at the start. Everything else is re-created in the function:
# - Assays:
# - RNA
# - any additional assays given to 'assays_keep', default = ADT, *with only 'counts' and 'data' slots retained.*
# - Dimensionality reduction names:
# - pca: PCA, pre-batch correction
# - harmony: harmonized PCA
# - umap (or alternative name given to 'umap_name'): umap built from harmonized PCA
# - umap_no_harmony(or '{umap_name}_no_harmony'): umap built pre-batch correction PCA
# - Clustering Prefixes:
# - RNA_snn_: RNA-based clustering, built from nearest neighbors determined on harmonized PCA
# - Note that clustering is stored as metadata, and the DietSeurat cleanup step does not affect metadata. Thus, *previous clustering metadata will remain unless overwritten via the prefix above.*

suppressPackageStartupMessages({
library(Seurat)
library(harmony)
})

process_normalized_rnaseq_data <- function(
### Primary Inputs
object, # The Seurat object to target, which should already be subset to your cells of interest
log_prefix = "", # String which will be added to the beginning of any timestamped log lines. E.g.: "T cells: ". Useful when multiple subsets will be processed with the same log file.
rds_path = NULL, # String or NULL. File path naming where to output the processed Seurat as an Rds file. If left NULL, the object will not be written to a file.
output = TRUE, # Boolean. Whether to return the Seurat object, or not, at the end.
### Batch Correction Control
harmony_group_by = "orig.ident", # String (or String vector) naming your batch metadata (and any other metadata) intended to be given to the harmony::RunHarmony()'s 'group.by.vars' input.
### Secondary Inputs
vars_to_regress = c("nCount_RNA", "percent.mt", "S.Score", "G2M.Score"), # Value is passed to Seurat::ScaleData()'s 'vars.to.regress' input in RNA assay re-scaling.
pca_dims = 1:30, # Number vector. The RNA PCs to utilize in WNN (and optional RNAonly) clustering and UMAP calculations.
clustering_resolutions = seq(0.1, 2.0, 0.1), # Number vector used for the resolution parameter of Seurat::FindClusters() calls.
assays_keep = c("RNA", "ADT"), # String vector giving names of all assays whose raw and normalized counts you wish to retain. Can be a larger set than exists, but if you have data in another assay that you will want to access later, it must be named here or re-added later.
umap_name = "umap", # String giving the name to use for the umap dimensionalty reduction.
warn_if_no_rds = TRUE # Logical. Controls whether or not a message-level warning will be given from the start if the re-processed object will only be returned as output, and not first saved to a file.
) {

print_message <- function(...) {
cat("[ ", format(Sys.time()), " ] ", ..., "\n", sep = "")
}

warn_start <- ifelse(log_prefix=="", "", paste0("For prefix '", log_prefix, ": "))
if (is.null(rds_path)) {
if (!output) {
stop(warn_start, "No 'rds_path' given && 'output' set to FALSE. Nothing would be returned!")
}
if (warn_if_no_rds) {
message(warn_start, "No 'rds_path' given. Object will be returned as output but not saved to disk within this function. (Set 'warn_if_no_rds = FALSE' to turn off this message.)")
}
}

print_message(log_prefix, "Starting processing")
DefaultAssay(object) <- "RNA"
assays_keep <- assays_keep[assays_keep %in% Assays(object)]
object <- DietSeurat(object, counts = TRUE, data = TRUE, scale.data = FALSE, assays = assays_keep, dimreducs = NULL, graphs = NULL, misc = FALSE)

print_message(log_prefix, "Running Cell Cycle Scoring")
object <- CellCycleScoring(object, s.features = cc.genes$s.genes, g2m.features = cc.genes$g2m.genes, nbin = 12)
print_message(log_prefix, "Selecting HVGs, regressing metrics and scaling, and PCA")
object <- FindVariableFeatures(object, verbose = FALSE)
object <- ScaleData(object, vars.to.regress = vars_to_regress, verbose = FALSE)
object <- RunPCA(object, reduction.name = "pca", verbose = FALSE)
print_message(log_prefix, "Batch Correcting PCA with Harmony")
object <- harmony::RunHarmony(object, group.by.vars = harmony_group_by, reduction = "pca", reduction.save = "harmony", verbose = FALSE)

print_message(log_prefix, "Nearest Neighbors")
object <- FindNeighbors(object, reduction = "harmony", dims = pca_dims, verbose = FALSE)
print_message(log_prefix, "UMAP")
object <- RunUMAP(object, reduction = "harmony", dims = pca_dims, reduction.name = umap_name, reduction.key = "UMAP_", verbose = FALSE)
object <- RunUMAP(object, reduction = "pca", dims = pca_dims, reduction.name = paste0(umap_name, "_no_harmony"), reduction.key = "UMAPnoHarmony_", verbose = FALSE)
print_message(log_prefix, "Clustering")
object <- FindClusters(object, algorithm = 2, resolution = clustering_resolutions, graph.name = "RNA_snn", verbose = FALSE)

print_message(log_prefix, "Object Processing Complete!")

if (!is.null(rds_path)) {
print_message(log_prefix, "Saving to '", rds_path, "' ...")
saveRDS(object, file = rds_path)
print_message(log_prefix, "Saved")
}

if (output) {
print_message(log_prefix, "Processed Object:")
object
}
}
Loading