Skip to content
Empty file.
188 changes: 188 additions & 0 deletions single_cell/pseudobulk_function.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
### This script provides a function for pseudobulking single-cell RNAseq data, with a focus towards DGE functions also being planned
#
#' Standardized Pseudobulk Implementations
#' @param object A Seurat object
#' @param sample.by String or string vector naming metadata columns of \code{object} to use for assigning sample identities of cells.
#' A biospecimen, timepoint, or subject name is the common target here, and a batch identifier may be desired as well.
#' @param cell.by String naming the metadata column of \code{object} to use for assigning annotation or cluster identities of cells.
#' @param min.cells Number, 10 by default, which sets the minimum number of cells that a pseudobulk should represent in order to be retained.
#' @param cell.targets Optionally, a string vector naming particular \code{cell.by} values to target. Only these cell identities will be retained and pseudobulked.
#' @param features Optionally, a string vector naming a subset of genes to target. Only counts of these genes will be retained in pseudobulking.
#' @param assay String naming the assay of \code{object} to target. Default is "RNA".
#' @param meta.targets Optionally, a string vector naming particular metadata columns of \code{object} to target for retention. By default, as much a possible will be retained.
#' @param meta.num.summary.method Function like \code{\link[base]{mean}} or \code{\link[base]{median}} (with an \code{na.rm = TRUE} option) describing how to summarize numeric metadata from all cells of each pseudobulk
#' @param method "Seurat", "scuttle", or "dreamlet". Determines which pseudobulking tool to rely on. \itemize{
#' \item "Seurat" = \code{\link[Seurat]{AggregateExpression}}
#' \item Others planned but not yet implemented
#' }
#' @param output.style "Seurat" or "raw". Determines how data should be returned. Either as a Seurat object, or as a raw list of the 'counts' (matrix) and 'metadata' (data.frame)
#' @param output.metadata.cell.count String denoting the metadata column name added to hold the number of original cells going into each pseudobulk
#' @param verbose Logical which controls whether timestamped log messages should be shown
#' @return A Seurat object or, if \code{output.style = 'raw'} a named list with elements 'counts' and 'metadata'.
#' @details
#' The primary role of this function is standardizing the pseudobulking process.
#'
#' \emph{It is under construction and subject to change!}
#'
#' Currently, a Seurat object is expected and pseudobulking is performed per the groupings given by \code{cell.by} and any number of \code{sample.by} metadata.
#' Pseudobulk counts reflect the sum of counts across all cells that they represent.
#'
#' For the \code{method = "Seurat"} (the only method currently implemented) this is done using \code{Seurat::\link[Seurat]{AggregateExpression}} function with parameters:
#' \code{return.seurat = TRUE, group.by = c(sample.by, cell.by), features = features, assays = assay}.
#'
#' The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default.
#' Use \code{output.metadata.cell.count} to use a different column name.
#' \emph{Pseudobulks representing fewer than \code{min.cells} cells are removed.}
#'
#' \code{meta.targets}, or all metadata or \code{object} are then extracted for each pseudobulk.
#' For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks.
#' For numeric metadata, values from each cell in the pseudobulks are summarized by the \code{meta.num.summary.method} function (\code{\link[base]{median}} by default).
#'
#' Finally, the output structure is determined by the \code{output.style} input.
#'
#' Prior to pseudobulking, particular cell identities or genes can be targeted using the \code{cell.targets} and \code{features} inputs, respectively.
#'
#' @author Daniel Bunis
#' @examples
#' library(Seurat)
#' dsco_pseudobulk(
#' object = pbmc_small,
#' sample.by = "groups",
#' cell.by = "RNA_snn_res.0.8"
#' )
#'
dsco_pseudobulk <- function(
object, sample.by, cell.by,
min.cells = 10,
cell.targets = NULL, features = NULL,
meta.targets = NULL,
assay = "RNA",
method = c('Seurat', 'scuttle', 'dreamlet'),
meta.num.summary.method = median,
output.style = c('Seurat', 'raw'),
output.metadata.cell.count = 'cells_in_pseudobulk',
verbose = TRUE
) {
method <- match.arg(method)
output.style <- match.arg(output.style)

orig_metas <- colnames(object@meta.data)

# Ensure ts_log exists, or source it.
if (verbose) {
if (!exists('ts_log')) {
stop("'ts_log' function required when 'verbose = TRUE'. Set 'verbose = FALSE' or run 'source('<path_to_this_essential_scripts_repo>/R_utils/ts_log.R', chdir = TRUE, local = TRUE)' and then try again.")
}
msg_if <- ts_log
} else {
msg_if <- function(...) {}
}

# Ensure all of cell.by and sample.by exist as metadata
if (!all(c(cell.by, sample.by) %in% orig_metas)) {
stop("A 'cell.by' or 'sample.by' element does not exist as a column the 'object' metadata.")
}
# Ensure assay exists
if (!assay %in% Seurat::Assays(object)) {
stop("'assay' is not a valid assay of the 'object'.")
}

# Trim to 'cell.targets'
if (!is.null(cell.targets)) {
object <- object[,object@meta.data[,cell.by] %in% cell.targets]
retained_cell_num <- ncol(object)
retained_cell_idents <- unique(object@meta.data[,cell.by])
msg_if(
"Trimming to 'cell.targets' retained ", retained_cell_num,
" cells of identities:\n", paste0(retained_cell_idents, collapse=", ")
)
}

### Pseudobulk
# (Trim to features internally)
if (method == 'Seurat') {
msg_if("Initiating pseudobulking with Seurat's AggregateExpression...")
group.metas <- c(sample.by, cell.by)
psobject <- Seurat::AggregateExpression(
object = object, return.seurat = TRUE, group.by = group.metas,
features = features,
assays = assay)

### Add cell counts and Trim too small pseudobulks
msg_if("Adding cell counts as metadata '", output.metadata.cell.count, "'.")
psobject@meta.data[,output.metadata.cell.count] <- vapply(
seq_len(ncol(psobject)),
function(i) {
# Using unlist for ignorance of rownames (cell vs pseudobulk won't match!)
targ <- unlist(psobject@meta.data[i,group.metas])
sum(
apply(object@meta.data[,group.metas], 1, function(x) {
identical(unlist(x), targ)
})
)
},
numeric(1)
)
too_small <- psobject@meta.data[,output.metadata.cell.count] < min.cells
if (too_small == ncol(psobject)) {
warning(paste0("Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than ", min_cells, " cells."))
} else if (too_small > 0) {
msg_if("\tTrimming ", too_small, " pseudobulks built from fewer than ", min_cells, " cells.")
psobject <- psobject[,psobject@meta.data[,output.metadata.cell.count] >= min.cells]
}
Comment on lines +126 to +132
Copy link
Collaborator Author

@dtm2451 dtm2451 Jan 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be worthwhile to include a pre-pseudobulk filter -- e.g. only pseudobulk a sample/cell type pair if there are at least X cells of that cell type in that sample

This is included already, here for the R function! It runs after the pseudobulking currently, but could move it to before instead if there's good reason.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh awesome! apologies, I should have looked more carefully. I just realized it is not in the dreamlet pseudobulk function, but then is implemented in the processAssays, so was kind of making a note to myself

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do think the downstream DEG filter to at least min.samples per category though is also useful

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed! got pulled away before posting that half =)


### Ensure meta.targets, or as many metadata as possible, are retained.
meta_ignored <- c()
meta_kept <- colnames(psobject@meta.data)
if (is.null(meta.targets)) {
meta.targets <- orig_metas
}
meta.targets <- meta.targets[!meta.targets %in% meta_kept]
msg_if("Grabbing additional metadata '", paste0(meta.targets, collapse = "', '"), "'.")
meta_collapse <- function(df_col, name) {
if (is.numeric(df_col)) {
meta.num.summary.method(df_col, na.rm = TRUE)
} else {
if (name %in% meta_ignored) {
NA
} else {
if (length(unique(df_col))>1) {
msg_if("\tignoring discrete metadata column '", name, "' because it is not consistent within all pseudobulks")
assign('meta_ignored', c(meta_ignored, name), inherits = TRUE)
NA
} else {
df_col[1]
}
}
}
}
for (i in seq_len(ncol(psobject))) {
targ <- unlist(psobject@meta.data[i,group.metas])
matches <- apply(object@meta.data[,group.metas], 1, function(x) {
identical(unlist(x), targ)
})
for (meta in meta.targets) {
if (i == 1) {
psobject@meta.data[,meta] <- NA
}
psobject@meta.data[i,meta] <- meta_collapse(object@meta.data[matches,meta], meta)
}
}
psobject@meta.data <- psobject@meta.data[,!colnames(psobject@meta.data)%in% meta_ignored]
} else {
stop('Alternative Pseudobulking methods not yet implemented.')
}

# Output in requested style
msg_if("Pseudobulk function COMPLETE.")
if (output.style == 'Seurat') {
psobject
} else {
exp <- if (packageVersion("Seurat")>='5.0') {
Seurat::GetAssayData(psobject, layer = 'counts')
} else {
Seurat::GetAssayData(psobject, slot = 'counts')
}
list(counts = exp, metadata = psobject@meta.data)
}
}
183 changes: 183 additions & 0 deletions single_cell/pseudobulk_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
from ..python_utils.ts_log import ts_log
from scanpy.get import aggregate
import anndata as ad
from pandas.api.types import is_numeric_dtype
from warnings import warn

def dsco_pseudobulk(
object, sample_by, cell_by,
layer = None, use_raw = True,
modality = "rna",
min_cells = 10,
cell_targets = None,
# features = None,
meta_targets = None,
meta_num_summary_method = 'median',
output_style = 'adata', # 'adata' or 'raw'
output_metadata_cell_count = 'cells_in_pseudobulk',
verbose = True):
"""
The primary role of this function is standardizing the pseudobulking process.
Note that it is under construction still and subject to change!.

Pseudobulking is performed per the groupings given by cell_by and sample_by metadata using the scanpy.get.aggregate
method to sum counts across cells. Beforehand, for MuData 'object's, the targeted modality will be extracted.

Raw counts data is normally the desired data for pseudobulking. If 'layer' is given, the matrix used for pseudobulking
will be the 'object'.layers['layer'].
If no 'layer' is given and (default) 'use_raw' = True, the 'object'.raw.X slot element is used for pseudobulking.
Otherwise, 'object'.X is used in pseudobulking.

The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default, but
this column name is adjustable with the 'output_metadata_cell_count' argument.

Pseudobulks representing fewer than 'min.cells' cells are removed.

'meta_targets', or all metadata of the target 'object' (or the targeted 'modality' for a muon 'object') are then
extracted for each pseudobulk.
For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks.
For numeric metadata, values from each cell in the pseudobulks are summarized by the 'meta_num_summary_method' method,
("median" by default).

Finally, the output structure is determined by the output_style argument, either as an AnnData ('adata') or as a dict
('raw') with keys ['counts', 'feature_names', 'metadata']

Prior to pseudobulking, particular cell identities or genes can be targeted using the 'cell_targets' and 'features'
inputs, respectively.

Arguments:
object An AnnData or MuData. For MuData objects, only elements inside the internal AnnData of
the target 'modality' will remain accessible to the function.
samply_by A string or list of strings naming columns of 'object'.obs to use for assigning sample
identity of cells. A biospecimen, timepoint, or subject name is the common target here,
and a batch identifier may be desired as well.
cell_by String naming the metadata column of 'object'.obs to use for assigning annotation or
cluster identities of cells.
layer Optionally, a string naming as 'object'.layers key holding the count data.
use_raw Boolean, True by default, denoting whether to use the object's .raw.X matrix instead of
its .X matrix unless a 'layer' is given.
modality String, "rna" by default, giving the modality name to use when 'object' is a MuData.
min_cells Number, 10 by default, which sets the minimum number of cells that a pseudobulk should
represent in order to be retained.
cell_targets Optionally, a string list naming particular 'cell_by' data values to target. Only these
cell identities will be retained and pseudobulked.
meta_targets Optionally, a string list naming particular columns of 'object'.obs to target for
retention. By default, as much a possible will be retained.
meta_num_summary_method String like "median" (default) or "mean" naming how to summarize numeric metadata
describing how to summarize numeric metadata from all cells of each pseudobulk.
output_style 'adata' (default) or 'raw'. Determines how data should be returned. Either as an AnnData
object, or as a 'raw' dict of the 'counts', 'feature_names', and 'metadata'.
output_metadata_cell_count String ('cells_in_pseudobulk' by default) denoting the metadata column name added to
hold the number of original cells going into each pseudobulk.
verbose Logical which controls whether timestamped log messages should be shown

Example call:
dsco_pseudobulk(
object = adata,
sample_by = ['biospecimen_id', 'orig.ident'],
cell_by = 'leiden_res1',
layer = 'counts'
)

"""
if verbose:
msg_if = ts_log
else:
def msg_if(*args, **kwargs):
return

# To anndata with target data in 'layer' or .X
if not isinstance(object, ad.AnnData):
if modality in object.mod_names:
object = object[modality]
else:
raise Exception("'object' is not an AnnData or not a MuData with 'modality' in its mod_names.")
if layer is None and use_raw:
object.X = object.raw.X

# Ensure all of cell_by and sample_by exist as metadata
orig_metas = object.obs.columns
group_metas = list(sample_by) + [cell_by]
if not all([i in orig_metas for i in group_metas]):
raise Exception("A 'cell_by' or 'sample_by' element does not exist as a column of 'object.obs'.")

# Trim to 'cell_targets'
if not cell_targets is None:
object = object[object.obs[cell_by] in cell_targets].copy()
msg_if(
"Trimming to 'cell_targets' retained ", object.n_obs,
" cells of identities:\n", object.obs[cell_by].unique.join(", ")
)

### Pseudobulk
# (Trim to features internally)
msg_if("Initiating pseudobulking with scanpy.get.aggregate...")
psobject = aggregate(
adata = object,
by = group_metas,
func = 'sum',
layer = layer)
msg_if("completed, moving pseudobulk sum counts matrix to .X")
psobject.X = psobject.layers['sum'].copy()
del psobject.layers['sum']

### Add cell counts and Trim too small pseudobulks
msg_if("Adding cell counts as metadata '", output_metadata_cell_count, "'.")
counts = object.obs.value_counts(subset=group_metas)
pseudo_vals = counts.index.copy()
counts.index = ['_'.join(map(str, x)) for x in counts.index]
psobject.obs[output_metadata_cell_count] = counts
psobject.obs[output_metadata_cell_count] = psobject.obs[output_metadata_cell_count].fillna(0).astype('int')
zero_cells = sum(psobject.obs[output_metadata_cell_count] == 0)
if zero_cells > 0:
msg_if(f"\tTrimming fake pseudobulks matching zero original cells.")
psobject = psobject[psobject.obs[output_metadata_cell_count] > 0].copy()
if min_cells > 0:
too_small = sum(psobject.obs[output_metadata_cell_count] < min_cells)
if too_small == psobject.obs.shape[0]:
warn(f"Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than {min_cells} cells.")
elif too_small > 0:
msg_if(f"\tTrimming {too_small} pseudobulks built from fewer than {min_cells} cells.")
psobject = psobject[psobject.obs[output_metadata_cell_count] >= min_cells].copy()

### Ensure meta_targets, or as many metadata as possible, are retained.
meta_ignored = []
meta_kept = psobject.obs.columns
if meta_targets is None:
meta_targets = orig_metas
meta_targets = [i for i in meta_targets if not i in meta_kept]
if len(meta_targets) > 0:
msg_if(f"Pulling in additional metadata targets: '{', '.join(meta_targets)}'.")
numeric = [i for i in meta_targets if is_numeric_dtype(object.obs[i])]
discrete = [i for i in meta_targets if not is_numeric_dtype(object.obs[i])]
if len(discrete)>0:
for col in discrete:
if len(object.obs.value_counts(subset=group_metas+[col]).index) > len(pseudo_vals):
meta_ignored.append(col)
meta_targets.remove(col)
if len(meta_ignored)>0:
msg_if(f"\tignoring discrete metadata column(s) with inconsistency within some pseudobulks: '{', '.join(meta_ignored)}'.")
if len(meta_targets) > 0:
dtypes = object.obs.dtypes.astype(str).to_dict()
if len(meta_targets) > 0:
for col in meta_targets:
method = 'first' if not col in numeric else meta_num_summary_method
new_dat = object.obs.groupby(group_metas, observed = True).agg(x=(col, method)).loc[pseudo_vals,:]
new_dat.index = counts.index
psobject.obs[col] = new_dat
try:
psobject.obs[col] = psobject.obs[col].astype(dtypes[col])
except (KeyError, ValueError):
pass
else:
msg_if('\tZero non-ignored additional metadata targets.')

# Output in requested style
msg_if("Pseudobulk function COMPLETE.")
if (output_style == 'adata'):
return psobject
return {
'counts': psobject.X,
'feature_names': psobject.var_names,
'metadata': psobject.obs
}
Loading