UCSF-DSCOLAB · dtm2451 · Dec 29, 2025 · Dec 29, 2025 · Dec 30, 2025 · Dec 30, 2025
diff --git a/single_cell/dge/STUB_put_dge_functions_here.txt b/single_cell/dge/STUB_put_dge_functions_here.txt
diff --git a/single_cell/pseudobulk_function.R b/single_cell/pseudobulk_function.R
@@ -0,0 +1,188 @@
+### This script provides a function for pseudobulking single-cell RNAseq data, with a focus towards DGE functions also being planned
+#
+#' Standardized Pseudobulk Implementations
+#' @param object A Seurat object
+#' @param sample.by String or string vector naming metadata columns of \code{object} to use for assigning sample identities of cells.
+#' A biospecimen, timepoint, or subject name is the common target here, and a batch identifier may be desired as well.
+#' @param cell.by String naming the metadata column of \code{object} to use for assigning annotation or cluster identities of cells.
+#' @param min.cells Number, 10 by default, which sets the minimum number of cells that a pseudobulk should represent in order to be retained.
+#' @param cell.targets Optionally, a string vector naming particular \code{cell.by} values to target. Only these cell identities will be retained and pseudobulked.
+#' @param features Optionally, a string vector naming a subset of genes to target. Only counts of these genes will be retained in pseudobulking.
+#' @param assay String naming the assay of \code{object} to target. Default is "RNA".
+#' @param meta.targets Optionally, a string vector naming particular metadata columns of \code{object} to target for retention. By default, as much a possible will be retained.
+#' @param meta.num.summary.method Function like \code{\link[base]{mean}} or \code{\link[base]{median}} (with an \code{na.rm = TRUE} option) describing how to summarize numeric metadata from all cells of each pseudobulk
+#' @param method "Seurat", "scuttle", or "dreamlet". Determines which pseudobulking tool to rely on. \itemize{
+#' \item "Seurat" = \code{\link[Seurat]{AggregateExpression}}
+#' \item Others planned but not yet implemented
+#' }
+#' @param output.style "Seurat" or "raw". Determines how data should be returned. Either as a Seurat object, or as a raw list of the 'counts' (matrix) and 'metadata' (data.frame)
+#' @param output.metadata.cell.count String denoting the metadata column name added to hold the number of original cells going into each pseudobulk
+#' @param verbose Logical which controls whether timestamped log messages should be shown
+#' @return A Seurat object or, if \code{output.style = 'raw'} a named list with elements 'counts' and 'metadata'.
+#' @details
+#' The primary role of this function is standardizing the pseudobulking process.
+#'
+#' \emph{It is under construction and subject to change!}
+#'
+#' Currently, a Seurat object is expected and pseudobulking is performed per the groupings given by \code{cell.by} and any number of \code{sample.by} metadata.
+#' Pseudobulk counts reflect the sum of counts across all cells that they represent.
+#'
+#' For the \code{method = "Seurat"} (the only method currently implemented) this is done using \code{Seurat::\link[Seurat]{AggregateExpression}} function with parameters:
+#' \code{return.seurat = TRUE, group.by = c(sample.by, cell.by), features = features, assays = assay}.
+#'
+#' The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default.
+#' Use \code{output.metadata.cell.count} to use a different column name.
+#' \emph{Pseudobulks representing fewer than \code{min.cells} cells are removed.}
+#'
+#' \code{meta.targets}, or all metadata or \code{object} are then extracted for each pseudobulk.
+#' For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks.
+#' For numeric metadata, values from each cell in the pseudobulks are summarized by the \code{meta.num.summary.method} function (\code{\link[base]{median}} by default).
+#'
+#' Finally, the output structure is determined by the \code{output.style} input.
+#'
+#' Prior to pseudobulking, particular cell identities or genes can be targeted using the \code{cell.targets} and \code{features} inputs, respectively.
+#'
+#' @author Daniel Bunis
+#' @examples
+#' library(Seurat)
+#' dsco_pseudobulk(
+#'     object = pbmc_small,
+#'     sample.by = "groups",
+#'     cell.by = "RNA_snn_res.0.8"
+#' )
+#'
+dsco_pseudobulk <- function(
+    object, sample.by, cell.by,
+    min.cells = 10,
+    cell.targets = NULL, features = NULL,
+    meta.targets = NULL,
+    assay = "RNA",
+    method = c('Seurat', 'scuttle', 'dreamlet'),
+    meta.num.summary.method = median,
+    output.style = c('Seurat', 'raw'),
+    output.metadata.cell.count = 'cells_in_pseudobulk',
+    verbose = TRUE
+) {
+    method <- match.arg(method)
+    output.style <- match.arg(output.style)
+
+    orig_metas <- colnames(object@meta.data)
+
+    # Ensure ts_log exists, or source it.
+    if (verbose) {
+        if (!exists('ts_log')) {
+            stop("'ts_log' function required when 'verbose = TRUE'. Set 'verbose = FALSE' or run 'source('<path_to_this_essential_scripts_repo>/R_utils/ts_log.R', chdir = TRUE, local = TRUE)' and then try again.")
+        }
+        msg_if <- ts_log
+    } else {
+        msg_if <- function(...) {}
+    }
+
+    # Ensure all of cell.by and sample.by exist as metadata
+    if (!all(c(cell.by, sample.by) %in% orig_metas)) {
+        stop("A 'cell.by' or 'sample.by' element does not exist as a column the 'object' metadata.")
+    }
+    # Ensure assay exists
+    if (!assay %in% Seurat::Assays(object)) {
+        stop("'assay' is not a valid assay of the 'object'.")
+    }
+
+    # Trim to 'cell.targets'
+    if (!is.null(cell.targets)) {
+        object <- object[,object@meta.data[,cell.by] %in% cell.targets]
+        retained_cell_num <- ncol(object)
+        retained_cell_idents <- unique(object@meta.data[,cell.by])
+        msg_if(
+            "Trimming to 'cell.targets' retained ", retained_cell_num,
+            " cells of identities:\n", paste0(retained_cell_idents, collapse=", ")
+        )
+    }
+
+    ### Pseudobulk
+    # (Trim to features internally)
+    if (method == 'Seurat') {
+        msg_if("Initiating pseudobulking with Seurat's AggregateExpression...")
+        group.metas <- c(sample.by, cell.by)
+        psobject <- Seurat::AggregateExpression(
+            object = object, return.seurat = TRUE, group.by = group.metas,
+            features = features,
+            assays = assay)
+
+        ### Add cell counts and Trim too small pseudobulks
+        msg_if("Adding cell counts as metadata '", output.metadata.cell.count, "'.")
+        psobject@meta.data[,output.metadata.cell.count] <- vapply(
+            seq_len(ncol(psobject)),
+            function(i) {
+                # Using unlist for ignorance of rownames (cell vs pseudobulk won't match!)
+                targ <- unlist(psobject@meta.data[i,group.metas])
+                sum(
+                    apply(object@meta.data[,group.metas], 1, function(x) {
+                        identical(unlist(x), targ)
+                    })
+                )
+            },
+            numeric(1)
+        )
+        too_small <- psobject@meta.data[,output.metadata.cell.count] < min.cells
+        if (too_small == ncol(psobject)) {
+            warning(paste0("Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than ", min_cells, " cells."))
+        } else if (too_small > 0) {
+            msg_if("\tTrimming ", too_small, " pseudobulks built from fewer than ", min_cells, " cells.")
+            psobject <- psobject[,psobject@meta.data[,output.metadata.cell.count] >= min.cells]
+        }
+
+        ### Ensure meta.targets, or as many metadata as possible, are retained.
+        meta_ignored <- c()
+        meta_kept <- colnames(psobject@meta.data)
+        if (is.null(meta.targets)) {
+            meta.targets <- orig_metas
+        }
+        meta.targets <- meta.targets[!meta.targets %in% meta_kept]
+        msg_if("Grabbing additional metadata '", paste0(meta.targets, collapse = "', '"), "'.")
+        meta_collapse <- function(df_col, name) {
+            if (is.numeric(df_col)) {
+                meta.num.summary.method(df_col, na.rm = TRUE)
+            } else {
+                if (name %in% meta_ignored) {
+                    NA
+                } else {
+                    if (length(unique(df_col))>1) {
+                        msg_if("\tignoring discrete metadata column '", name, "' because it is not consistent within all pseudobulks")
+                        assign('meta_ignored', c(meta_ignored, name), inherits = TRUE)
+                        NA
+                    } else {
+                        df_col[1]
+                    }
+                }
+            }
+        }
+        for (i in seq_len(ncol(psobject))) {
+            targ <- unlist(psobject@meta.data[i,group.metas])
+            matches <- apply(object@meta.data[,group.metas], 1, function(x) {
+                identical(unlist(x), targ)
+            })
+            for (meta in meta.targets) {
+                if (i == 1) {
+                    psobject@meta.data[,meta] <- NA
+                }
+                psobject@meta.data[i,meta] <- meta_collapse(object@meta.data[matches,meta], meta)
+            }
+        }
+        psobject@meta.data <- psobject@meta.data[,!colnames(psobject@meta.data)%in% meta_ignored]
+    } else {
+        stop('Alternative Pseudobulking methods not yet implemented.')
+    }
+
+    # Output in requested style
+    msg_if("Pseudobulk function COMPLETE.")
+    if (output.style == 'Seurat') {
+        psobject
+    } else {
+        exp <- if (packageVersion("Seurat")>='5.0') {
+            Seurat::GetAssayData(psobject, layer = 'counts')
+        } else {
+            Seurat::GetAssayData(psobject, slot = 'counts')
+        }
+        list(counts = exp, metadata = psobject@meta.data)
+    }
+}
diff --git a/single_cell/pseudobulk_function.py b/single_cell/pseudobulk_function.py
@@ -0,0 +1,183 @@
+from ..python_utils.ts_log import ts_log
+from scanpy.get import aggregate
+import anndata as ad
+from pandas.api.types import is_numeric_dtype
+from warnings import warn
+
+def dsco_pseudobulk(
+    object, sample_by, cell_by,
+    layer = None, use_raw = True,
+    modality = "rna",
+    min_cells = 10,
+    cell_targets = None,
+    # features = None,
+    meta_targets = None,
+    meta_num_summary_method = 'median',
+    output_style = 'adata', # 'adata' or 'raw'
+    output_metadata_cell_count = 'cells_in_pseudobulk',
+    verbose = True):
+    """
+The primary role of this function is standardizing the pseudobulking process.
+Note that it is under construction still and subject to change!.
+
+Pseudobulking is performed per the groupings given by cell_by and sample_by metadata using the scanpy.get.aggregate
+method to sum counts across cells. Beforehand, for MuData 'object's, the targeted modality will be extracted.
+
+Raw counts data is normally the desired data for pseudobulking. If 'layer' is given, the matrix used for pseudobulking
+will be the 'object'.layers['layer'].
+If no 'layer' is given and (default) 'use_raw' = True, the 'object'.raw.X slot element is used for pseudobulking.
+Otherwise, 'object'.X is used in pseudobulking.
+
+The number of cells that each pseudbulk represents are added to a metadata named 'cells_in_pseudobulk' by default, but
+this column name is adjustable with the 'output_metadata_cell_count' argument.
+
+Pseudobulks representing fewer than 'min.cells' cells are removed.
+
+'meta_targets', or all metadata of the target 'object' (or the targeted 'modality' for a muon 'object') are then
+extracted for each pseudobulk.
+For discrete metadata, the column will be ignored if data are not consistent within ALL pseudobulks.
+For numeric metadata, values from each cell in the pseudobulks are summarized by the 'meta_num_summary_method' method,
+("median" by default).
+
+Finally, the output structure is determined by the output_style argument, either as an AnnData ('adata') or as a dict
+('raw') with keys ['counts', 'feature_names', 'metadata']
+
+Prior to pseudobulking, particular cell identities or genes can be targeted using the 'cell_targets' and 'features'
+inputs, respectively.
+
+Arguments:
+    object                      An AnnData or MuData. For MuData objects, only elements inside the internal AnnData of
+                                the target 'modality' will remain accessible to the function.
+    samply_by                   A string or list of strings naming columns of 'object'.obs to use for assigning sample
+                                identity of cells. A biospecimen, timepoint, or subject name is the common target here,
+                                and a batch identifier may be desired as well.
+    cell_by                     String naming the metadata column of 'object'.obs to use for assigning annotation or
+                                cluster identities of cells.
+    layer                       Optionally, a string naming as 'object'.layers key holding the count data.
+    use_raw                     Boolean, True by default, denoting whether to use the object's .raw.X matrix instead of
+                                its .X matrix unless a 'layer' is given.
+    modality                    String, "rna" by default, giving the modality name to use when 'object' is a MuData.
+    min_cells                   Number, 10 by default, which sets the minimum number of cells that a pseudobulk should
+                                represent in order to be retained.
+    cell_targets                Optionally, a string list naming particular 'cell_by' data values to target. Only these
+                                cell identities will be retained and pseudobulked.
+    meta_targets                Optionally, a string list naming particular columns of 'object'.obs to target for
+                                retention. By default, as much a possible will be retained.
+    meta_num_summary_method     String like "median" (default) or "mean" naming how to summarize numeric metadata
+                                describing how to summarize numeric metadata from all cells of each pseudobulk.
+    output_style                'adata' (default) or 'raw'. Determines how data should be returned. Either as an AnnData
+                                object, or as a 'raw' dict of the 'counts', 'feature_names', and 'metadata'.
+    output_metadata_cell_count  String ('cells_in_pseudobulk' by default) denoting the metadata column name added to
+                                hold the number of original cells going into each pseudobulk.
+    verbose                     Logical which controls whether timestamped log messages should be shown
+
+Example call:
+    dsco_pseudobulk(
+        object = adata,
+        sample_by = ['biospecimen_id', 'orig.ident'],
+        cell_by = 'leiden_res1',
+        layer = 'counts'
+    )
+
+    """
+    if verbose:
+        msg_if = ts_log
+    else:
+        def msg_if(*args, **kwargs):
+            return
+
+    # To anndata with target data in 'layer' or .X
+    if not isinstance(object, ad.AnnData):
+        if modality in object.mod_names:
+            object = object[modality]
+        else:
+            raise Exception("'object' is not an AnnData or not a MuData with 'modality' in its mod_names.")
+    if layer is None and use_raw:
+        object.X = object.raw.X
+
+    # Ensure all of cell_by and sample_by exist as metadata
+    orig_metas = object.obs.columns
+    group_metas = list(sample_by) + [cell_by]
+    if not all([i in orig_metas for i in group_metas]):
+        raise Exception("A 'cell_by' or 'sample_by' element does not exist as a column of 'object.obs'.")
+
+    # Trim to 'cell_targets'
+    if not cell_targets is None:
+        object = object[object.obs[cell_by] in cell_targets].copy()
+        msg_if(
+            "Trimming to 'cell_targets' retained ", object.n_obs,
+            " cells of identities:\n", object.obs[cell_by].unique.join(", ")
+        )
+
+    ### Pseudobulk
+    # (Trim to features internally)
+    msg_if("Initiating pseudobulking with scanpy.get.aggregate...")
+    psobject = aggregate(
+        adata = object,
+        by = group_metas,
+        func = 'sum',
+        layer = layer)
+    msg_if("completed, moving pseudobulk sum counts matrix to .X")
+    psobject.X = psobject.layers['sum'].copy()
+    del psobject.layers['sum']
+
+    ### Add cell counts and Trim too small pseudobulks
+    msg_if("Adding cell counts as metadata '", output_metadata_cell_count, "'.")
+    counts = object.obs.value_counts(subset=group_metas)
+    pseudo_vals = counts.index.copy()
+    counts.index = ['_'.join(map(str, x)) for x in counts.index]
+    psobject.obs[output_metadata_cell_count] = counts
+    psobject.obs[output_metadata_cell_count] = psobject.obs[output_metadata_cell_count].fillna(0).astype('int')
+    zero_cells = sum(psobject.obs[output_metadata_cell_count] == 0)
+    if zero_cells > 0:
+        msg_if(f"\tTrimming fake pseudobulks matching zero original cells.")
+        psobject = psobject[psobject.obs[output_metadata_cell_count] > 0].copy()
+    if min_cells > 0:
+        too_small = sum(psobject.obs[output_metadata_cell_count] < min_cells)
+        if too_small == psobject.obs.shape[0]:
+            warn(f"Skipping triming pseudobulks smaller than 'min_cells' as NONE were built from more than {min_cells} cells.")
+        elif too_small > 0:
+            msg_if(f"\tTrimming {too_small} pseudobulks built from fewer than {min_cells} cells.")
+            psobject = psobject[psobject.obs[output_metadata_cell_count] >= min_cells].copy()
+
+    ### Ensure meta_targets, or as many metadata as possible, are retained.
+    meta_ignored = []
+    meta_kept = psobject.obs.columns
+    if meta_targets is None:
+        meta_targets = orig_metas
+    meta_targets = [i for i in meta_targets if not i in meta_kept]
+    if len(meta_targets) > 0:
+        msg_if(f"Pulling in additional metadata targets: '{', '.join(meta_targets)}'.")
+        numeric = [i for i in meta_targets if is_numeric_dtype(object.obs[i])]
+        discrete = [i for i in meta_targets if not is_numeric_dtype(object.obs[i])]
+        if len(discrete)>0:
+            for col in discrete:
+                if len(object.obs.value_counts(subset=group_metas+[col]).index) > len(pseudo_vals):
+                    meta_ignored.append(col)
+                    meta_targets.remove(col)
+        if len(meta_ignored)>0:
+            msg_if(f"\tignoring discrete metadata column(s) with inconsistency within some pseudobulks: '{', '.join(meta_ignored)}'.")
+        if len(meta_targets) > 0:
+            dtypes = object.obs.dtypes.astype(str).to_dict()
+            if len(meta_targets) > 0:
+                for col in meta_targets:
+                    method = 'first' if not col in numeric else meta_num_summary_method
+                    new_dat = object.obs.groupby(group_metas, observed = True).agg(x=(col, method)).loc[pseudo_vals,:]
+                    new_dat.index = counts.index
+                    psobject.obs[col] = new_dat
+                    try:
+                        psobject.obs[col] = psobject.obs[col].astype(dtypes[col])
+                    except (KeyError, ValueError):
+                        pass
+        else:
+            msg_if('\tZero non-ignored additional metadata targets.')
+
+    # Output in requested style
+    msg_if("Pseudobulk function COMPLETE.")
+    if (output_style == 'adata'):
+        return psobject
+    return {
+        'counts': psobject.X,
+        'feature_names': psobject.var_names,
+        'metadata': psobject.obs
+    }