forked from openproblems-bio/openproblems
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalize.py
More file actions
86 lines (66 loc) · 2.08 KB
/
normalize.py
File metadata and controls
86 lines (66 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from . import decorators
import anndata as ad
import logging
import scanpy as sc
import scprep
log = logging.getLogger("openproblems")
_scran = scprep.run.RFunction(
setup="""
library('scran')
library('BiocParallel')
""",
args="sce, min.mean=0.1",
body="""
sce <- computeSumFactors(
sce, min.mean=min.mean,
assay.type="X",
BPPARAM=SerialParam()
)
sizeFactors(sce)
""",
)
@decorators.normalizer
def log_scran_pooling(adata: ad.AnnData) -> ad.AnnData:
"""Normalize data with scran via rpy2."""
scprep.run.install_bioconductor("scran")
adata.obs["size_factors"] = _scran(adata)
adata.X = scprep.utils.matrix_vector_elementwise_multiply(
adata.X, adata.obs["size_factors"], axis=0
)
sc.pp.log1p(adata)
return adata
def _cpm(adata: ad.AnnData):
adata.layers["counts"] = adata.X.copy()
sc.pp.normalize_total(adata, target_sum=1e6, key_added="size_factors")
@decorators.normalizer
def cpm(adata: ad.AnnData) -> ad.AnnData:
"""Normalize data to counts per million."""
_cpm(adata)
return adata
@decorators.normalizer
def log_cpm(adata: ad.AnnData) -> ad.AnnData:
"""Normalize data to log counts per million."""
_cpm(adata)
sc.pp.log1p(adata)
return adata
@decorators.normalizer
def sqrt_cpm(adata: ad.AnnData) -> ad.AnnData:
"""Normalize data to sqrt counts per million."""
_cpm(adata)
adata.X = scprep.transform.sqrt(adata.X)
return adata
@decorators.normalizer
def log_cpm_hvg(adata: ad.AnnData, n_genes: int = 1000) -> ad.AnnData:
"""Normalize logCPM HVG
Normalize data to log counts per million and select n_genes highly
variable genes
"""
adata = log_cpm(adata)
if adata.n_vars < n_genes:
log.warning(
f"Less than {n_genes} genes, setting 'n_genes' to {int(adata.n_vars * 0.5)}"
)
n_genes = int(adata.n_vars * 0.5)
sc.pp.highly_variable_genes(adata, n_top_genes=n_genes, flavor="cell_ranger")
adata = adata[:, adata.var["highly_variable"]].copy()
return adata