2 changes: 0 additions & 2 deletions .github/workflows/pr_validation.yml
@@ -17,8 +17,6 @@ jobs:
strategy:
matrix:
include:
- os: ubuntu-latest
python: "3.8"
- os: ubuntu-latest
python: "3.9"
- os: ubuntu-latest
1 change: 0 additions & 1 deletion README.md
@@ -71,7 +71,6 @@ For reference, the code is being tested in a github workflow (CI) with python
- pandas 1.4.3
- scikit-learn 1.1.1
- scipy 1.8.1
- statsmodels 0.13.2
```

Please don't hesitate to open an issue in case you encounter any issue due to possible deprecations.
5 changes: 0 additions & 5 deletions docs/source/conf.py
@@ -7,8 +7,6 @@

# -- Path setup --------------------------------------------------------------

import warnings

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -17,12 +15,9 @@
from pathlib import Path

import git
from statsmodels.tools.sm_exceptions import DomainWarning

import pydeseq2

# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
warnings.simplefilter("ignore", DomainWarning)

# -- Project information -----------------------------------------------------

1 change: 0 additions & 1 deletion docs/source/usage/requirements.rst
@@ -8,6 +8,5 @@ For reference, the code was tested with python 3.8 and the following package ver
- pandas==1.4.3
- scikit-learn==1.1.1
- scipy==1.8.1
- statsmodels==0.13.2

Please don't hesitate to open an issue in case you encounter any problems due to possible deprecations.
77 changes: 59 additions & 18 deletions pydeseq2/dds.py
@@ -343,7 +343,7 @@ def vst_fit(
if use_design:
# Check that the dispersion trend curve was fitted. If not, fit it.
# This will call previous functions in a cascade.
if "trend_coeffs" not in self.uns:
if "disp_function" not in self.uns:
self.fit_dispersion_trend()
else:
# Reduce the design matrix to an intercept and reconstruct at the end
@@ -477,7 +477,7 @@ def fit_size_factors(
warnings.warn(
"Every gene contains at least one zero, "
"cannot compute log geometric means. Switching to iterative mode.",
RuntimeWarning,
UserWarning,
Collaborator:
Out of curiosity, what is the rationale of selecting UserWarning?

Collaborator (Author):
The bad reason: it was easier to catch it simultaneously with a switch to "mean" trend curve fitting in the tests. The better reason is that I felt it was more suited (RuntimeWarning is supposed to cover dubious runtime behavior, which is not really what's going on here).

stacklevel=2,
)
self._fit_iterate_size_factors()
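
For context on the discussion above, here is a minimal standalone sketch (not part of the PR) of how the new `UserWarning` could be silenced or asserted; the toy counts, the `condition` column, and the `design_factors` argument are assumptions for illustration, and the data is only meant to trigger the "every gene contains a zero" code path, not to produce a meaningful fit:

```python
import warnings

import pandas as pd

from pydeseq2.dds import DeseqDataSet

# Every gene contains at least one zero, so fit_size_factors switches to the
# iterative mode and emits the UserWarning introduced in this diff.
counts = pd.DataFrame(
    [[0, 5, 3], [4, 0, 7], [2, 6, 0], [0, 1, 2]],
    columns=["gene1", "gene2", "gene3"],
)
metadata = pd.DataFrame({"condition": ["A", "A", "B", "B"]})

dds = DeseqDataSet(counts=counts, metadata=metadata, design_factors="condition")

with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)  # silence the fallback warning
    dds.fit_size_factors()
```

In a test suite, `pytest.warns(UserWarning, match="Switching to iterative mode")` would assert the same behavior.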
@@ -601,12 +601,35 @@ def fit_dispersion_trend(self) -> None:
# Initialize coefficients
old_coeffs = pd.Series([0.1, 0.1])
coeffs = pd.Series([1.0, 1.0])

while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6:
while (coeffs > 0).all() and (
np.log(np.abs(coeffs / old_coeffs)) ** 2
).sum() >= 1e-6:
old_coeffs = coeffs
coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
coeffs, predictions, converged = self.inference.dispersion_trend_gamma_glm(
covariates, targets
)

if not converged:
warnings.warn(
"The dispersion trend curve fitting did not converge. "
"Switching to a mean-based dispersion trend.",
UserWarning,
stacklevel=2,
)
self.uns["mean_disp"] = trim_mean(
self.varm["genewise_dispersions"][
self.varm["genewise_dispersions"] > 10 * self.min_disp
],
proportiontocut=0.001,
)

self.uns["disp_function_type"] = "mean"
self.varm["fitted_dispersions"] = np.full(
self.n_vars, self.uns["mean_disp"]
)

break

# Filter out genes that are too far away from the curve before refitting
pred_ratios = (
self[:, covariates.index].varm["genewise_dispersions"] / predictions
@@ -620,19 +643,25 @@
covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
inplace=True,
)
self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])

self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
self.uns["disp_function_type"] = "parametric"
self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function(
self.varm["_normed_means"][self.varm["non_zero"]]
)

end = time.time()

if not self.quiet:
print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)

self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])

self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
self.varm["fitted_dispersions"][self.varm["non_zero"]] = dispersion_trend(
self.varm["_normed_means"][self.varm["non_zero"]],
self.uns["trend_coeffs"],
)
def disp_function(self, x):
"""Return the dispersion trend function at x."""
if self.uns["disp_function_type"] == "parametric":
return dispersion_trend(x, self.uns["trend_coeffs"])
elif self.uns["disp_function_type"] == "mean":
return self.uns["mean_disp"]

def fit_dispersion_prior(self) -> None:
"""Fit dispersion variance priors and standard deviation of log-residuals.
@@ -1001,11 +1030,14 @@ def _refit_without_outliers(

# Compute trend dispersions.
# Note: the trend curve is not refitted.
sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
sub_dds.uns["disp_function_type"] = self.uns["disp_function_type"]
if sub_dds.uns["disp_function_type"] == "parametric":
sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
elif sub_dds.uns["disp_function_type"] == "mean":
sub_dds.uns["mean_disp"] = self.uns["mean_disp"]
sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0)
sub_dds.varm["fitted_dispersions"] = dispersion_trend(
sub_dds.varm["_normed_means"],
sub_dds.uns["trend_coeffs"],
sub_dds.varm["fitted_dispersions"] = sub_dds.disp_function(
sub_dds.varm["_normed_means"]
)

# Estimate MAP dispersions.
@@ -1089,9 +1121,18 @@ def objective(p):
& self.varm["non_zero"]
]

mean_disp = trimmed_mean(
self[:, use_for_mean_genes].varm["genewise_dispersions"], trim=0.001
if len(use_for_mean_genes) == 0:
print(
"No genes have a dispersion above 10 * min_disp in "
"_fit_iterate_size_factors."
)
break

mean_disp = trim_mean(
self[:, use_for_mean_genes].varm["genewise_dispersions"],
proportiontocut=0.001,
)

self.varm["fitted_dispersions"] = np.ones(self.n_vars) * mean_disp
self.fit_dispersion_prior()
self.fit_MAP_dispersions()
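As a side note, a self-contained sketch (not from the PR; the dispersion values and the 1e-8 lower bound are assumptions for illustration) of the mean-based fallback added to `fit_dispersion_trend` above: when the parametric fit fails to converge, the trend collapses to a single trimmed mean of the gene-wise dispersions lying above `10 * min_disp`, which every gene then receives as its fitted dispersion.

```python
import numpy as np
from scipy.stats import trim_mean

min_disp = 1e-8  # assumed lower bound on dispersions, mirroring DESeq2's default
rng = np.random.default_rng(0)
genewise_dispersions = rng.gamma(shape=2.0, scale=0.05, size=1000)

# Keep only dispersions well above the lower bound, then take a trimmed mean,
# mirroring the fallback branch added above.
kept = genewise_dispersions[genewise_dispersions > 10 * min_disp]
mean_disp = trim_mean(kept, proportiontocut=0.001)

# Every gene gets the same fitted dispersion under the "mean" trend type.
fitted_dispersions = np.full(genewise_dispersions.size, mean_disp)
print(mean_disp, fitted_dispersions[:3])
```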
41 changes: 25 additions & 16 deletions pydeseq2/default_inference.py
@@ -1,22 +1,17 @@
import warnings
from typing import Literal
from typing import Optional
from typing import Tuple

import numpy as np
import pandas as pd
import statsmodels.api as sm # type: ignore
from joblib import Parallel # type: ignore
from joblib import delayed
from joblib import parallel_backend
from statsmodels.tools.sm_exceptions import DomainWarning # type: ignore
from scipy.optimize import minimize # type: ignore

from pydeseq2 import inference
from pydeseq2 import utils

# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
warnings.simplefilter("ignore", DomainWarning)


class DefaultInference(inference.Inference):
"""Default DESeq2-related inference methods, using scipy/sklearn/numpy.
@@ -206,18 +201,32 @@ def wald_test( # noqa: D102

def dispersion_trend_gamma_glm( # noqa: D102
self, covariates: pd.Series, targets: pd.Series
) -> Tuple[np.ndarray, np.ndarray]:
covariates_w_intercept = sm.add_constant(covariates)
targets_fit = targets.values
) -> Tuple[np.ndarray, np.ndarray, bool]:
covariates_w_intercept = covariates.to_frame()
covariates_w_intercept.insert(0, "intercept", 1)
covariates_fit = covariates_w_intercept.values
glm_gamma = sm.GLM(
targets_fit,
covariates_fit,
family=sm.families.Gamma(link=sm.families.links.identity()),
targets_fit = targets.values

def loss(coeffs):
mu = covariates_fit @ coeffs
return np.nanmean(targets_fit / mu + np.log(mu), axis=0)

def grad(coeffs):
mu = covariates_fit @ coeffs
return -np.nanmean(
((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0
)

res = minimize(
loss,
x0=np.array([1.0, 1.0]),
jac=grad,
method="L-BFGS-B",
bounds=[(0, np.inf)],
)
res = glm_gamma.fit()
coeffs = res.params
return (coeffs, covariates_fit @ coeffs)

coeffs = res.x
return coeffs, covariates_fit @ coeffs, res.success

def lfc_shrink_nbinom_glm( # noqa: D102
self,
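To make the swap away from `statsmodels` explicit: for a Gamma GLM with identity link and fixed shape, the per-observation negative log-likelihood is, up to constants, `y / mu + log(mu)`, so minimizing its mean over positive coefficients with L-BFGS-B reproduces the trend fit without `statsmodels`. Below is a standalone sketch on synthetic data (the trend coefficients, sample size, and noise level are arbitrary assumptions, not values from pydeseq2):

```python
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)

# Synthetic dispersion-trend data: targets scatter around a0 + a1 / mean,
# the parametric trend form used by (py)DESeq2.
normed_means = rng.uniform(1.0, 1000.0, size=500)
true_a0, true_a1 = 0.05, 3.0
mu_true = true_a0 + true_a1 / normed_means
targets = mu_true * rng.gamma(shape=5.0, scale=1.0 / 5.0, size=500)

# Design matrix with an intercept column, as in dispersion_trend_gamma_glm.
covariates_fit = np.column_stack([np.ones_like(normed_means), 1.0 / normed_means])


def loss(coeffs):
    # Mean Gamma negative log-likelihood (up to constants), identity link.
    mu = covariates_fit @ coeffs
    return np.nanmean(targets / mu + np.log(mu))


def grad(coeffs):
    mu = covariates_fit @ coeffs
    return -np.nanmean(
        ((targets / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0
    )


res = minimize(
    loss,
    x0=np.array([1.0, 1.0]),
    jac=grad,
    method="L-BFGS-B",
    bounds=2 * [(0, np.inf)],  # one (lower, upper) pair per coefficient
)
print(res.x, res.success)  # res.x should land close to (0.05, 3.0)
```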
21 changes: 9 additions & 12 deletions pydeseq2/ds.py
@@ -7,14 +7,14 @@
# import anndata as ad
import numpy as np
import pandas as pd
import statsmodels.api as sm # type: ignore
from scipy.optimize import root_scalar # type: ignore
from scipy.stats import f # type: ignore
from statsmodels.stats.multitest import multipletests # type: ignore
from scipy.stats import false_discovery_control # type: ignore

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.inference import Inference
from pydeseq2.utils import lowess
from pydeseq2.utils import make_MA_plot
from pydeseq2.utils import n_or_more_replicates

@@ -526,18 +526,15 @@ def _independent_filtering(self) -> None:
use = (self.base_mean >= cutoff) & (~self.p_values.isna())
U2 = self.p_values[use]
if not U2.empty:
result.loc[use, i] = multipletests(
U2, alpha=self.alpha, method="fdr_bh"
)[1]

num_rej = (result < self.alpha).sum(0)
lowess = sm.nonparametric.lowess(num_rej, theta, frac=1 / 5)
result.loc[use, i] = false_discovery_control(U2, method="bh")
num_rej = (result < self.alpha).sum(0).values
lowess_res = lowess(num_rej, theta, frac=1 / 5)

if num_rej.max() <= 10:
j = 0
else:
residual = num_rej[num_rej > 0] - lowess[num_rej > 0, 1]
thresh = lowess[:, 1].max() - np.sqrt(np.mean(residual**2))
residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0, 1]
thresh = lowess_res[:, 1].max() - np.sqrt(np.mean(residual**2))

if np.any(num_rej > thresh):
j = np.where(num_rej > thresh)[0][0]
@@ -557,8 +554,8 @@ def _p_value_adjustment(self) -> None:
self.run_wald_test()

self.padj = pd.Series(np.nan, index=self.dds.var_names)
self.padj.loc[~self.p_values.isna()] = multipletests(
self.p_values.dropna(), alpha=self.alpha, method="fdr_bh"
self.padj.loc[~self.p_values.isna()] = false_discovery_control(
self.p_values.dropna(), method="bh"
)[1]

def _cooks_filtering(self) -> None:
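For readers comparing the two APIs touched in `ds.py`: `scipy.stats.false_discovery_control` (SciPy >= 1.11) returns the Benjamini-Hochberg adjusted p-values directly, whereas the removed `statsmodels` call returned them as the second element of a tuple. A small sketch with made-up p-values:

```python
import numpy as np
from scipy.stats import false_discovery_control

p_values = np.array([0.001, 0.02, 0.04, 0.2, 0.5, 0.9])  # made-up p-values

# SciPy: the adjusted p-values are the return value itself.
padj = false_discovery_control(p_values, method="bh")

# The statsmodels call this replaces would have been:
#   multipletests(p_values, alpha=0.05, method="fdr_bh")[1]
print(padj)
```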
4 changes: 3 additions & 1 deletion pydeseq2/inference.py
@@ -285,7 +285,7 @@ def fit_moments_dispersions(
@abstractmethod
def dispersion_trend_gamma_glm(
self, covariates: pd.Series, targets: pd.Series
) -> Tuple[np.ndarray, np.ndarray]:
) -> Tuple[np.ndarray, np.ndarray, bool]:
"""Fit a gamma glm on gene dispersions.

The intercept should be concatenated in this method
Expand All @@ -304,6 +304,8 @@ def dispersion_trend_gamma_glm(
Coefficients of the regression.
predictions : ndarray
Predictions of the regression.
converged : bool
Whether the optimization converged.
"""

@abstractmethod
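Since the abstract signature above now includes a `converged` flag, any custom `Inference` implementation has to return it as well. A hypothetical sketch (the subclass and its logging are illustrative, not part of the PR):

```python
from typing import Tuple

import numpy as np
import pandas as pd

from pydeseq2.default_inference import DefaultInference


class LoggingInference(DefaultInference):
    """Hypothetical subclass forwarding the new three-element return value."""

    def dispersion_trend_gamma_glm(
        self, covariates: pd.Series, targets: pd.Series
    ) -> Tuple[np.ndarray, np.ndarray, bool]:
        coeffs, predictions, converged = super().dispersion_trend_gamma_glm(
            covariates, targets
        )
        if not converged:
            print("Gamma GLM trend fit did not converge.")
        return coeffs, predictions, converged
```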