2 changes: 0 additions & 2 deletions .github/workflows/pr_validation.yml
@@ -17,8 +17,6 @@ jobs:
strategy:
matrix:
include:
- os: ubuntu-latest
python: "3.8"
- os: ubuntu-latest
python: "3.9"
- os: ubuntu-latest
1 change: 0 additions & 1 deletion README.md
@@ -71,7 +71,6 @@ For reference, the code is being tested in a github workflow (CI) with python
- pandas 1.4.3
- scikit-learn 1.1.1
- scipy 1.8.1
- statsmodels 0.13.2
```

Please don't hesitate to open an issue in case you encounter any issue due to possible deprecations.
5 changes: 0 additions & 5 deletions docs/source/conf.py
@@ -7,8 +7,6 @@

# -- Path setup --------------------------------------------------------------

import warnings

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
@@ -17,12 +15,9 @@
from pathlib import Path

import git
from statsmodels.tools.sm_exceptions import DomainWarning

import pydeseq2

# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
warnings.simplefilter("ignore", DomainWarning)

# -- Project information -----------------------------------------------------

1 change: 0 additions & 1 deletion docs/source/usage/requirements.rst
@@ -8,6 +8,5 @@ For reference, the code was tested with python 3.8 and the following package ver
- pandas==1.4.3
- scikit-learn==1.1.1
- scipy==1.8.1
- statsmodels==0.13.2

Please don't hesitate to open an issue in case you encounter any problems due to possible deprecations.
77 changes: 59 additions & 18 deletions pydeseq2/dds.py
@@ -343,7 +343,7 @@ def vst_fit(
if use_design:
# Check that the dispersion trend curve was fitted. If not, fit it.
# This will call previous functions in a cascade.
if "trend_coeffs" not in self.uns:
if "disp_function" not in self.uns:
self.fit_dispersion_trend()
else:
# Reduce the design matrix to an intercept and reconstruct at the end
@@ -477,7 +477,7 @@ def fit_size_factors(
warnings.warn(
"Every gene contains at least one zero, "
"cannot compute log geometric means. Switching to iterative mode.",
RuntimeWarning,
UserWarning,
Collaborator:
Out of curiosity, what is the rationale of selecting UserWarning?

Collaborator (Author):
The bad reason: it was easier to catch it simultaneously with a switch to "mean" trend curve fitting in the tests. The better reason is that I felt it was more suited (RuntimeWarning is supposed to cover dubious runtime behavior, which is not really what's going on here).

stacklevel=2,
)
self._fit_iterate_size_factors()
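
For context on the discussion above, here is a minimal standalone sketch (not part of the PR) of how the new `UserWarning` could be silenced or asserted; the toy counts, the `condition` column, and the `design_factors` argument are assumptions for illustration, and the data is only meant to trigger the "every gene contains a zero" code path, not to produce a meaningful fit:

```python
import warnings

import pandas as pd

from pydeseq2.dds import DeseqDataSet

# Every gene contains at least one zero, so fit_size_factors switches to the
# iterative mode and emits the UserWarning introduced in this diff.
counts = pd.DataFrame(
    [[0, 5, 3], [4, 0, 7], [2, 6, 0], [0, 1, 2]],
    columns=["gene1", "gene2", "gene3"],
)
metadata = pd.DataFrame({"condition": ["A", "A", "B", "B"]})

dds = DeseqDataSet(counts=counts, metadata=metadata, design_factors="condition")

with warnings.catch_warnings():
    warnings.simplefilter("ignore", UserWarning)  # silence the fallback warning
    dds.fit_size_factors()
```

In a test suite, `pytest.warns(UserWarning, match="Switching to iterative mode")` would assert the same behavior.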
@@ -601,12 +601,35 @@ def fit_dispersion_trend(self) -> None:
# Initialize coefficients
old_coeffs = pd.Series([0.1, 0.1])
coeffs = pd.Series([1.0, 1.0])

while (np.log(np.abs(coeffs / old_coeffs)) ** 2).sum() >= 1e-6:
while (coeffs > 0).all() and (
np.log(np.abs(coeffs / old_coeffs)) ** 2
).sum() >= 1e-6:
old_coeffs = coeffs
coeffs, predictions = self.inference.dispersion_trend_gamma_glm(
coeffs, predictions, converged = self.inference.dispersion_trend_gamma_glm(
covariates, targets
)

if not converged:
warnings.warn(
"The dispersion trend curve fitting did not converge. "
"Switching to a mean-based dispersion trend.",
UserWarning,
stacklevel=2,
)
self.uns["mean_disp"] = trim_mean(
self.varm["genewise_dispersions"][
self.varm["genewise_dispersions"] > 10 * self.min_disp
],
proportiontocut=0.001,
)

self.uns["disp_function_type"] = "mean"
self.varm["fitted_dispersions"] = np.full(
self.n_vars, self.uns["mean_disp"]
)

break

# Filter out genes that are too far away from the curve before refitting
pred_ratios = (
self[:, covariates.index].varm["genewise_dispersions"] / predictions
@@ -620,19 +643,25 @@
covariates[(pred_ratios < 1e-4) | (pred_ratios >= 15)].index,
inplace=True,
)
self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])

self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
self.uns["disp_function_type"] = "parametric"
self.varm["fitted_dispersions"][self.varm["non_zero"]] = self.disp_function(
self.varm["_normed_means"][self.varm["non_zero"]]
)

end = time.time()

if not self.quiet:
print(f"... done in {end - start:.2f} seconds.\n", file=sys.stderr)

self.uns["trend_coeffs"] = pd.Series(coeffs, index=["a0", "a1"])

self.varm["fitted_dispersions"] = np.full(self.n_vars, np.NaN)
self.varm["fitted_dispersions"][self.varm["non_zero"]] = dispersion_trend(
self.varm["_normed_means"][self.varm["non_zero"]],
self.uns["trend_coeffs"],
)
def disp_function(self, x):
"""Return the dispersion trend function at x."""
if self.uns["disp_function_type"] == "parametric":
return dispersion_trend(x, self.uns["trend_coeffs"])
elif self.uns["disp_function_type"] == "mean":
return self.uns["mean_disp"]

def fit_dispersion_prior(self) -> None:
"""Fit dispersion variance priors and standard deviation of log-residuals.
@@ -1001,11 +1030,14 @@ def _refit_without_outliers(

# Compute trend dispersions.
# Note: the trend curve is not refitted.
sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
sub_dds.uns["disp_function_type"] = self.uns["disp_function_type"]
if sub_dds.uns["disp_function_type"] == "parametric":
sub_dds.uns["trend_coeffs"] = self.uns["trend_coeffs"]
elif sub_dds.uns["disp_function_type"] == "mean":
sub_dds.uns["mean_disp"] = self.uns["mean_disp"]
sub_dds.varm["_normed_means"] = sub_dds.layers["normed_counts"].mean(0)
sub_dds.varm["fitted_dispersions"] = dispersion_trend(
sub_dds.varm["_normed_means"],
sub_dds.uns["trend_coeffs"],
sub_dds.varm["fitted_dispersions"] = sub_dds.disp_function(
sub_dds.varm["_normed_means"]
)

# Estimate MAP dispersions.
@@ -1089,9 +1121,18 @@ def objective(p):
& self.varm["non_zero"]
]

mean_disp = trimmed_mean(
self[:, use_for_mean_genes].varm["genewise_dispersions"], trim=0.001
if len(use_for_mean_genes) == 0:
print(
"No genes have a dispersion above 10 * min_disp in "
"_fit_iterate_size_factors."
)
break

mean_disp = trim_mean(
self[:, use_for_mean_genes].varm["genewise_dispersions"],
proportiontocut=0.001,
)

self.varm["fitted_dispersions"] = np.ones(self.n_vars) * mean_disp
self.fit_dispersion_prior()
self.fit_MAP_dispersions()
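As a side note, a self-contained sketch (not from the PR; the dispersion values and the 1e-8 lower bound are assumptions for illustration) of the mean-based fallback added to `fit_dispersion_trend` above: when the parametric fit fails to converge, the trend collapses to a single trimmed mean of the gene-wise dispersions lying above `10 * min_disp`, which every gene then receives as its fitted dispersion.

```python
import numpy as np
from scipy.stats import trim_mean

min_disp = 1e-8  # assumed lower bound on dispersions, mirroring DESeq2's default
rng = np.random.default_rng(0)
genewise_dispersions = rng.gamma(shape=2.0, scale=0.05, size=1000)

# Keep only dispersions well above the lower bound, then take a trimmed mean,
# mirroring the fallback branch added above.
kept = genewise_dispersions[genewise_dispersions > 10 * min_disp]
mean_disp = trim_mean(kept, proportiontocut=0.001)

# Every gene gets the same fitted dispersion under the "mean" trend type.
fitted_dispersions = np.full(genewise_dispersions.size, mean_disp)
print(mean_disp, fitted_dispersions[:3])
```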
41 changes: 25 additions & 16 deletions pydeseq2/default_inference.py
@@ -1,22 +1,17 @@
import warnings
from typing import Literal
from typing import Optional
from typing import Tuple

import numpy as np
import pandas as pd
import statsmodels.api as sm # type: ignore
from joblib import Parallel # type: ignore
from joblib import delayed
from joblib import parallel_backend
from statsmodels.tools.sm_exceptions import DomainWarning # type: ignore
from scipy.optimize import minimize # type: ignore

from pydeseq2 import inference
from pydeseq2 import utils

# Ignore DomainWarning raised by statsmodels when fitting a Gamma GLM with identity link.
warnings.simplefilter("ignore", DomainWarning)


class DefaultInference(inference.Inference):
"""Default DESeq2-related inference methods, using scipy/sklearn/numpy.
@@ -206,18 +201,32 @@ def wald_test( # noqa: D102

def dispersion_trend_gamma_glm( # noqa: D102
self, covariates: pd.Series, targets: pd.Series
) -> Tuple[np.ndarray, np.ndarray]:
covariates_w_intercept = sm.add_constant(covariates)
targets_fit = targets.values
) -> Tuple[np.ndarray, np.ndarray, bool]:
covariates_w_intercept = covariates.to_frame()
covariates_w_intercept.insert(0, "intercept", 1)
covariates_fit = covariates_w_intercept.values
glm_gamma = sm.GLM(
targets_fit,
covariates_fit,
family=sm.families.Gamma(link=sm.families.links.identity()),
targets_fit = targets.values

def loss(coeffs):
mu = covariates_fit @ coeffs
return np.nanmean(targets_fit / mu + np.log(mu), axis=0)

def grad(coeffs):
mu = covariates_fit @ coeffs
return -np.nanmean(
((targets_fit / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0
)

res = minimize(
loss,
x0=np.array([1.0, 1.0]),
jac=grad,
method="L-BFGS-B",
bounds=[(0, np.inf)],
)
res = glm_gamma.fit()
coeffs = res.params
return (coeffs, covariates_fit @ coeffs)

coeffs = res.x
return coeffs, covariates_fit @ coeffs, res.success

def lfc_shrink_nbinom_glm( # noqa: D102
self,
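To make the swap away from `statsmodels` explicit: for a Gamma GLM with identity link and fixed shape, the per-observation negative log-likelihood is, up to constants, `y / mu + log(mu)`, so minimizing its mean over positive coefficients with L-BFGS-B reproduces the trend fit without `statsmodels`. Below is a standalone sketch on synthetic data (the trend coefficients, sample size, and noise level are arbitrary assumptions, not values from pydeseq2):

```python
import numpy as np
from scipy.optimize import minimize

rng = np.random.default_rng(0)

# Synthetic dispersion-trend data: targets scatter around a0 + a1 / mean,
# the parametric trend form used by (py)DESeq2.
normed_means = rng.uniform(1.0, 1000.0, size=500)
true_a0, true_a1 = 0.05, 3.0
mu_true = true_a0 + true_a1 / normed_means
targets = mu_true * rng.gamma(shape=5.0, scale=1.0 / 5.0, size=500)

# Design matrix with an intercept column, as in dispersion_trend_gamma_glm.
covariates_fit = np.column_stack([np.ones_like(normed_means), 1.0 / normed_means])


def loss(coeffs):
    # Mean Gamma negative log-likelihood (up to constants), identity link.
    mu = covariates_fit @ coeffs
    return np.nanmean(targets / mu + np.log(mu))


def grad(coeffs):
    mu = covariates_fit @ coeffs
    return -np.nanmean(
        ((targets / mu - 1)[:, None] * covariates_fit) / mu[:, None], axis=0
    )


res = minimize(
    loss,
    x0=np.array([1.0, 1.0]),
    jac=grad,
    method="L-BFGS-B",
    bounds=2 * [(0, np.inf)],  # one (lower, upper) pair per coefficient
)
print(res.x, res.success)  # res.x should land close to (0.05, 3.0)
```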
21 changes: 9 additions & 12 deletions pydeseq2/ds.py
@@ -7,14 +7,14 @@
# import anndata as ad
import numpy as np
import pandas as pd
import statsmodels.api as sm # type: ignore
from scipy.optimize import root_scalar # type: ignore
from scipy.stats import f # type: ignore
from statsmodels.stats.multitest import multipletests # type: ignore
from scipy.stats import false_discovery_control # type: ignore

from pydeseq2.dds import DeseqDataSet
from pydeseq2.default_inference import DefaultInference
from pydeseq2.inference import Inference
from pydeseq2.utils import lowess
from pydeseq2.utils import make_MA_plot
from pydeseq2.utils import n_or_more_replicates

@@ -526,18 +526,15 @@ def _independent_filtering(self) -> None:
use = (self.base_mean >= cutoff) & (~self.p_values.isna())
U2 = self.p_values[use]
if not U2.empty:
result.loc[use, i] = multipletests(
U2, alpha=self.alpha, method="fdr_bh"
)[1]

num_rej = (result < self.alpha).sum(0)
lowess = sm.nonparametric.lowess(num_rej, theta, frac=1 / 5)
result.loc[use, i] = false_discovery_control(U2, method="bh")
num_rej = (result < self.alpha).sum(0).values
lowess_res = lowess(num_rej, theta, frac=1 / 5)

if num_rej.max() <= 10:
j = 0
else:
residual = num_rej[num_rej > 0] - lowess[num_rej > 0, 1]
thresh = lowess[:, 1].max() - np.sqrt(np.mean(residual**2))
residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0, 1]
thresh = lowess_res[:, 1].max() - np.sqrt(np.mean(residual**2))

if np.any(num_rej > thresh):
j = np.where(num_rej > thresh)[0][0]
@@ -557,8 +554,8 @@ def _p_value_adjustment(self) -> None:
self.run_wald_test()

self.padj = pd.Series(np.nan, index=self.dds.var_names)
self.padj.loc[~self.p_values.isna()] = multipletests(
self.p_values.dropna(), alpha=self.alpha, method="fdr_bh"
self.padj.loc[~self.p_values.isna()] = false_discovery_control(
self.p_values.dropna(), method="bh"
)[1]

def _cooks_filtering(self) -> None:
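For readers comparing the two APIs touched in `ds.py`: `scipy.stats.false_discovery_control` (SciPy >= 1.11) returns the Benjamini-Hochberg adjusted p-values directly, whereas the removed `statsmodels` call returned them as the second element of a tuple. A small sketch with made-up p-values:

```python
import numpy as np
from scipy.stats import false_discovery_control

p_values = np.array([0.001, 0.02, 0.04, 0.2, 0.5, 0.9])  # made-up p-values

# SciPy: the adjusted p-values are the return value itself.
padj = false_discovery_control(p_values, method="bh")

# The statsmodels call this replaces would have been:
#   multipletests(p_values, alpha=0.05, method="fdr_bh")[1]
print(padj)
```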
4 changes: 3 additions & 1 deletion pydeseq2/inference.py
@@ -285,7 +285,7 @@ def fit_moments_dispersions(
@abstractmethod
def dispersion_trend_gamma_glm(
self, covariates: pd.Series, targets: pd.Series
) -> Tuple[np.ndarray, np.ndarray]:
) -> Tuple[np.ndarray, np.ndarray, bool]:
"""Fit a gamma glm on gene dispersions.

The intercept should be concatenated in this method
Expand All @@ -304,6 +304,8 @@ def dispersion_trend_gamma_glm(
Coefficients of the regression.
predictions : ndarray
Predictions of the regression.
converged : bool
Whether the optimization converged.
"""

@abstractmethod
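Since the abstract signature above now includes a `converged` flag, any custom `Inference` implementation has to return it as well. A hypothetical sketch (the subclass and its logging are illustrative, not part of the PR):

```python
from typing import Tuple

import numpy as np
import pandas as pd

from pydeseq2.default_inference import DefaultInference


class LoggingInference(DefaultInference):
    """Hypothetical subclass forwarding the new three-element return value."""

    def dispersion_trend_gamma_glm(
        self, covariates: pd.Series, targets: pd.Series
    ) -> Tuple[np.ndarray, np.ndarray, bool]:
        coeffs, predictions, converged = super().dispersion_trend_gamma_glm(
            covariates, targets
        )
        if not converged:
            print("Gamma GLM trend fit did not converge.")
        return coeffs, predictions, converged
```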