PolicyEngine · baogorek · Dec 1, 2025 · Nov 27, 2025
diff --git a/changelog_entry.yaml b/changelog_entry.yaml
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+      - Fix weighted quantile/median to use inverse CDF method instead of interpolation
diff --git a/microdf/microseries.py b/microdf/microseries.py
@@ -112,29 +112,39 @@ def mean(self) -> float:
     def quantile(self, q: np.array) -> pd.Series:
         """Calculates weighted quantiles of the MicroSeries.
 
-        Doesn't exactly match unweighted quantiles of stacked values.
-        See stackoverflow.com/q/21844024#comment102342137_29677616.
+        Uses the inverse CDF method: the q-th quantile is the smallest
+        value where the cumulative weight proportion >= q. This matches
+        the default behavior of R's survey::svyquantile.
 
-        :param q: Array of quantiles to calculate.
-        :type q: np.array
+        :param q: Quantile(s) to calculate, must be in [0, 1].
+        :type q: float or np.array
 
-        :return: Array of weighted quantiles.
-        :rtype: pd.Series
+        :return: Weighted quantile value(s).
+        :rtype: float or pd.Series
         """
         values = np.array(self.values)
-        quantiles = np.array(q)
+        quantiles = np.atleast_1d(q)
         sample_weight = np.array(self.weights)
         assert np.all(quantiles >= 0) and np.all(
             quantiles <= 1
         ), "quantiles should be in [0, 1]"
         sorter = np.argsort(values)
         values = values[sorter]
         sample_weight = sample_weight[sorter]
-        weighted_quantiles = np.cumsum(sample_weight) - 0.5 * sample_weight
-        weighted_quantiles /= np.sum(sample_weight)
-        result = np.interp(quantiles, weighted_quantiles, values)
-        if quantiles.shape == ():
-            return result
+        cumsum = np.cumsum(sample_weight)
+        cumsum_normalized = cumsum / cumsum[-1]
+        result = np.array(
+            [
+                values[
+                    min(
+                        np.searchsorted(cumsum_normalized, qi), len(values) - 1
+                    )
+                ]
+                for qi in quantiles
+            ]
+        )
+        if np.array(q).shape == ():
+            return result[0]
         return pd.Series(result, index=quantiles)
 
     @scalar_function

diff --git a/microdf/tests/test_microseries_dataframe.py b/microdf/tests/test_microseries_dataframe.py
@@ -112,6 +112,55 @@ def test_median() -> None:
     assert series.median() == 4
 
 
+def test_weighted_quantile_skewed() -> None:
+    # 99% of the population has 0 income, 1% has 1M
+    # The median should be 0, not an interpolated value
+    series = mdf.MicroSeries([0, 1_000_000], weights=[99, 1])
+    assert series.median() == 0
+    assert series.quantile(0.5) == 0
+    # 99th percentile is still 0 since exactly 99% have 0
+    assert series.quantile(0.99) == 0
+    # Only quantiles strictly above 0.99 return 1M
+    assert series.quantile(1.0) == 1_000_000
+    # Test multiple quantiles
+    result = series.quantile([0.1, 0.5, 0.99, 1.0])
+    assert result[0.1] == 0
+    assert result[0.5] == 0
+    assert result[0.99] == 0
+    assert result[1.0] == 1_000_000
+
+
+def test_weighted_quantile_boundaries() -> None:
+    # Test q=0 returns minimum, q=1 returns maximum
+    series = mdf.MicroSeries([10, 20, 30], weights=[1, 1, 1])
+    assert series.quantile(0.0) == 10
+    assert series.quantile(1.0) == 30
+
+
+def test_weighted_quantile_equal_weights() -> None:
+    # With equal weights, should match "replicated" interpretation
+    # Values: 1, 2, 3 each with weight 2 -> like [1,1,2,2,3,3]
+    series = mdf.MicroSeries([1, 2, 3], weights=[2, 2, 2])
+    # cumsum_normalized = [2/6, 4/6, 6/6] = [0.333, 0.667, 1.0]
+    # median (0.5): smallest where cumsum >= 0.5 -> index 1 -> value 2
+    assert series.median() == 2
+    # 0.25 quantile: smallest where cumsum >= 0.25 -> index 0 -> value 1
+    assert series.quantile(0.25) == 1
+    # 0.75 quantile: smallest where cumsum >= 0.75 -> index 2 -> value 3
+    assert series.quantile(0.75) == 3
+
+
+def test_weighted_quantile_unsorted_input() -> None:
+    # Ensure sorting works correctly
+    series = mdf.MicroSeries([30, 10, 20], weights=[1, 2, 1])
+    # Sorted: values [10, 20, 30], weights [2, 1, 1]
+    # cumsum_normalized = [0.5, 0.75, 1.0]
+    assert series.quantile(0.0) == 10
+    assert series.quantile(0.5) == 10  # cumsum_normalized[0]=0.5 >= 0.5
+    assert series.quantile(0.6) == 20  # cumsum_normalized[1]=0.75 >= 0.6
+    assert series.quantile(1.0) == 30
+
+
 def test_unweighted_groupby() -> None:
     df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
     assert (df.groupby("x").z.sum().values == np.array([5.0, 6.0])).all()