Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- bump: patch
  changes:
    fixed:
    - Fix weighted quantile/median to use inverse CDF method instead of interpolation
34 changes: 22 additions & 12 deletions microdf/microseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,29 +112,39 @@ def mean(self) -> float:
def quantile(self, q: np.array) -> pd.Series:
    """Calculates weighted quantiles of the MicroSeries.

    Uses the inverse CDF method: the q-th quantile is the smallest
    value where the cumulative weight proportion >= q. This matches
    the default behavior of R's survey::svyquantile.

    :param q: Quantile(s) to calculate, must be in [0, 1].
    :type q: float or np.array

    :return: Weighted quantile value(s): a single value when ``q`` is a
        scalar, otherwise a Series indexed by the requested quantiles.
    :rtype: float or pd.Series

    :raises AssertionError: If any requested quantile is outside [0, 1].
    """
    quantiles = np.atleast_1d(q)
    # Raised explicitly instead of via ``assert`` so the check is not
    # stripped under ``python -O``; the exception type is kept unchanged.
    if not (np.all(quantiles >= 0) and np.all(quantiles <= 1)):
        raise AssertionError("quantiles should be in [0, 1]")

    values = np.array(self.values)
    sample_weight = np.array(self.weights)

    # Order observations by value so the cumulative weights form a CDF.
    sorter = np.argsort(values)
    values = values[sorter]
    sample_weight = sample_weight[sorter]

    cumsum = np.cumsum(sample_weight)
    cumsum_normalized = cumsum / cumsum[-1]

    # side="left" yields the first position whose cumulative proportion
    # is >= q, i.e. the inverse CDF. Clamp to the last valid index in case
    # no position satisfies the comparison.
    positions = np.searchsorted(cumsum_normalized, quantiles, side="left")
    positions = np.minimum(positions, len(values) - 1)
    result = values[positions]

    # A scalar request gets a scalar answer rather than a 1-element Series.
    if np.ndim(q) == 0:
        return result[0]
    return pd.Series(result, index=quantiles)

@scalar_function
Expand Down
49 changes: 49 additions & 0 deletions microdf/tests/test_microseries_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,55 @@ def test_median() -> None:
assert series.median() == 4


def test_weighted_quantile_skewed() -> None:
    """Check that a heavily skewed distribution is not interpolated.

    99% of the weight sits on 0 and 1% on 1,000,000, so the median and
    the 0.99 quantile must both be 0 rather than an interpolated value,
    while q = 1.0 yields 1,000,000.
    """
    skewed = mdf.MicroSeries([0, 1_000_000], weights=[99, 1])
    assert skewed.median() == 0

    # Scalar requests: exactly 99% of the weight is at 0, so 0.99 is
    # still 0 and only the top of the distribution returns 1M.
    scalar_cases = ((0.5, 0), (0.99, 0), (1.0, 1_000_000))
    for level, expected in scalar_cases:
        assert skewed.quantile(level) == expected

    # The same levels requested together come back keyed by quantile.
    levels = [0.1, 0.5, 0.99, 1.0]
    by_level = skewed.quantile(levels)
    for level, expected in zip(levels, (0, 0, 0, 1_000_000)):
        assert by_level[level] == expected


def test_weighted_quantile_boundaries() -> None:
    """q = 0 returns the minimum value and q = 1 returns the maximum."""
    uniform = mdf.MicroSeries([10, 20, 30], weights=[1, 1, 1])
    for level, expected in {0.0: 10, 1.0: 30}.items():
        assert uniform.quantile(level) == expected


def test_weighted_quantile_equal_weights() -> None:
    """Equal weights should match the "replicated" interpretation.

    Values 1, 2, 3 with weight 2 each behave like [1, 1, 2, 2, 3, 3];
    the normalized cumulative weights are [2/6, 4/6, 6/6].
    """
    replicated = mdf.MicroSeries([1, 2, 3], weights=[2, 2, 2])
    # 0.5 is first reached at index 1 (4/6 >= 0.5) -> value 2.
    assert replicated.median() == 2

    expected_by_level = {
        0.25: 1,  # index 0: 2/6 >= 0.25
        0.75: 3,  # index 2: 6/6 >= 0.75
    }
    for level, expected in expected_by_level.items():
        assert replicated.quantile(level) == expected


def test_weighted_quantile_unsorted_input() -> None:
    """Quantiles must be computed on values sorted together with weights.

    Sorted, the values are [10, 20, 30] with weights [2, 1, 1], giving
    normalized cumulative weights [0.5, 0.75, 1.0].
    """
    shuffled = mdf.MicroSeries([30, 10, 20], weights=[1, 2, 1])
    cases = (
        (0.0, 10),
        (0.5, 10),  # first cumulative share 0.5 >= 0.5
        (0.6, 20),  # second cumulative share 0.75 >= 0.6
        (1.0, 30),
    )
    for level, expected in cases:
        assert shuffled.quantile(level) == expected


def test_unweighted_groupby() -> None:
df = mdf.MicroDataFrame({"x": [1, 2], "y": [3, 4], "z": [5, 6]})
assert (df.groupby("x").z.sum().values == np.array([5.0, 6.0])).all()
Expand Down