29 changes: 16 additions & 13 deletions .github/workflows/pr.yaml
@@ -60,27 +60,30 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: '20'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"
- name: Install dependencies

- name: Install Python dependencies
run: |
uv pip install -e ".[dev,docs]" --system
- name: Build Jupyter Book
uv pip install -e ".[dev]" --system

- name: Install MyST
run: |
jb build docs/.
- name: Check documentation build
npm install -g mystmd

- name: Build documentation with MyST
run: |
for notebook in $(find docs/_build/jupyter_execute -name "*.ipynb"); do
if grep -q '"output_type": "error"' "$notebook"; then
echo "Error found in $notebook"
cat "$notebook"
exit 1
fi
done
cd docs && myst build --html
6 changes: 6 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,6 @@
- bump: minor
changes:
fixed:
- MicroDataFrame.loc[] and .iloc[] now preserve MicroDataFrame type and weights when filtering rows (fixes issue #265)
- MicroDataFrame.groupby(col)["y"].sum() and groupby(col)[["y"]].sum() now use weighted aggregation (fixes issue #193)
- Documentation build updated to use Jupyter Book 2.0 / MyST
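To make the first two changelog entries concrete, here is a minimal usage sketch distilled from the tests added later in this PR (see microdf/tests/test_microseries_dataframe.py); the data and expected values mirror those tests rather than documenting any further API:

import microdf as mdf

# .loc[] now returns a MicroDataFrame carrying the matching weights
df = mdf.MicroDataFrame({"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50])
subset = df.loc[df.x > 2]            # keeps weights 30, 40, 50
assert subset.x.sum() == 500.0       # weighted sum: 3*30 + 4*40 + 5*50

# groupby column selection now aggregates with weights
d = mdf.MicroDataFrame({"g": ["a", "a", "b"], "y": [1, 2, 3]}, weights=[4, 5, 6])
assert d.groupby("g")["y"].sum()["a"] == 14.0   # 1*4 + 2*5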
29 changes: 29 additions & 0 deletions docs/myst.yml
@@ -0,0 +1,29 @@
version: 1
project:
title: microdf documentation
authors:
- name:
given: Max
family: Ghenis
copyright: '2024'
github: PolicyEngine/microdf
jupyter:
myst:
enable_extensions:
- colon_fence
- deflist
- dollarmath
- linkify
- substitution
toc:
- file: home.md
- file: examples.md
children:
- file: gini.ipynb
site:
options:
logo: microdf_logo.png
favicon: ''
analytics_google: ''
folders: true
template: book-theme
201 changes: 191 additions & 10 deletions microdf/microdataframe.py
@@ -12,6 +12,80 @@
logger = logging.getLogger(__name__)


class _MicroLocIndexer:
"""Custom loc indexer that returns MicroDataFrame with proper weights."""

def __init__(self, mdf: "MicroDataFrame"):
self._mdf = mdf
# Get the parent's loc indexer
self._parent_loc = pd.DataFrame.loc.fget(mdf)

def __getitem__(self, key):
# Use the parent DataFrame's loc indexer
result = self._parent_loc[key]

if isinstance(result, pd.DataFrame):
# Get the filtered weights based on the result's index
new_weights = self._mdf.weights.reindex(result.index)
return MicroDataFrame(result, weights=new_weights)
elif isinstance(result, pd.Series):
# Single row or column selected
if result.name in self._mdf.columns:
# Column was selected - return MicroSeries with all weights
return MicroSeries(result, weights=self._mdf.weights)
else:
# Row was selected - return as-is (scalar values for each col)
return result
else:
# Scalar value
return result

def __setitem__(self, key, value):
self._parent_loc[key] = value
self._mdf._link_all_weights()


class _MicroILocIndexer:
"""Custom iloc indexer that returns MicroDataFrame with proper weights."""

def __init__(self, mdf: "MicroDataFrame"):
self._mdf = mdf
# Get the parent's iloc indexer
self._parent_iloc = pd.DataFrame.iloc.fget(mdf)

def __getitem__(self, key):
# Use the parent DataFrame's iloc indexer
result = self._parent_iloc[key]

if isinstance(result, pd.DataFrame):
# Get the filtered weights based on the result's index
new_weights = self._mdf.weights.iloc[
self._mdf.index.get_indexer(result.index)
]
new_weights = pd.Series(new_weights.values, index=result.index)
return MicroDataFrame(result, weights=new_weights)
elif isinstance(result, pd.Series):
# Single row or column selected
if isinstance(key, tuple) and len(key) == 2:
# df.iloc[:, col_idx] - column selection
row_key = key[0]
if isinstance(row_key, slice) and row_key == slice(None):
# All rows selected for a column
return MicroSeries(result, weights=self._mdf.weights)
# Check if this is a column (result index matches mdf index)
if result.index.equals(self._mdf.index):
return MicroSeries(result, weights=self._mdf.weights)
# Row selection - return as-is
return result
else:
# Scalar value
return result

def __setitem__(self, key, value):
self._parent_iloc[key] = value
self._mdf._link_all_weights()


class MicroDataFrame(pd.DataFrame):
def __init__(self, *args, weights=None, **kwargs):
"""A DataFrame-inheriting class for weighted microdata. Weights can be
@@ -26,6 +26,100 @@ def __init__(self, *args, weights=None, **kwargs):
self._link_all_weights()
self.override_df_functions()

@property
def loc(self) -> _MicroLocIndexer:
"""Label-based indexer that preserves MicroDataFrame type and weights.

:return: Custom loc indexer for MicroDataFrame
"""
return _MicroLocIndexer(self)

@property
def iloc(self) -> _MicroILocIndexer:
"""Integer-based indexer that preserves MicroDataFrame type and
weights.

:return: Custom iloc indexer for MicroDataFrame
"""
return _MicroILocIndexer(self)

def override_df_functions(self) -> None:
"""Override DataFrame functions to work with weighted operations."""
for name in MicroSeries.FUNCTIONS:
@@ -643,6 +734,7 @@ def __repr__(self) -> str:

class MicroDataFrameGroupBy(pd.core.groupby.generic.DataFrameGroupBy):
def _init(self, by: Union[str, List]):
self._by = by
self.columns = list(self.obj.columns)
if isinstance(by, list):
for column in by:
@@ -656,6 +748,10 @@ def _init(self, by: Union[str, List]):
for col in self.columns
if pd.api.types.is_numeric_dtype(self.obj[col])
]
# Store reference to weights groupby for column selection
self._weights_groupby = copy.deepcopy(
super().__getitem__("__tmp_weights")
)
for fn_name in MicroSeries.SCALAR_FUNCTIONS:

def get_fn(name):
Expand All @@ -669,11 +765,9 @@ def fn(*args, **kwargs):
except Exception:
# Skip columns that can't be aggregated
pass
return (
MicroDataFrame(results)
if results
else MicroDataFrame()
)
# Return plain DataFrame - aggregated results don't have
# per-row weights (weights were already applied)
return pd.DataFrame(results) if results else pd.DataFrame()

return fn

Expand All @@ -691,12 +785,99 @@ def fn(*args, **kwargs) -> Union[pd.Series, pd.DataFrame]:
except Exception:
# Skip columns that can't be aggregated
pass
return (
MicroDataFrame(results)
if results
else MicroDataFrame()
)
# Return plain DataFrame - aggregated results don't have
# per-row weights (weights were already applied)
return pd.DataFrame(results) if results else pd.DataFrame()

return fn

setattr(self, fn_name, get_fn(fn_name))

def __getitem__(
self, key: Union[str, List]
) -> Union["MicroSeriesGroupBy", "MicroDataFrameGroupBy"]:
"""Select columns from the groupby object while preserving weights.

This ensures that operations like groupby(col)["y"].sum() or
groupby(col)[["y"]].sum() use weighted aggregation.

:param key: Column name or list of column names
:return: MicroSeriesGroupBy for single column, MicroDataFrameGroupBy
for multiple columns
"""
if isinstance(key, str):
# Single column - return MicroSeriesGroupBy
result = super().__getitem__(key)
result.__class__ = MicroSeriesGroupBy
result._init()
result.weights = self._weights_groupby
return result
else:
# Multiple columns - return a new MicroDataFrameGroupBy
# with only the selected columns
result = super().__getitem__(key)
result.__class__ = MicroDataFrameGroupBy
# Re-initialize with the subset of columns
result._by = self._by
result.columns = list(key) if hasattr(key, "__iter__") else [key]
result.numeric_columns = [
col
for col in result.columns
if pd.api.types.is_numeric_dtype(result.obj[col])
]
result._weights_groupby = self._weights_groupby
# Set up the column attributes as MicroSeriesGroupBy
for col in result.columns:
col_gb = super().__getitem__(col)
col_gb.__class__ = MicroSeriesGroupBy
col_gb._init()
col_gb.weights = self._weights_groupby
setattr(result, col, col_gb)
# Set up the scalar and vector functions
for fn_name in MicroSeries.SCALAR_FUNCTIONS:

def get_scalar_fn(name, res):
def fn(*args, **kwargs):
results = {}
for col in res.numeric_columns:
try:
results[col] = getattr(
getattr(res, col), name
)(*args, **kwargs)
except Exception:
pass
# Return plain DataFrame - aggregated results don't
# have per-row weights (weights were already applied)
return (
pd.DataFrame(results)
if results
else pd.DataFrame()
)

return fn

setattr(result, fn_name, get_scalar_fn(fn_name, result))
for fn_name in MicroSeries.VECTOR_FUNCTIONS:

def get_vector_fn(name, res):
def fn(*args, **kwargs):
results = {}
for col in res.numeric_columns:
try:
results[col] = getattr(
getattr(res, col), name
)(*args, **kwargs)
except Exception:
pass
# Return plain DataFrame - aggregated results don't
# have per-row weights (weights were already applied)
return (
pd.DataFrame(results)
if results
else pd.DataFrame()
)

return fn

setattr(result, fn_name, get_vector_fn(fn_name, result))
return result
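One non-obvious detail in the indexer wrappers above: because MicroDataFrame overrides the loc and iloc properties, _MicroLocIndexer and _MicroILocIndexer reach the underlying pandas indexer by calling the base-class property's getter directly, as in pd.DataFrame.loc.fget(mdf). A minimal standalone sketch of that pattern, in plain pandas with no microdf assumptions:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

# Accessing `loc` on the class yields the property object itself;
# calling its getter (`fget`) on an instance returns the ordinary
# pandas _LocIndexer, bypassing any overriding subclass property.
plain_loc = pd.DataFrame.loc.fget(df)
assert plain_loc[df["x"] > 1].shape == (2, 1)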
57 changes: 57 additions & 0 deletions microdf/tests/test_microseries_dataframe.py
@@ -287,3 +287,60 @@ def test_reset_index_inplace() -> None:
assert "second" in mdf_multi.columns
assert list(mdf_multi.index) == [0, 1, 2, 3]
np.testing.assert_array_equal(mdf_multi.weights.values, weights)


def test_loc_preserves_weights() -> None:
"""Test that .loc[] returns MicroDataFrame with proper weights (issue
#265)."""
df = mdf.MicroDataFrame(
{"one": [1, 1, 1, 1, 1]}, weights=[10, 20, 30, 40, 50]
)

# Filter all rows (should get same weights)
filtered = df.loc[df.one == 1]
assert isinstance(filtered, MicroDataFrame)
assert filtered.one.sum() == 150.0 # Weighted sum

# Partial filter
df2 = mdf.MicroDataFrame(
{"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50]
)
subset = df2.loc[df2.x > 2]
assert isinstance(subset, MicroDataFrame)
assert subset.x.sum() == 500.0 # 3*30 + 4*40 + 5*50 = 500
np.testing.assert_array_equal(subset.weights.values, [30.0, 40.0, 50.0])


def test_iloc_preserves_weights() -> None:
"""Test that .iloc[] returns MicroDataFrame with proper weights."""
df = mdf.MicroDataFrame(
{"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50]
)

# Select rows by position
subset = df.iloc[2:5]
assert isinstance(subset, MicroDataFrame)
assert subset.x.sum() == 500.0 # 3*30 + 4*40 + 5*50 = 500
np.testing.assert_array_equal(subset.weights.values, [30.0, 40.0, 50.0])


def test_groupby_column_selection() -> None:
"""Test that groupby column selection preserves weights (issue #193)."""
d = mdf.MicroDataFrame(
dict(g=["a", "a", "b"], y=[1, 2, 3]), weights=[4, 5, 6]
)

# Test single column string selection
result_str = d.groupby("g")["y"].sum()
assert result_str["a"] == 14.0 # 1*4 + 2*5 = 14
assert result_str["b"] == 18.0 # 3*6 = 18

# Test list column selection
result_list = d.groupby("g")[["y"]].sum()
assert result_list.loc["a", "y"] == 14.0
assert result_list.loc["b", "y"] == 18.0

# Aggregated results should be plain DataFrame (no spurious weight column)
result_all = d.groupby("g").sum()
assert "weight" not in result_all.columns
assert list(result_all.columns) == ["y"]
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -30,9 +30,8 @@ dev = [
"pytest-cov",
"setuptools",
]
docs = [
"jupyter_book",
]
# Note: Documentation uses MyST (Jupyter Book 2.0) which is installed via npm
# Run: cd docs && myst build --html

[tool.setuptools.packages.find]
where = ["."]