29 changes: 16 additions & 13 deletions .github/workflows/pr.yaml
@@ -60,27 +60,30 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'

- name: Set up Node.js
uses: actions/setup-node@v4
with:
node-version: '20'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"
- name: Install dependencies

- name: Install Python dependencies
run: |
uv pip install -e ".[dev,docs]" --system
- name: Build Jupyter Book
uv pip install -e ".[dev]" --system

- name: Install MyST
run: |
jb build docs/.
- name: Check documentation build
npm install -g mystmd

- name: Build documentation with MyST
run: |
for notebook in $(find docs/_build/jupyter_execute -name "*.ipynb"); do
if grep -q '"output_type": "error"' "$notebook"; then
echo "Error found in $notebook"
cat "$notebook"
exit 1
fi
done
cd docs && myst build --html
6 changes: 6 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,6 @@
- bump: minor
changes:
fixed:
- MicroDataFrame.loc[] and .iloc[] now preserve MicroDataFrame type and weights when filtering rows (fixes issue #265)
- MicroDataFrame.groupby(col)["y"].sum() and groupby(col)[["y"]].sum() now use weighted aggregation (fixes issue #193)
- Documentation build updated to use Jupyter Book 2.0 / MyST
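To make the first two changelog entries concrete, here is a minimal usage sketch distilled from the tests added later in this PR (see microdf/tests/test_microseries_dataframe.py); the data and expected values mirror those tests rather than documenting any further API:

import microdf as mdf

# .loc[] now returns a MicroDataFrame carrying the matching weights
df = mdf.MicroDataFrame({"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50])
subset = df.loc[df.x > 2]            # keeps weights 30, 40, 50
assert subset.x.sum() == 500.0       # weighted sum: 3*30 + 4*40 + 5*50

# groupby column selection now aggregates with weights
d = mdf.MicroDataFrame({"g": ["a", "a", "b"], "y": [1, 2, 3]}, weights=[4, 5, 6])
assert d.groupby("g")["y"].sum()["a"] == 14.0   # 1*4 + 2*5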
29 changes: 29 additions & 0 deletions docs/myst.yml
@@ -0,0 +1,29 @@
version: 1
project:
title: microdf documentation
authors:
- name:
given: Max
family: Ghenis
copyright: '2024'
github: PolicyEngine/microdf
jupyter:
myst:
enable_extensions:
- colon_fence
- deflist
- dollarmath
- linkify
- substitution
toc:
- file: home.md
- file: examples.md
children:
- file: gini.ipynb
site:
options:
logo: microdf_logo.png
favicon: ''
analytics_google: ''
folders: true
template: book-theme
201 changes: 191 additions & 10 deletions microdf/microdataframe.py
@@ -12,6 +12,80 @@
logger = logging.getLogger(__name__)


class _MicroLocIndexer:
"""Custom loc indexer that returns MicroDataFrame with proper weights."""

def __init__(self, mdf: "MicroDataFrame"):
self._mdf = mdf
# Get the parent's loc indexer
self._parent_loc = pd.DataFrame.loc.fget(mdf)

def __getitem__(self, key):
# Use the parent DataFrame's loc indexer
result = self._parent_loc[key]

if isinstance(result, pd.DataFrame):
# Get the filtered weights based on the result's index
new_weights = self._mdf.weights.reindex(result.index)
return MicroDataFrame(result, weights=new_weights)
elif isinstance(result, pd.Series):
# Single row or column selected
if result.name in self._mdf.columns:
# Column was selected - return MicroSeries with all weights
return MicroSeries(result, weights=self._mdf.weights)
else:
# Row was selected - return as-is (scalar values for each col)
return result
else:
# Scalar value
return result

def __setitem__(self, key, value):
self._parent_loc[key] = value
self._mdf._link_all_weights()


class _MicroILocIndexer:
"""Custom iloc indexer that returns MicroDataFrame with proper weights."""

def __init__(self, mdf: "MicroDataFrame"):
self._mdf = mdf
# Get the parent's iloc indexer
self._parent_iloc = pd.DataFrame.iloc.fget(mdf)

def __getitem__(self, key):
# Use the parent DataFrame's iloc indexer
result = self._parent_iloc[key]

if isinstance(result, pd.DataFrame):
# Get the filtered weights based on the result's index
new_weights = self._mdf.weights.iloc[
self._mdf.index.get_indexer(result.index)
]
new_weights = pd.Series(new_weights.values, index=result.index)
return MicroDataFrame(result, weights=new_weights)
elif isinstance(result, pd.Series):
# Single row or column selected
if isinstance(key, tuple) and len(key) == 2:
# df.iloc[:, col_idx] - column selection
row_key = key[0]
if isinstance(row_key, slice) and row_key == slice(None):
# All rows selected for a column
return MicroSeries(result, weights=self._mdf.weights)
# Check if this is a column (result index matches mdf index)
if result.index.equals(self._mdf.index):
return MicroSeries(result, weights=self._mdf.weights)
# Row selection - return as-is
return result
else:
# Scalar value
return result

def __setitem__(self, key, value):
self._parent_iloc[key] = value
self._mdf._link_all_weights()


class MicroDataFrame(pd.DataFrame):
def __init__(self, *args, weights=None, **kwargs):
"""A DataFrame-inheriting class for weighted microdata. Weights can be
@@ -26,6 +26,100 @@ def __init__(self, *args, weights=None, **kwargs):
self._link_all_weights()
self.override_df_functions()

@property
def loc(self) -> _MicroLocIndexer:
"""Label-based indexer that preserves MicroDataFrame type and weights.

:return: Custom loc indexer for MicroDataFrame
"""
return _MicroLocIndexer(self)

@property
def iloc(self) -> _MicroILocIndexer:
"""Integer-based indexer that preserves MicroDataFrame type and
weights.

:return: Custom iloc indexer for MicroDataFrame
"""
return _MicroILocIndexer(self)

def override_df_functions(self) -> None:
"""Override DataFrame functions to work with weighted operations."""
for name in MicroSeries.FUNCTIONS:
@@ -643,6 +734,7 @@ def __repr__(self) -> str:

class MicroDataFrameGroupBy(pd.core.groupby.generic.DataFrameGroupBy):
def _init(self, by: Union[str, List]):
self._by = by
self.columns = list(self.obj.columns)
if isinstance(by, list):
for column in by:
@@ -656,6 +748,10 @@ def _init(self, by: Union[str, List]):
for col in self.columns
if pd.api.types.is_numeric_dtype(self.obj[col])
]
# Store reference to weights groupby for column selection
self._weights_groupby = copy.deepcopy(
super().__getitem__("__tmp_weights")
)
for fn_name in MicroSeries.SCALAR_FUNCTIONS:

def get_fn(name):
Expand All @@ -669,11 +765,9 @@ def fn(*args, **kwargs):
except Exception:
# Skip columns that can't be aggregated
pass
return (
MicroDataFrame(results)
if results
else MicroDataFrame()
)
# Return plain DataFrame - aggregated results don't have
# per-row weights (weights were already applied)
return pd.DataFrame(results) if results else pd.DataFrame()

return fn

Expand All @@ -691,12 +785,99 @@ def fn(*args, **kwargs) -> Union[pd.Series, pd.DataFrame]:
except Exception:
# Skip columns that can't be aggregated
pass
return (
MicroDataFrame(results)
if results
else MicroDataFrame()
)
# Return plain DataFrame - aggregated results don't have
# per-row weights (weights were already applied)
return pd.DataFrame(results) if results else pd.DataFrame()

return fn

setattr(self, fn_name, get_fn(fn_name))

def __getitem__(
self, key: Union[str, List]
) -> Union["MicroSeriesGroupBy", "MicroDataFrameGroupBy"]:
"""Select columns from the groupby object while preserving weights.

This ensures that operations like groupby(col)["y"].sum() or
groupby(col)[["y"]].sum() use weighted aggregation.

:param key: Column name or list of column names
:return: MicroSeriesGroupBy for single column, MicroDataFrameGroupBy
for multiple columns
"""
if isinstance(key, str):
# Single column - return MicroSeriesGroupBy
result = super().__getitem__(key)
result.__class__ = MicroSeriesGroupBy
result._init()
result.weights = self._weights_groupby
return result
else:
# Multiple columns - return a new MicroDataFrameGroupBy
# with only the selected columns
result = super().__getitem__(key)
result.__class__ = MicroDataFrameGroupBy
# Re-initialize with the subset of columns
result._by = self._by
result.columns = list(key) if hasattr(key, "__iter__") else [key]
result.numeric_columns = [
col
for col in result.columns
if pd.api.types.is_numeric_dtype(result.obj[col])
]
result._weights_groupby = self._weights_groupby
# Set up the column attributes as MicroSeriesGroupBy
for col in result.columns:
col_gb = super().__getitem__(col)
col_gb.__class__ = MicroSeriesGroupBy
col_gb._init()
col_gb.weights = self._weights_groupby
setattr(result, col, col_gb)
# Set up the scalar and vector functions
for fn_name in MicroSeries.SCALAR_FUNCTIONS:

def get_scalar_fn(name, res):
def fn(*args, **kwargs):
results = {}
for col in res.numeric_columns:
try:
results[col] = getattr(
getattr(res, col), name
)(*args, **kwargs)
except Exception:
pass
# Return plain DataFrame - aggregated results don't
# have per-row weights (weights were already applied)
return (
pd.DataFrame(results)
if results
else pd.DataFrame()
)

return fn

setattr(result, fn_name, get_scalar_fn(fn_name, result))
for fn_name in MicroSeries.VECTOR_FUNCTIONS:

def get_vector_fn(name, res):
def fn(*args, **kwargs):
results = {}
for col in res.numeric_columns:
try:
results[col] = getattr(
getattr(res, col), name
)(*args, **kwargs)
except Exception:
pass
# Return plain DataFrame - aggregated results don't
# have per-row weights (weights were already applied)
return (
pd.DataFrame(results)
if results
else pd.DataFrame()
)

return fn

setattr(result, fn_name, get_vector_fn(fn_name, result))
return result
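One non-obvious detail in the indexer wrappers above: because MicroDataFrame overrides the loc and iloc properties, _MicroLocIndexer and _MicroILocIndexer reach the underlying pandas indexer by calling the base-class property's getter directly, as in pd.DataFrame.loc.fget(mdf). A minimal standalone sketch of that pattern, in plain pandas with no microdf assumptions:

import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3]})

# Accessing `loc` on the class yields the property object itself;
# calling its getter (`fget`) on an instance returns the ordinary
# pandas _LocIndexer, bypassing any overriding subclass property.
plain_loc = pd.DataFrame.loc.fget(df)
assert plain_loc[df["x"] > 1].shape == (2, 1)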
57 changes: 57 additions & 0 deletions microdf/tests/test_microseries_dataframe.py
@@ -287,3 +287,60 @@ def test_reset_index_inplace() -> None:
assert "second" in mdf_multi.columns
assert list(mdf_multi.index) == [0, 1, 2, 3]
np.testing.assert_array_equal(mdf_multi.weights.values, weights)


def test_loc_preserves_weights() -> None:
"""Test that .loc[] returns MicroDataFrame with proper weights (issue
#265)."""
df = mdf.MicroDataFrame(
{"one": [1, 1, 1, 1, 1]}, weights=[10, 20, 30, 40, 50]
)

# Filter all rows (should get same weights)
filtered = df.loc[df.one == 1]
assert isinstance(filtered, MicroDataFrame)
assert filtered.one.sum() == 150.0 # Weighted sum

# Partial filter
df2 = mdf.MicroDataFrame(
{"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50]
)
subset = df2.loc[df2.x > 2]
assert isinstance(subset, MicroDataFrame)
assert subset.x.sum() == 500.0 # 3*30 + 4*40 + 5*50 = 500
np.testing.assert_array_equal(subset.weights.values, [30.0, 40.0, 50.0])


def test_iloc_preserves_weights() -> None:
"""Test that .iloc[] returns MicroDataFrame with proper weights."""
df = mdf.MicroDataFrame(
{"x": [1, 2, 3, 4, 5]}, weights=[10, 20, 30, 40, 50]
)

# Select rows by position
subset = df.iloc[2:5]
assert isinstance(subset, MicroDataFrame)
assert subset.x.sum() == 500.0 # 3*30 + 4*40 + 5*50 = 500
np.testing.assert_array_equal(subset.weights.values, [30.0, 40.0, 50.0])


def test_groupby_column_selection() -> None:
"""Test that groupby column selection preserves weights (issue #193)."""
d = mdf.MicroDataFrame(
dict(g=["a", "a", "b"], y=[1, 2, 3]), weights=[4, 5, 6]
)

# Test single column string selection
result_str = d.groupby("g")["y"].sum()
assert result_str["a"] == 14.0 # 1*4 + 2*5 = 14
assert result_str["b"] == 18.0 # 3*6 = 18

# Test list column selection
result_list = d.groupby("g")[["y"]].sum()
assert result_list.loc["a", "y"] == 14.0
assert result_list.loc["b", "y"] == 18.0

# Aggregated results should be plain DataFrame (no spurious weight column)
result_all = d.groupby("g").sum()
assert "weight" not in result_all.columns
assert list(result_all.columns) == ["y"]
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -30,9 +30,8 @@ dev = [
"pytest-cov",
"setuptools",
]
docs = [
"jupyter_book",
]
# Note: Documentation uses MyST (Jupyter Book 2.0) which is installed via npm
# Run: cd docs && myst build --html

[tool.setuptools.packages.find]
where = ["."]