Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
+- bump: patch
+  changes:
+    fixed:
+    - Fixed pandas 3.0 compatibility issues with MicroSeries method access and Copy-on-Write behavior
31 changes: 18 additions & 13 deletions microdf/microdataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -278,18 +278,18 @@ def __setitem__(self, *args, **kwargs) -> None:
         self._link_all_weights()

     def _link_weights(self, column) -> None:
-        # self[column] = ... triggers __setitem__, which forces pd.Series
-        # this workaround avoids that
-        self[column].__class__ = MicroSeries
-        self[column].set_weights(self.weights)
+        # In pandas 3.0+, we can't modify column classes in-place due to CoW.
+        # Instead, we rely on __getitem__ to wrap columns as MicroSeries on
+        # access. This method is kept for backward compatibility but is now
+        # a no-op.
+        pass

     def _link_all_weights(self) -> None:
         if self.weights is None:
             if len(self) > 0:
                 self.set_weights(np.ones((len(self))))
-        for column in self.columns:
-            if column != self.weights_col:
-                self._link_weights(column)
+        # In pandas 3.0+, columns are wrapped as MicroSeries on access via
+        # __getitem__, not stored as MicroSeries internally.

     def set_weights(
         self,
Expand Down Expand Up @@ -365,7 +365,7 @@ def nullify_weights(self) -> None:

     def __getitem__(
         self, key: Union[str, List]
-    ) -> Union[pd.Series, pd.DataFrame]:
+    ) -> Union[MicroSeries, "MicroDataFrame"]:
         # Let pandas handle the initial slicing
         result = super().__getitem__(key)

Expand All @@ -374,17 +374,22 @@ def __getitem__(
             new_weights = self.weights.reindex(result.index)
             return MicroDataFrame(result, weights=new_weights)

-        # Otherwise, the result is a Series or a scalar, so just return it
+        # If the result is a Series (single column), wrap as MicroSeries
+        if isinstance(result, pd.Series):
+            return MicroSeries(result, weights=self.weights)
+
+        # Otherwise, the result is a scalar, so just return it
         return result

     def catch_series_relapse(self) -> None:
-        for col in self.columns:
-            if self[col].__class__ == pd.Series:
-                self._link_weights(col)
+        # In pandas 3.0+, we don't need to track series class changes since
+        # __getitem__ always wraps columns as MicroSeries on access.
+        pass

     def __setattr__(self, key, value) -> None:
         super().__setattr__(key, value)
-        self.catch_series_relapse()
+        # No need to call catch_series_relapse in pandas 3.0+ since we wrap
+        # on access rather than store MicroSeries internally.

     def reset_index(
         self,
Expand Down
130 changes: 73 additions & 57 deletions microdf/tests/test_pandas3_compatibility.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""
Tests for pandas 3.0.0 compatibility in microdf.
"""Tests for pandas 3.0.0 compatibility in microdf.

These tests verify that microdf works correctly with pandas 3.0.0,
which introduces:
Expand All @@ -10,18 +9,17 @@

import numpy as np
import pandas as pd
import pytest

from microdf.microseries import MicroSeries
from microdf.microdataframe import MicroDataFrame
from microdf.microseries import MicroSeries


class TestMicroSeriesSubclassPreservation:
"""Test that MicroSeries subclass is preserved across operations."""

def test_microseries_set_weights_after_creation(self):
"""
Ensure set_weights works on MicroSeries.
"""Ensure set_weights works on MicroSeries.

This is the error reported in pandas 3:
AttributeError: 'Series' object has no attribute 'set_weights'
"""
Expand All @@ -34,108 +32,115 @@ def test_microseries_set_weights_after_creation(self):
assert np.allclose(ms.weights, [2.0, 2.0, 2.0])

def test_microseries_preserved_after_arithmetic(self):
"""
Arithmetic operations should return MicroSeries, not plain Series.
"""
"""Arithmetic operations should return MicroSeries, not plain
Series."""
ms = MicroSeries([1, 2, 3], weights=np.array([1.0, 2.0, 3.0]))

# Addition
result = ms + 1
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"
assert hasattr(result, "weights")
assert hasattr(result, "set_weights")

# Multiplication
result = ms * 2
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"

# Division
result = ms / 2
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"

def test_microseries_preserved_after_comparison(self):
"""
Comparison operations should return MicroSeries, not plain Series.
"""
"""Comparison operations should return MicroSeries, not plain
Series."""
ms = MicroSeries([1, 2, 3], weights=np.array([1.0, 2.0, 3.0]))

# Greater than
result = ms > 1
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"
assert hasattr(result, "weights")

# Less than
result = ms < 3
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"

def test_microseries_preserved_after_indexing(self):
"""
Indexing operations should return MicroSeries, not plain Series.
"""
ms = MicroSeries([1, 2, 3, 4, 5], weights=np.array([1.0, 2.0, 3.0, 4.0, 5.0]))
"""Indexing operations should return MicroSeries, not plain Series."""
ms = MicroSeries(
[1, 2, 3, 4, 5], weights=np.array([1.0, 2.0, 3.0, 4.0, 5.0])
)

# Boolean indexing
result = ms[ms > 2]
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"
assert hasattr(result, "weights")

# Slice indexing
result = ms[1:3]
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"


class TestMicroDataFrameSubclassPreservation:
"""Test that MicroDataFrame column access returns MicroSeries."""

def test_microdataframe_column_returns_microseries(self):
"""
Accessing a column from MicroDataFrame should return MicroSeries.
"""
"""Accessing a column from MicroDataFrame should return MicroSeries."""
mdf = MicroDataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6]},
weights=np.array([1.0, 2.0, 3.0])
{"a": [1, 2, 3], "b": [4, 5, 6]}, weights=np.array([1.0, 2.0, 3.0])
)

# Column access
col = mdf["a"]
assert isinstance(col, MicroSeries), f"Got {type(col)} instead of MicroSeries"
assert isinstance(
col, MicroSeries
), f"Got {type(col)} instead of MicroSeries"
assert hasattr(col, "weights")
assert hasattr(col, "set_weights")

def test_microdataframe_operations_preserve_type(self):
"""
Operations on MicroDataFrame columns should preserve MicroSeries type.
"""
"""Operations on MicroDataFrame columns should preserve MicroSeries
type."""
mdf = MicroDataFrame(
{"a": [1, 2, 3], "b": [4, 5, 6]},
weights=np.array([1.0, 2.0, 3.0])
{"a": [1, 2, 3], "b": [4, 5, 6]}, weights=np.array([1.0, 2.0, 3.0])
)

# Column operations
result = mdf["a"] + mdf["b"]
assert isinstance(result, MicroSeries), f"Got {type(result)} instead of MicroSeries"
assert isinstance(
result, MicroSeries
), f"Got {type(result)} instead of MicroSeries"
assert hasattr(result, "weights")


class TestStringDtypeHandling:
"""Test that MicroSeries/MicroDataFrame handle pandas 3 string dtypes."""

def test_microseries_with_string_data(self):
"""
MicroSeries should work with string data in pandas 3.
"""
"""MicroSeries should work with string data in pandas 3."""
# Create with string data
ms = MicroSeries(["a", "b", "c"], weights=np.array([1.0, 2.0, 3.0]))
assert len(ms) == 3
assert hasattr(ms, "weights")

def test_microdataframe_with_string_columns(self):
"""
MicroDataFrame should work with string columns in pandas 3.
"""
"""MicroDataFrame should work with string columns in pandas 3."""
mdf = MicroDataFrame(
{"names": ["alice", "bob", "charlie"], "values": [1, 2, 3]},
weights=np.array([1.0, 2.0, 3.0])
weights=np.array([1.0, 2.0, 3.0]),
)
assert len(mdf) == 3

Expand Down Expand Up @@ -169,9 +174,7 @@ class TestCopyOnWriteCompatibility:
"""Test compatibility with pandas 3 Copy-on-Write."""

def test_microseries_copy_independent(self):
"""
Copying a MicroSeries should create an independent copy.
"""
"""Copying a MicroSeries should create an independent copy."""
ms = MicroSeries([1, 2, 3], weights=np.array([1.0, 2.0, 3.0]))
ms_copy = ms.copy()

Expand All @@ -182,12 +185,9 @@ def test_microseries_copy_independent(self):
assert np.allclose(ms_copy.weights, [1.0, 2.0, 3.0])

def test_microdataframe_copy_independent(self):
"""
Copying a MicroDataFrame should create an independent copy.
"""
"""Copying a MicroDataFrame should create an independent copy."""
mdf = MicroDataFrame(
{"a": [1, 2, 3]},
weights=np.array([1.0, 2.0, 3.0])
{"a": [1, 2, 3]}, weights=np.array([1.0, 2.0, 3.0])
)
mdf_copy = mdf.copy()

Expand All @@ -197,14 +197,32 @@ def test_microdataframe_copy_independent(self):
# Copy should be unchanged
assert np.allclose(mdf_copy.weights, [1.0, 2.0, 3.0])

def test_column_set_weights_after_access_regression(self):
"""Regression test for pandas 3.0 CoW compatibility.

In pandas 3.0 with Copy-on-Write, modifying column.__class__ doesn't
persist because each access returns a copy. This test verifies the fix
that wraps columns as MicroSeries on access in __getitem__.
"""
mdf = MicroDataFrame(
{"income": [10000, 20000, 30000]},
weights=np.array([1.0, 2.0, 3.0]),
)

# This was the exact error that occurred:
# AttributeError: 'Series' object has no attribute 'set_weights'
col = mdf["income"]
col.set_weights(np.array([4.0, 5.0, 6.0])) # Would fail before fix

# Verify the new weights took effect
assert np.allclose(col.weights, [4.0, 5.0, 6.0])


class TestGroupByWithPandas3:
"""Test groupby operations with pandas 3."""

def test_microseries_groupby_preserves_weights(self):
"""
GroupBy operations should preserve weights.
"""
"""GroupBy operations should preserve weights."""
ms = MicroSeries([1, 2, 3, 4], weights=np.array([1.0, 2.0, 3.0, 4.0]))
groups = pd.Series(["a", "a", "b", "b"])

Expand All @@ -217,12 +235,10 @@ def test_microseries_groupby_preserves_weights(self):
assert result["b"] == 25

def test_microdataframe_groupby_preserves_weights(self):
"""
MicroDataFrame groupby should preserve weights on columns.
"""
"""MicroDataFrame groupby should preserve weights on columns."""
mdf = MicroDataFrame(
{"group": ["a", "a", "b", "b"], "value": [1, 2, 3, 4]},
weights=np.array([1.0, 2.0, 3.0, 4.0])
weights=np.array([1.0, 2.0, 3.0, 4.0]),
)

gb = mdf.groupby("group")
Expand Down