Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog/319.fix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
- Fixed `ScmRun.groupby` raising `TypeError: Cannot interpret '<StringDtype(...)>' as a data type` on Python 3.12 with pandas 3.x and numpy 2.x. Under the newer stack, string-valued meta columns can be `pandas.StringDtype` rather than `object`, and `np.issubdtype` rejects `StringDtype` as a non-coercible dtype. The numeric-dtype check inside `RunGroupBy.__init__` is now routed through a `try/except` wrapper that returns `False` for any dtype numpy cannot classify, which is the semantically correct fallback. Unblocks downstream callers of `ScmRun.convert_unit` (and other groupby-using methods) on the newer Python / numpy / pandas stack.
18 changes: 17 additions & 1 deletion src/scmdata/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,22 @@
]


def _is_numeric_dtype(dtype: Any) -> bool:
    """
    Return ``True`` if ``dtype`` represents numeric data.

    Wraps :func:`numpy.issubdtype` so that pandas extension dtypes, which
    numpy rejects with a ``TypeError`` (``Cannot interpret
    <StringDtype(...)> as a data type``), do not propagate the error.

    Extension dtypes that expose a ``numpy_dtype`` attribute (for example
    pandas' nullable ``Int64`` / ``Float64`` dtypes) are classified via
    that backing numpy dtype, so nullable numeric columns are still
    recognised as numeric. Anything numpy cannot classify at all, such as
    :class:`pandas.StringDtype`, is reported as non-numeric.

    Parameters
    ----------
    dtype
        A numpy dtype, a pandas extension dtype, or any other object
        accepted by :func:`numpy.issubdtype`.

    Returns
    -------
    bool
        ``True`` if ``dtype`` (or its backing numpy dtype) is a sub-dtype
        of :class:`numpy.number`, ``False`` otherwise.
    """
    try:
        return bool(np.issubdtype(dtype, np.number))
    except TypeError:
        # numpy cannot interpret extension dtypes directly; fall back to
        # the numpy dtype backing the extension array when one is exposed.
        backing = getattr(dtype, "numpy_dtype", None)

    if backing is None:
        # No numpy representation available (e.g. ``StringDtype``).
        return False

    try:
        return bool(np.issubdtype(backing, np.number))
    except TypeError:
        return False


class RunGroupBy(ImplementsArrayReduce, Generic[GenericRun]):
"""
GroupBy object specialized to grouping ScmRun objects
Expand All @@ -58,7 +74,7 @@ def __init__(
self.na_fill_value = float(na_fill_value)

# Work around the bad handling of NaN values in groupbys
if any([np.issubdtype(m[c].dtype, np.number) for c in m]):
if any(_is_numeric_dtype(m[c].dtype) for c in m):
if (m == na_fill_value).any(axis=None):
raise ValueError(
"na_fill_value conflicts with data value. Choose a na_fill_value "
Expand Down
53 changes: 53 additions & 0 deletions tests/unit/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,56 @@ def increment_ensemble_member(scmrun):
exp["has_nan"] = [False, True]

assert_scmdf_almost_equal(res, exp, allow_unordered=True, check_ts_names=False)


def test_groupby_with_string_extension_dtype():
    """
    Regression test for issue #318: grouping with string extension dtypes.

    Under pandas 3.x, string-valued meta columns that pass through a
    ``MultiIndex`` can come back as ``pandas.StringDtype`` instead of
    ``object``. ``RunGroupBy.__init__`` used to hand such dtypes straight
    to ``np.issubdtype(..., np.number)``, which numpy 2.x rejects with
    ``TypeError: Cannot interpret <StringDtype(...)>``. The check now
    goes through ``_is_numeric_dtype``.

    ``ScmRun.groupby`` is exercised directly to keep the regression
    surface tight; ``convert_unit`` (the path that originally surfaced
    the problem for downstream users) has unrelated pandas 3.x issues
    that are out of scope here.
    """
    import pandas as pd

    meta_names = [
        "climate_model",
        "scenario",
        "model",
        "region",
        "variable",
        "unit",
        "run_id",
    ]
    meta_values = ("FaIR", "ssp245", "m", "World", "Emissions|CO2", "GtC/yr", 0)
    timeseries = pd.DataFrame(
        [[1.0, 2.0]],
        index=pd.MultiIndex.from_tuples([meta_values], names=meta_names),
        columns=[2010, 2020],
    )
    run = ScmRun(timeseries)

    # Record the meta dtypes for the sanity check at the end. Depending on
    # the installed stack the string columns may be reported as a string
    # extension dtype or as plain ``object``; either is accepted.
    flat_meta = run.meta.reset_index(drop=True)
    dtype_names = [str(column.dtype) for _, column in flat_meta.items()]

    # Before the fix, groupby raised TypeError whenever a meta column was
    # StringDtype. It should now yield exactly one group.
    grouped = list(run.groupby("variable"))
    assert len(grouped) == 1
    assert grouped[0].get_unique_meta("variable", no_duplicates=True) == "Emissions|CO2"
    assert any(name in dtype_names for name in ("string", "str", "object"))
Loading