openscm · benmsanderson · May 22, 2026 · May 22, 2026
diff --git a/changelog/319.fix.md b/changelog/319.fix.md
@@ -0,0 +1 @@
+- Fixed `ScmRun.groupby` raising `TypeError: Cannot interpret '<StringDtype(...)>' as a data type` on Python 3.12 with pandas 3.x and numpy 2.x. Under the newer stack, string-valued meta columns can be `pandas.StringDtype` rather than `object`, and `np.issubdtype` rejects `StringDtype` as a non-coercible dtype. The numeric-dtype check inside `RunGroupBy.__init__` is now routed through a `try/except` wrapper that returns `False` for any dtype numpy cannot classify, which is the semantically correct fallback. Unblocks downstream callers of `ScmRun.convert_unit` (and other groupby-using methods) on the newer Python / numpy / pandas stack.
diff --git a/src/scmdata/groupby.py b/src/scmdata/groupby.py
@@ -43,6 +43,22 @@
     ]
 
 
+def _is_numeric_dtype(dtype: Any) -> bool:
+    """
+    Return ``True`` if ``dtype`` is a numpy numeric dtype.
+
+    Wraps :func:`numpy.issubdtype` to gracefully handle pandas extension
+    dtypes such as :class:`pandas.StringDtype`, which numpy 2.x rejects
+    with a ``TypeError`` (``Cannot interpret <StringDtype(...)> as a data
+    type``). Semantically, an extension dtype that numpy cannot classify
+    is not a numpy numeric dtype, so ``False`` is the correct fallback.
+    """
+    try:
+        return bool(np.issubdtype(dtype, np.number))
+    except TypeError:
+        return False
+
+
 class RunGroupBy(ImplementsArrayReduce, Generic[GenericRun]):
     """
     GroupBy object specialized to grouping ScmRun objects
@@ -58,7 +74,7 @@ def __init__(
         self.na_fill_value = float(na_fill_value)
 
         # Work around the bad handling of NaN values in groupbys
-        if any([np.issubdtype(m[c].dtype, np.number) for c in m]):
+        if any(_is_numeric_dtype(m[c].dtype) for c in m):
             if (m == na_fill_value).any(axis=None):
                 raise ValueError(
                     "na_fill_value conflicts with data value. Choose a na_fill_value "

diff --git a/tests/unit/test_groupby.py b/tests/unit/test_groupby.py
@@ -159,3 +159,56 @@ def increment_ensemble_member(scmrun):
     exp["has_nan"] = [False, True]
 
     assert_scmdf_almost_equal(res, exp, allow_unordered=True, check_ts_names=False)
+
+
+def test_groupby_with_string_extension_dtype():
+    """
+    Regression test for issue #318.
+
+    Under pandas 3.x, string-valued meta columns of a DataFrame round-
+    tripped through MultiIndex can come back as ``pandas.StringDtype``
+    rather than ``object``. ``RunGroupBy.__init__`` previously called
+    ``np.issubdtype(StringDtype, np.number)`` directly, which numpy
+    2.x rejects with ``TypeError: Cannot interpret <StringDtype(...)>``.
+    The fix routes that check through ``_is_numeric_dtype``, which
+    returns ``False`` for any dtype numpy cannot classify.
+
+    The test exercises ``ScmRun.groupby`` directly to keep the
+    regression surface tight; ``convert_unit`` (the path that
+    originally tripped this for downstream users) has unrelated
+    pandas 3.x issues that are not in scope here.
+    """
+    import pandas as pd
+
+    run = ScmRun(
+        pd.DataFrame(
+            [[1.0, 2.0]],
+            index=pd.MultiIndex.from_tuples(
+                [("FaIR", "ssp245", "m", "World", "Emissions|CO2", "GtC/yr", 0)],
+                names=[
+                    "climate_model",
+                    "scenario",
+                    "model",
+                    "region",
+                    "variable",
+                    "unit",
+                    "run_id",
+                ],
+            ),
+            columns=[2010, 2020],
+        )
+    )
+
+    # Sanity check: under pandas 3.x at least one meta column should be
+    # StringDtype after the MultiIndex round-trip. On older stacks they
+    # are plain object; the test still passes (the fix is a no-op on
+    # numpy 1.x where np.issubdtype handled the inputs anyway).
+    meta = run.meta.reset_index(drop=True)
+    dtypes = [str(meta[c].dtype) for c in meta]
+
+    # Pre-fix, ScmRun.groupby raised TypeError when any meta column was
+    # StringDtype. Post-fix it should return a single group.
+    groups = list(run.groupby("variable"))
+    assert len(groups) == 1
+    assert groups[0].get_unique_meta("variable", no_duplicates=True) == "Emissions|CO2"
+    assert "string" in dtypes or "str" in dtypes or "object" in dtypes
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		- Fixed `ScmRun.groupby` raising `TypeError: Cannot interpret '<StringDtype(...)>' as a data type` on Python 3.12 with pandas 3.x and numpy 2.x. Under the newer stack, string-valued meta columns can be `pandas.StringDtype` rather than `object`, and `np.issubdtype` rejects `StringDtype` as a non-coercible dtype. The numeric-dtype check inside `RunGroupBy.__init__` is now routed through a `try/except` wrapper that returns `False` for any dtype numpy cannot classify, which is the semantically correct fallback. Unblocks downstream callers of `ScmRun.convert_unit` (and other groupby-using methods) on the newer Python / numpy / pandas stack.