Rebase; add TimeUnit stub alias, fix __arrow_c_schema__ protocol name, and clean up test typing

rok · rok · commit 1ce9c09a4168 · 2025-12-21T23:49:06.000+01:00
diff --git a/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi b/python/pyarrow-stubs/pyarrow/_stubs_typing.pyi
@@ -45,6 +45,7 @@ Compression: TypeAlias = Literal[
 ]
 NullEncoding: TypeAlias = Literal["mask", "encode"]
 NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"]
+TimeUnit: TypeAlias = Literal["s", "ms", "us", "ns"]
 Mask: TypeAlias = (
     Sequence[bool | None]
     | NDArray[np.bool_]
@@ -123,7 +124,7 @@ class SupportArrowDeviceArray(Protocol):
 
 
 class SupportArrowSchema(Protocol):
-    def __arrow_c_schema(self) -> Any: ...
+    def __arrow_c_schema__(self) -> Any: ...
 
 
 class NullableCollection(Protocol[_V]):  # type: ignore[reportInvalidTypeVarUse]
diff --git a/python/pyarrow-stubs/pyarrow/_types.pyi b/python/pyarrow-stubs/pyarrow/_types.pyi
@@ -44,7 +44,7 @@ from typing_extensions import TypeVar, deprecated
 
 from .io import Buffer
 from .scalar import ExtensionScalar
-
+from ._stubs_typing import TimeUnit
 
 class _Weakrefable:
     ...
@@ -192,7 +192,7 @@ class BinaryViewType(_BasicDataType[bytes]):
     ...
 
 
-_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"])
+_Unit = TypeVar("_Unit", bound=TimeUnit, default=Literal["us"])
 _Tz = TypeVar("_Tz", str, None, default=None)
 
 
diff --git a/python/pyarrow-stubs/pyarrow/compute.pyi b/python/pyarrow-stubs/pyarrow/compute.pyi
@@ -116,7 +116,7 @@ from pyarrow._compute import HashAggregateKernel as HashAggregateKernel  # noqa:
 # Udf
 
 from pyarrow._compute import _Order, _Placement
-from pyarrow._stubs_typing import ArrayLike, ScalarLike, PyScalar
+from pyarrow._stubs_typing import ArrayLike, ScalarLike, PyScalar, TimeUnit
 from pyarrow._types import _RunEndType
 from . import lib
 
@@ -1351,7 +1351,7 @@ def strptime(
     strings: StringScalar | StringArray | Expression,
     /,
     format: str,
-    unit: Literal["s", "ms", "us", "ns"],
+    unit: TimeUnit,
     error_is_null: bool = False,
     *,
     options: StrptimeOptions | None = None,
diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py
@@ -162,12 +162,9 @@ def alltypes_sample(size=10000, seed=0, categorical=False):
         'float32': np.arange(size, dtype=np.float32),
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
-        'datetime_ms': np.arange("2016-01-01T00:00:00.001", size,
-                                 dtype='datetime64[ms]'),
-        'datetime_us': np.arange("2016-01-01T00:00:00.000001", size,
-                                 dtype='datetime64[us]'),
-        'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size,
-                                 dtype='datetime64[ns]'),
+        'datetime_ms': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ms').values,
+        'datetime_us': pd.date_range("2016-01-01T00:00:00.000001", periods=size, freq='us').values,
+        'datetime_ns': pd.date_range("2016-01-01T00:00:00.000000001", periods=size, freq='ns').values,
         'timedelta': np.arange(0, size, dtype="timedelta64[s]"),
         'str': pd.Series([str(x) for x in range(size)]),
         'empty_str': [''] * size,
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
@@ -18,7 +18,6 @@
 import decimal
 import io
 import random
-from typing import cast
 
 try:
     import numpy as np
@@ -391,7 +390,7 @@ def test_parquet_nested_convenience(tempdir):
 
     read = pq.read_table(
         path, columns=['a'])
-    tm.assert_frame_equal(read.to_pandas(), cast(pd.DataFrame, df[['a']]))
+    tm.assert_frame_equal(read.to_pandas(), df[['a']])
 
     read = pq.read_table(
         path, columns=['a', 'b'])
diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py
@@ -937,8 +937,7 @@ def _test_write_to_dataset_with_partitions(base_path,
         'group2': list('eefeffgeee'),
         'num': list(range(10)),
         'nan': [np.nan] * 10,
-        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
-            'datetime64[ns]')
+        'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]')
     })
     cols = output_df.columns.tolist()
     partition_by = ['group1', 'group2']
@@ -995,8 +994,7 @@ def _test_write_to_dataset_no_partitions(base_path,
         'group1': list('aaabbbbccc'),
         'group2': list('eefeffgeee'),
         'num': list(range(10)),
-        'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype(
-            'datetime64[ns]')
+        'date': pd.date_range('2017-01-01', periods=10, freq='D').values.astype('datetime64[ns]')
     })
     cols = output_df.columns.tolist()
     output_table = pa.Table.from_pandas(output_df)
diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py
@@ -572,17 +572,17 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir):
     )
     result = pq.read_table(str(tempdir / "case1")).to_pandas()
     tm.assert_frame_equal(
-        cast(pd.DataFrame, result[["col"]]), cast(pd.DataFrame, df[["col"]]))
+        result[["col"]], df[["col"]])
 
     pq.write_to_dataset(table, str(tempdir / "case2"))
     result = pq.read_table(str(tempdir / "case2")).to_pandas()
     tm.assert_frame_equal(
-        cast(pd.DataFrame, result[["col"]]), cast(pd.DataFrame, df[["col"]]))
+        result[["col"]], df[["col"]])
 
     pq.write_table(table, str(tempdir / "data.parquet"))
     result = pq.read_table(str(tempdir / "data.parquet")).to_pandas()
     tm.assert_frame_equal(
-        cast(pd.DataFrame, result[["col"]]), cast(pd.DataFrame, df[["col"]]))
+        result[["col"]], df[["col"]])
 
 
 @pytest.mark.pandas
@@ -599,7 +599,7 @@ def test_write_to_dataset_pandas_preserve_index(tempdir):
         table, str(tempdir / "case1"), partition_cols=['part'],
     )
     result = pq.read_table(str(tempdir / "case1")).to_pandas()
-    tm.assert_frame_equal(result, cast(pd.DataFrame, df_cat))
+    tm.assert_frame_equal(result, df_cat)
 
     pq.write_to_dataset(table, str(tempdir / "case2"))
     result = pq.read_table(str(tempdir / "case2")).to_pandas()
diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py
@@ -2297,7 +2297,7 @@ def test_strptime():
 @pytest.mark.pandas
 @pytest.mark.timezone_data
 def test_strftime():
-    times = ["2018-03-10 09:00", "2038-01-31 12:23", None]
+    times: list[str | None] = ["2018-03-10 09:00", "2038-01-31 12:23", None]
     timezones = ["CET", "UTC", "Europe/Ljubljana"]
 
     formats = ["%a", "%A", "%w", "%d", "%b", "%B", "%m", "%y", "%Y", "%H", "%I",
@@ -2307,7 +2307,7 @@ def test_strftime():
         formats.extend(["%c", "%x", "%X"])
 
     for timezone in timezones:
-        ts = pd.to_datetime(times).tz_localize(timezone)
+        ts = pd.to_datetime(times).tz_localize(timezone)  # type: ignore[no-matching-overload]
         for unit in ["s", "ms", "us", "ns"]:
             tsa = pa.array(ts, type=pa.timestamp(unit, timezone))
             for fmt in formats:
@@ -2360,7 +2360,7 @@ def test_strftime():
 
     # Test timestamps without timezone
     fmt = "%Y-%m-%dT%H:%M:%S"
-    ts = pd.to_datetime(times)
+    ts = pd.to_datetime(times)  # type: ignore[no-matching-overload]
     tsa = pa.array(ts, type=pa.timestamp("s"))
     result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt))
     st = ts.strftime(fmt)  # type: ignore[call-non-callable]
@@ -3440,7 +3440,7 @@ def test_struct_fields_options():
         pc.struct_field(arr, '.a.foo')
 
     with pytest.raises(pa.ArrowInvalid, match="cannot be called without options"):
-        pc.struct_field(arr)
+        pc.struct_field(arr)  # type: ignore[call-arg]
 
 
 def test_case_when():
diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py
@@ -1872,7 +1872,7 @@ def use_threads(self):
 
 
 class BaseTestCompressedCSVRead:
-    def write_file(self, path, content):
+    def write_file(self, path, contents):
         pass
     csv_filename = ""
 
diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py
@@ -4946,8 +4946,7 @@ def test_write_table_partitioned_dict(tempdir):
 def test_write_dataset_parquet(tempdir):
     table = pa.table([
         pa.array(range(20), type="uint32"),
-        pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype(
-            "datetime64[ns]")),
+        pa.array(pd.date_range("2012-01-01", periods=20, freq='D').values.astype("datetime64[ns]")),
         pa.array(np.repeat(['a', 'b'], 10))
     ], names=["f1", "f2", "part"])
 
diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py
@@ -85,7 +85,7 @@ def create_encryption_config(footer_key=FOOTER_KEY_NAME, column_keys=COLUMN_KEYS
 
 
 def create_decryption_config():
-    return pe.DecryptionConfiguration(cache_lifetime=300)
+    return pe.DecryptionConfiguration(cache_lifetime=timedelta(seconds=300))
 
 
 def create_kms_connection_config(keys=KEYS):
@@ -435,6 +435,9 @@ def unwrap_key(self, wrapped_key: bytes, _: str  # type: ignore[override]
     encryption_unavailable, reason="Parquet Encryption is not currently enabled"
 )
 def test_dataset_encryption_with_selected_column_statistics():
+    assert ds is not None
+    assert pq is not None
+
     table = create_sample_table()
 
     encryption_config = create_encryption_config()
@@ -478,7 +481,7 @@ def test_dataset_encryption_with_selected_column_statistics():
 
     for fragment in dataset.get_fragments():
         decryption_properties = crypto_factory.file_decryption_properties(
-            kms_connection_config, decryption_config, fragment.path, mockfs)
+            kms_connection_config, decryption_config, fragment.path, mockfs)  # type: ignore[call-arg]
         with pq.ParquetFile(
             fragment.path,
             decryption_properties=decryption_properties,
@@ -487,12 +490,14 @@ def test_dataset_encryption_with_selected_column_statistics():
             for rg_idx in range(parquet_file.metadata.num_row_groups):
                 row_group = parquet_file.metadata.row_group(rg_idx)
 
-                assert row_group.column(0).statistics is not None
-                assert row_group.column(0).statistics.min == 2019
-                assert row_group.column(0).statistics.max == 2022
+                stats0 = row_group.column(0).statistics
+                assert stats0 is not None
+                assert stats0.min == 2019
+                assert stats0.max == 2022
 
-                assert row_group.column(1).statistics is not None
-                assert row_group.column(1).statistics.min == 2
-                assert row_group.column(1).statistics.max == 100
+                stats1 = row_group.column(1).statistics
+                assert stats1 is not None
+                assert stats1.min == 2
+                assert stats1.max == 100
 
                 assert row_group.column(2).statistics is None
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
@@ -74,14 +74,10 @@ def _alltypes_example(size=100):
         'float32': np.arange(size, dtype=np.float32),
         'float64': np.arange(size, dtype=np.float64),
         'bool': np.random.randn(size) > 0,
-        'datetime[s]': np.arange("2016-01-01T00:00:00.001", size,
-                                 dtype='datetime64[s]'),
-        'datetime[ms]': np.arange("2016-01-01T00:00:00.001", size,
-                                  dtype='datetime64[ms]'),
-        'datetime[us]': np.arange("2016-01-01T00:00:00.001", size,
-                                  dtype='datetime64[us]'),
-        'datetime[ns]': np.arange("2016-01-01T00:00:00.001", size,
-                                  dtype='datetime64[ns]'),
+        'datetime[s]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='s').values,
+        'datetime[ms]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ms').values,
+        'datetime[us]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='us').values,
+        'datetime[ns]': pd.date_range("2016-01-01T00:00:00.001", periods=size, freq='ns').values,
         'timedelta64[s]': np.arange(0, size, dtype='timedelta64[s]'),
         'timedelta64[ms]': np.arange(0, size, dtype='timedelta64[ms]'),
         'timedelta64[us]': np.arange(0, size, dtype='timedelta64[us]'),
@@ -3156,9 +3152,8 @@ def test_strided_data_import(self):
         boolean_objects[5] = None
         cases.append(boolean_objects)
 
-        cases.append(np.arange("2016-01-01T00:00:00.001", N * K,
-                               dtype='datetime64[ms]')
-                     .reshape(N, K).copy())
+        cases.append(pd.date_range("2016-01-01T00:00:00.001", periods=N * K, freq='ms')
+                     .values.reshape(N, K).copy())
 
         strided_mask = (random_numbers > 0).astype(bool)[:, 0]
 
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
@@ -419,7 +419,7 @@ def test_to_pandas_empty_table():
     table = pa.table(df)
     result = table.schema.empty_table().to_pandas()
     assert result.shape == (0, 2)
-    expected = cast(pd.DataFrame, df.iloc[:0])
+    expected = df.iloc[:0]
     tm.assert_frame_equal(result, expected)
 
 
@@ -1207,7 +1207,7 @@ def test_recordbatch_to_tensor_null():
         batch.to_tensor()
 
     result = batch.to_tensor(null_to_nan=True, row_major=False)
-    x = np.column_stack([arr1, arr2]).astype(np.float64, order="F")
+    x = np.column_stack([arr1, arr2]).astype(np.float64, order="F")  # type: ignore[no-matching-overload]
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
@@ -1241,7 +1241,7 @@ def test_recordbatch_to_tensor_null():
     )
 
     result = batch.to_tensor(null_to_nan=True, row_major=False)
-    x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")
+    x = np.column_stack([arr1, arr2]).astype(np.float32, order="F")  # type: ignore[no-matching-overload]
     expected = pa.Tensor.from_numpy(x)
 
     np.testing.assert_equal(result.to_numpy(), x)
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -99,20 +99,20 @@ fallback_version = '23.0.0a0'
 
 [tool.mypy]
 files = ["pyarrow"]
-exclude = 'pyarrow/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*'
+exclude = 'pyarrow/interchange/.*|pyarrow/tests/interchange/.*|pyarrow/vendored/.*|pyarrow/tests/test_cuda*'
 mypy_path = "$MYPY_CONFIG_FILE_DIR/pyarrow-stubs"
 
 [tool.pyright]
 pythonPlatform = "All"
 pythonVersion = "3.10"
 include = ["pyarrow"]
-exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/test_cuda*"]
+exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"]
 stubPath = "pyarrow-stubs"
 typeCheckingMode = "basic"
 
 [tool.ty.src]
 include = ["pyarrow"]
-exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/test_cuda*"]
+exclude = ["pyarrow/vendored", "pyarrow/interchange", "pyarrow/tests/interchange", "pyarrow/tests/test_cuda*"]
 
 [tool.ty.environment]
 root = ["pyarrow"]