Add testing for CI failure

timsaucer · timsaucer · commit d00c61999a5d · 2026-05-21T12:13:58.000-04:00
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -102,6 +102,20 @@ jobs:
           git submodule update --init
           uv run --no-project pytest -v --import-mode=importlib
 
+      # Always dump the multiprocessing worker diagnostic log, even on
+      # job timeout, so we can see where forkserver/spawn workers stalled.
+      # See python/tests/_pickle_multiprocessing_helpers.py for what each
+      # line means. Safe to remove once forkserver/spawn hang is resolved.
+      - name: Dump multiprocessing diagnostic log
+        if: always()
+        run: |
+          echo "=== /tmp/df_mp_worker_diag.log ==="
+          if [ -f /tmp/df_mp_worker_diag.log ]; then
+            cat /tmp/df_mp_worker_diag.log
+          else
+            echo "(no diagnostic log produced)"
+          fi
+
       - name: FFI unit tests
         run: |
           cd examples/datafusion-ffi-example
diff --git a/python/tests/_pickle_multiprocessing_helpers.py b/python/tests/_pickle_multiprocessing_helpers.py
@@ -23,9 +23,56 @@
 
 from __future__ import annotations
 
-import pyarrow as pa
-from datafusion import SessionContext, udf
-from datafusion.ipc import clear_worker_ctx, set_worker_ctx
+import os
+import tempfile
+import time
+import traceback
+from pathlib import Path
+
+# Diagnostic log path for multiprocessing worker timing.
+# Workers write here so a CI-side `cat` after a job timeout can show
+# where each worker stalled (e.g. inside `import datafusion`). Lives in
+# the system temp dir so it persists across Pool worker exits and is
+# readable by a follow-up workflow step. Override via env var when
+# debugging locally.
+_DIAG_LOG = Path(
+    os.environ.get(
+        "DF_MP_DIAG_LOG",
+        str(Path(tempfile.gettempdir()) / "df_mp_worker_diag.log"),
+    )
+)
+
+
+def _diag(event: str) -> None:
+    """Append a diagnostic line: timestamp, pid, parent pid, event.
+
+    Opens / flushes / closes per call so a hang mid-import still leaves
+    a partial trail on disk. Parent pid distinguishes forkserver-born
+    workers (parent = forkserver) from spawn-born workers (parent =
+    main pytest process).
+    """
+    try:
+        with _DIAG_LOG.open("a", encoding="utf-8") as fh:
+            fh.write(
+                f"{time.time():.3f} pid={os.getpid()} ppid={os.getppid()} {event}\n"
+            )
+            fh.flush()
+            os.fsync(fh.fileno())
+    except OSError:
+        # Best-effort diagnostic; never let logging itself break a test.
+        pass
+
+
+_diag("helpers module: starting imports")
+import pyarrow as pa  # noqa: E402
+
+_diag("helpers module: pyarrow imported")
+from datafusion import SessionContext, udf  # noqa: E402
+
+_diag("helpers module: datafusion imported")
+from datafusion.ipc import clear_worker_ctx, set_worker_ctx  # noqa: E402
+
+_diag("helpers module: all imports complete")
 
 
 def make_double_udf():
@@ -65,12 +112,31 @@ def init_worker_clear():
     clear_worker_ctx()
 
 
+def diag_init():
+    """Pool initializer used by the diagnostic-instrumented tests.
+
+    Logs that a worker process is alive and has finished its module
+    imports. If this line never appears for a given pid, the hang is
+    inside import / Rust extension init (before any task runs).
+    """
+    _diag("worker init: ready for tasks")
+
+
 def unpickle_and_describe(blob: bytes) -> str:
     """Unpickle a proto-bytes blob and return its canonical name."""
     import pickle
 
-    expr = pickle.loads(blob)  # noqa: S301
-    return expr.canonical_name()
+    _diag("unpickle_and_describe: enter")
+    try:
+        expr = pickle.loads(blob)  # noqa: S301
+        _diag("unpickle_and_describe: pickle.loads done")
+        name = expr.canonical_name()
+    except BaseException as exc:
+        _diag(f"unpickle_and_describe: raised {type(exc).__name__}: {exc}")
+        _diag(traceback.format_exc())
+        raise
+    _diag(f"unpickle_and_describe: returning name={name!r}")
+    return name
 
 
 def unpickle_and_evaluate(blob: bytes, batch: list[int]) -> list[int]:
@@ -82,8 +148,19 @@ def unpickle_and_evaluate(blob: bytes, batch: list[int]) -> list[int]:
     """
     import pickle
 
-    expr = pickle.loads(blob)  # noqa: S301
-    ctx = SessionContext()
-    df = ctx.from_pydict({"a": batch})
-    out = df.with_column("result", expr).select("result")
-    return out.to_pydict()["result"]
+    _diag(f"unpickle_and_evaluate: enter batch_len={len(batch)}")
+    try:
+        expr = pickle.loads(blob)  # noqa: S301
+        _diag("unpickle_and_evaluate: pickle.loads done")
+        ctx = SessionContext()
+        _diag("unpickle_and_evaluate: SessionContext built")
+        df = ctx.from_pydict({"a": batch})
+        out = df.with_column("result", expr).select("result")
+        _diag("unpickle_and_evaluate: plan built, collecting")
+        result = out.to_pydict()["result"]
+    except BaseException as exc:
+        _diag(f"unpickle_and_evaluate: raised {type(exc).__name__}: {exc}")
+        _diag(traceback.format_exc())
+        raise
+    _diag(f"unpickle_and_evaluate: returning len={len(result)}")
+    return result
diff --git a/python/tests/test_pickle_multiprocessing.py b/python/tests/test_pickle_multiprocessing.py
@@ -80,12 +80,16 @@ def _skip_if_multiprocessing_unavailable():
 @pytest.mark.timeout(120)
 def test_builtin_pickle_via_pool(start_method):
     """Built-in expressions round-trip in every start method."""
+    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: enter")
     expr = col("a") + lit(1)
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    with ctx.Pool(processes=2) as pool:
+    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: creating Pool")
+    with ctx.Pool(processes=2, initializer=helpers.diag_init) as pool:
+        helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: pool ready, map")
         results = pool.map(helpers.unpickle_and_describe, [blob, blob, blob])
+    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: pool closed")
 
     assert all(r == expr.canonical_name() for r in results)
 
@@ -98,16 +102,22 @@ def test_udf_pickle_self_contained(start_method):
     Workers start with no UDF registered. The Rust-side ``PythonUDFCodec``
     reconstructs the UDF from bytes embedded in the pickle blob.
     """
+    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: enter")
     udf_obj = helpers.make_double_udf()
     expr = udf_obj(col("a"))
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    with ctx.Pool(processes=2) as pool:
+    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: creating Pool")
+    with ctx.Pool(processes=2, initializer=helpers.diag_init) as pool:
+        helpers._diag(
+            f"test_udf_pickle_self_contained[{start_method}]: pool ready, starmap"
+        )
         results = pool.starmap(
             helpers.unpickle_and_evaluate,
             [(blob, [1, 2, 3]), (blob, [10, 20, 30])],
         )
+    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: pool closed")
 
     assert results[0] == [2, 4, 6]
     assert results[1] == [20, 40, 60]
@@ -117,12 +127,18 @@ def test_udf_pickle_self_contained(start_method):
 @pytest.mark.timeout(120)
 def test_closure_capturing_udf_via_pool(start_method):
     """Cloudpickle preserves closure state across the codec boundary."""
+    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: enter")
     udf_obj = helpers.make_times_seven_udf()
     expr = udf_obj(col("a"))
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    with ctx.Pool(processes=2) as pool:
+    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: creating Pool")
+    with ctx.Pool(processes=2, initializer=helpers.diag_init) as pool:
+        helpers._diag(
+            f"test_closure_capturing_udf_via_pool[{start_method}]: pool ready, apply"
+        )
         result = pool.apply(helpers.unpickle_and_evaluate, (blob, [1, 2, 3]))
+    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: pool closed")
 
     assert result == [7, 14, 21]