test(pickle): remove multiprocessing CI debug instrumentation

timsaucer · claude · timsaucer · commit 0fc78b76b608 · 2026-05-22T11:53:23.000-04:00
Multiprocessing forkserver/spawn hang was diagnosed and fixed: workers
could not import `tests._pickle_multiprocessing_helpers` because
`pytest --import-mode=importlib` does not add the test parent dir to
`sys.path`. The fix (appending the parent dir to `sys.path` so it is
inherited by mp workers without shadowing the installed `datafusion`
wheel) is retained. This commit drops the diagnostic scaffolding that
was added to identify the hang point:

- `_diag` + per-import / per-task log writes to /tmp
- `snapshot_processes` and the `threading.Timer` that captured worker
  state mid-hang
- `diag_init` Pool initializer
- "Dump multiprocessing diagnostic log" CI step

Pre-existing infrastructure is kept: per-test `@pytest.mark.timeout(120)`
(backed by `pytest-timeout` dev dep) and the job-level
`timeout-minutes: 30` backstop on the test matrix.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -102,20 +102,6 @@ jobs:
           git submodule update --init
           uv run --no-project pytest -v --import-mode=importlib
 
-      # Always dump the multiprocessing worker diagnostic log, even on
-      # job timeout, so we can see where forkserver/spawn workers stalled.
-      # See python/tests/_pickle_multiprocessing_helpers.py for what each
-      # line means. Safe to remove once forkserver/spawn hang is resolved.
-      - name: Dump multiprocessing diagnostic log
-        if: always()
-        run: |
-          echo "=== /tmp/df_mp_worker_diag.log ==="
-          if [ -f /tmp/df_mp_worker_diag.log ]; then
-            cat /tmp/df_mp_worker_diag.log
-          else
-            echo "(no diagnostic log produced)"
-          fi
-
       - name: FFI unit tests
         run: |
           cd examples/datafusion-ffi-example
diff --git a/python/tests/_pickle_multiprocessing_helpers.py b/python/tests/_pickle_multiprocessing_helpers.py
@@ -23,56 +23,9 @@
 
 from __future__ import annotations
 
-import os
-import tempfile
-import time
-import traceback
-from pathlib import Path
-
-# Diagnostic log path for multiprocessing worker timing.
-# Workers write here so a CI-side `cat` after a job timeout can show
-# where each worker stalled (e.g. inside `import datafusion`). Lives in
-# the system temp dir so it persists across Pool worker exits and is
-# readable by a follow-up workflow step. Override via env var when
-# debugging locally.
-_DIAG_LOG = Path(
-    os.environ.get(
-        "DF_MP_DIAG_LOG",
-        str(Path(tempfile.gettempdir()) / "df_mp_worker_diag.log"),
-    )
-)
-
-
-def _diag(event: str) -> None:
-    """Append a diagnostic line: timestamp, pid, parent pid, event.
-
-    Opens / flushes / closes per call so a hang mid-import still leaves
-    a partial trail on disk. Parent pid distinguishes forkserver-born
-    workers (parent = forkserver) from spawn-born workers (parent =
-    main pytest process).
-    """
-    try:
-        with _DIAG_LOG.open("a", encoding="utf-8") as fh:
-            fh.write(
-                f"{time.time():.3f} pid={os.getpid()} ppid={os.getppid()} {event}\n"
-            )
-            fh.flush()
-            os.fsync(fh.fileno())
-    except OSError:
-        # Best-effort diagnostic; never let logging itself break a test.
-        pass
-
-
-_diag("helpers module: starting imports")
-import pyarrow as pa  # noqa: E402
-
-_diag("helpers module: pyarrow imported")
-from datafusion import SessionContext, udf  # noqa: E402
-
-_diag("helpers module: datafusion imported")
-from datafusion.ipc import clear_worker_ctx, set_worker_ctx  # noqa: E402
-
-_diag("helpers module: all imports complete")
+import pyarrow as pa
+from datafusion import SessionContext, udf
+from datafusion.ipc import clear_worker_ctx, set_worker_ctx
 
 
 def make_double_udf():
@@ -112,96 +65,12 @@ def init_worker_clear():
     clear_worker_ctx()
 
 
-def diag_init():
-    """Pool initializer used by the diagnostic-instrumented tests.
-
-    Logs that a worker process is alive and has finished its module
-    imports. If this line never appears for a given pid, the hang is
-    inside import / Rust extension init (before any task runs).
-    """
-    _diag("worker init: ready for tasks")
-
-
-def _read_text(path: str) -> str:
-    """Read a /proc file; return ``"<unreadable>"`` if not accessible."""
-    try:
-        return Path(path).read_text(encoding="utf-8", errors="replace").strip()
-    except OSError:
-        return "<unreadable>"
-
-
-def _descendants(root_pid: int) -> list[int]:
-    """Return ``root_pid`` plus all descendant pids via /proc/<pid>/task/.../children.
-
-    Linux-only; returns ``[root_pid]`` on platforms without ``/proc``.
-    """
-    out: list[int] = [root_pid]
-    if not Path("/proc").is_dir():
-        return out
-    queue = [root_pid]
-    while queue:
-        pid = queue.pop()
-        try:
-            task_dir = Path(f"/proc/{pid}/task")
-            if not task_dir.is_dir():
-                continue
-            for tdir in task_dir.iterdir():
-                children_file = tdir / "children"
-                try:
-                    children = children_file.read_text(encoding="utf-8").split()
-                except OSError:
-                    continue
-                for child in children:
-                    try:
-                        cpid = int(child)
-                    except ValueError:
-                        continue
-                    out.append(cpid)
-                    queue.append(cpid)
-        except OSError:
-            continue
-    return out
-
-
-def snapshot_processes(label: str, root_pid: int | None = None) -> None:
-    """Dump a process-state snapshot to the diagnostic log.
-
-    For each descendant of ``root_pid`` (default: current process), record
-    cmdline, status (``R``/``S``/``D``), wchan (kernel function the task
-    is blocked in), and kernel stack. Use this to localize a worker hang:
-    a wchan of ``do_futex`` points at a lock; ``poll_schedule_timeout``
-    points at a blocking I/O wait; ``do_select`` at multiprocessing's
-    pipe read.
-    """
-    pid = root_pid if root_pid is not None else os.getpid()
-    _diag(f"snapshot[{label}] root_pid={pid}")
-    for cpid in _descendants(pid):
-        cmd = _read_text(f"/proc/{cpid}/cmdline").replace("\x00", " ").strip()
-        stat = _read_text(f"/proc/{cpid}/status").splitlines()
-        state_line = next((s for s in stat if s.startswith("State:")), "State: ?")
-        wchan = _read_text(f"/proc/{cpid}/wchan")
-        stack = _read_text(f"/proc/{cpid}/stack")
-        _diag(f"snapshot[{label}] pid={cpid} {state_line} wchan={wchan} cmd={cmd!r}")
-        if stack and stack != "<unreadable>":
-            for line in stack.splitlines()[:10]:
-                _diag(f"snapshot[{label}] pid={cpid} stack: {line}")
-
-
 def unpickle_and_describe(blob: bytes) -> str:
     """Unpickle a proto-bytes blob and return its canonical name."""
     import pickle
 
-    _diag("unpickle_and_describe: enter")
-    try:
-        expr = pickle.loads(blob)  # noqa: S301
-        _diag("unpickle_and_describe: pickle.loads done")
-        name = expr.canonical_name()
-    except BaseException as exc:
-        _diag(f"unpickle_and_describe: raised {type(exc).__name__}: {exc}")
-        _diag(traceback.format_exc())
-        raise
-    _diag(f"unpickle_and_describe: returning name={name!r}")
-    return name
+    expr = pickle.loads(blob)  # noqa: S301
+    return expr.canonical_name()
 
 
 def unpickle_and_evaluate(blob: bytes, batch: list[int]) -> list[int]:
@@ -213,19 +82,8 @@ def unpickle_and_evaluate(blob: bytes, batch: list[int]) -> list[int]:
     """
     import pickle
 
-    _diag(f"unpickle_and_evaluate: enter batch_len={len(batch)}")
-    try:
-        expr = pickle.loads(blob)  # noqa: S301
-        _diag("unpickle_and_evaluate: pickle.loads done")
-        ctx = SessionContext()
-        _diag("unpickle_and_evaluate: SessionContext built")
-        df = ctx.from_pydict({"a": batch})
-        out = df.with_column("result", expr).select("result")
-        _diag("unpickle_and_evaluate: plan built, collecting")
-        result = out.to_pydict()["result"]
-    except BaseException as exc:
-        _diag(f"unpickle_and_evaluate: raised {type(exc).__name__}: {exc}")
-        _diag(traceback.format_exc())
-        raise
-    _diag(f"unpickle_and_evaluate: returning len={len(result)}")
-    return result
+    expr = pickle.loads(blob)  # noqa: S301
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": batch})
+    out = df.with_column("result", expr).select("result")
+    return out.to_pydict()["result"]
diff --git a/python/tests/test_pickle_multiprocessing.py b/python/tests/test_pickle_multiprocessing.py
@@ -27,12 +27,10 @@
 
 from __future__ import annotations
 
-import contextlib
 import functools
 import multiprocessing as mp
 import pickle
 import sys
-import threading
 from pathlib import Path
 
 import pytest
@@ -57,27 +55,6 @@
     sys.path.append(_TESTS_PARENT)
 
 
-@contextlib.contextmanager
-def _snapshot_on_hang(label: str, fire_after_seconds: float = 30.0):
-    """Schedule a process-state snapshot ``fire_after_seconds`` from now.
-
-    Cancelled if the ``with`` block exits before then. Used to capture
-    worker state mid-hang — fork tests return in well under the delay,
-    so the timer only fires when something is actually stuck.
-    """
-    timer = threading.Timer(
-        fire_after_seconds,
-        helpers.snapshot_processes,
-        args=(label,),
-    )
-    timer.daemon = True
-    timer.start()
-    try:
-        yield
-    finally:
-        timer.cancel()
-
-
 @functools.cache
 def _multiprocessing_available() -> tuple[bool, str]:
     """Return (available, reason). Some sandboxed environments deny semaphore
@@ -120,19 +97,12 @@ def _skip_if_multiprocessing_unavailable():
 @pytest.mark.timeout(120)
 def test_builtin_pickle_via_pool(start_method):
     """Built-in expressions round-trip in every start method."""
-    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: enter")
     expr = col("a") + lit(1)
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: creating Pool")
-    with (
-        ctx.Pool(processes=2, initializer=helpers.diag_init) as pool,
-        _snapshot_on_hang(f"builtin[{start_method}]"),
-    ):
-        helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: pool ready, map")
+    with ctx.Pool(processes=2) as pool:
         results = pool.map(helpers.unpickle_and_describe, [blob, blob, blob])
-    helpers._diag(f"test_builtin_pickle_via_pool[{start_method}]: pool closed")
 
     assert all(r == expr.canonical_name() for r in results)
 
@@ -145,25 +115,16 @@ def test_udf_pickle_self_contained(start_method):
     Workers start with no UDF registered. The Rust-side ``PythonUDFCodec``
     reconstructs the UDF from bytes embedded in the pickle blob.
     """
-    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: enter")
     udf_obj = helpers.make_double_udf()
     expr = udf_obj(col("a"))
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: creating Pool")
-    with (
-        ctx.Pool(processes=2, initializer=helpers.diag_init) as pool,
-        _snapshot_on_hang(f"udf[{start_method}]"),
-    ):
-        helpers._diag(
-            f"test_udf_pickle_self_contained[{start_method}]: pool ready, starmap"
-        )
+    with ctx.Pool(processes=2) as pool:
         results = pool.starmap(
             helpers.unpickle_and_evaluate,
             [(blob, [1, 2, 3]), (blob, [10, 20, 30])],
         )
-    helpers._diag(f"test_udf_pickle_self_contained[{start_method}]: pool closed")
 
     assert results[0] == [2, 4, 6]
     assert results[1] == [20, 40, 60]
@@ -173,21 +134,12 @@ def test_udf_pickle_self_contained(start_method):
 @pytest.mark.timeout(120)
 def test_closure_capturing_udf_via_pool(start_method):
     """Cloudpickle preserves closure state across the codec boundary."""
-    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: enter")
     udf_obj = helpers.make_times_seven_udf()
     expr = udf_obj(col("a"))
     blob = pickle.dumps(expr)
 
     ctx = mp.get_context(start_method)
-    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: creating Pool")
-    with (
-        ctx.Pool(processes=2, initializer=helpers.diag_init) as pool,
-        _snapshot_on_hang(f"closure[{start_method}]"),
-    ):
-        helpers._diag(
-            f"test_closure_capturing_udf_via_pool[{start_method}]: pool ready, apply"
-        )
+    with ctx.Pool(processes=2) as pool:
         result = pool.apply(helpers.unpickle_and_evaluate, (blob, [1, 2, 3]))
-    helpers._diag(f"test_closure_capturing_udf_via_pool[{start_method}]: pool closed")
 
     assert result == [7, 14, 21]