2323
2424from __future__ import annotations
2525
26- import pyarrow as pa
27- from datafusion import SessionContext , udf
28- from datafusion .ipc import clear_worker_ctx , set_worker_ctx
26+ import os
27+ import tempfile
28+ import time
29+ import traceback
30+ from pathlib import Path
31+
# Where multiprocessing workers append their timing diagnostics.
# After a CI job times out, a follow-up workflow step can `cat` this file
# to see where each worker stalled (for example inside `import datafusion`).
# The default lives in the system temp dir so it outlives Pool worker
# exits; set the DF_MP_DIAG_LOG environment variable to redirect it when
# debugging locally.
_DIAG_LOG = Path(
    os.getenv(
        "DF_MP_DIAG_LOG",
        os.path.join(tempfile.gettempdir(), "df_mp_worker_diag.log"),
    )
)
44+
45+
def _diag(event: str) -> None:
    """Record one diagnostic line in the worker log.

    The line holds the wall-clock timestamp, this process's pid, its
    parent's pid, and ``event``. The parent pid tells forkserver-born
    workers (parent = forkserver) apart from spawn-born workers (parent =
    main pytest process).

    The log file is opened, flushed, fsynced and closed on every call so
    that a hang in the middle of an import still leaves a partial trail
    on disk.

    Args:
        event: Free-form text describing what just happened.
    """
    try:
        with open(_DIAG_LOG, "a", encoding="utf-8") as log:
            entry = f"{time.time():.3f} pid={os.getpid()} ppid={os.getppid()} {event}\n"
            log.write(entry)
            log.flush()
            # Push the bytes to disk so they are there if the process is killed.
            os.fsync(log.fileno())
    except OSError:
        # Logging is best-effort only: it must never be what breaks a test.
        return
64+
65+
66+ _diag ("helpers module: starting imports" )
67+ import pyarrow as pa # noqa: E402
68+
69+ _diag ("helpers module: pyarrow imported" )
70+ from datafusion import SessionContext , udf # noqa: E402
71+
72+ _diag ("helpers module: datafusion imported" )
73+ from datafusion .ipc import clear_worker_ctx , set_worker_ctx # noqa: E402
74+
75+ _diag ("helpers module: all imports complete" )
2976
3077
3178def make_double_udf ():
@@ -65,12 +112,31 @@ def init_worker_clear():
65112 clear_worker_ctx ()
66113
67114
def diag_init():
    """Pool initializer for the diagnostic-instrumented tests.

    Writes a single log line announcing that the worker process is up and
    has completed its module imports. When no such line shows up for a
    given pid, the stall happened during import / Rust extension
    initialisation, i.e. before any task was dispatched.
    """
    _diag("worker init: ready for tasks")
123+
124+
def unpickle_and_describe(blob: bytes) -> str:
    """Rebuild an expression from pickled proto bytes and name it.

    Each stage (entry, unpickling, result) is written to the diagnostic
    log. Anything raised along the way is logged with its type, message
    and traceback, then re-raised unchanged.

    Args:
        blob: Pickled proto-bytes payload.

    Returns:
        The value of ``canonical_name()`` on the unpickled object.
    """
    import pickle

    _diag("unpickle_and_describe: enter")
    try:
        decoded = pickle.loads(blob)  # noqa: S301
        _diag("unpickle_and_describe: pickle.loads done")
        canonical = decoded.canonical_name()
    except BaseException as err:
        # BaseException on purpose: interrupts and exits are logged too.
        _diag(f"unpickle_and_describe: raised {type(err).__name__}: {err}")
        _diag(traceback.format_exc())
        raise
    else:
        _diag(f"unpickle_and_describe: returning name={canonical!r}")
        return canonical
74140
75141
76142def unpickle_and_evaluate (blob : bytes , batch : list [int ]) -> list [int ]:
@@ -82,8 +148,19 @@ def unpickle_and_evaluate(blob: bytes, batch: list[int]) -> list[int]:
82148 """
83149 import pickle
84150
85- expr = pickle .loads (blob ) # noqa: S301
86- ctx = SessionContext ()
87- df = ctx .from_pydict ({"a" : batch })
88- out = df .with_column ("result" , expr ).select ("result" )
89- return out .to_pydict ()["result" ]
151+ _diag (f"unpickle_and_evaluate: enter batch_len={ len (batch )} " )
152+ try :
153+ expr = pickle .loads (blob ) # noqa: S301
154+ _diag ("unpickle_and_evaluate: pickle.loads done" )
155+ ctx = SessionContext ()
156+ _diag ("unpickle_and_evaluate: SessionContext built" )
157+ df = ctx .from_pydict ({"a" : batch })
158+ out = df .with_column ("result" , expr ).select ("result" )
159+ _diag ("unpickle_and_evaluate: plan built, collecting" )
160+ result = out .to_pydict ()["result" ]
161+ except BaseException as exc :
162+ _diag (f"unpickle_and_evaluate: raised { type (exc ).__name__ } : { exc } " )
163+ _diag (traceback .format_exc ())
164+ raise
165+ _diag (f"unpickle_and_evaluate: returning len={ len (result )} " )
166+ return result
0 commit comments