@@ -255,7 +255,22 @@ def __init__(
 
         self.__create_task_runner_processes(workers, configuration, metrics_settings)
         self.__create_metrics_provider_process(metrics_settings)
+
+        # Initialize worker restart counter directly using prometheus_client (if metrics enabled).
+        # We use prometheus_client directly in the parent process instead of MetricsCollector
+        # to avoid registry confusion between parent and worker processes.
         self._worker_restart_counter = None
+        if metrics_settings is not None:
+            try:
+                from prometheus_client import Counter
+                self._worker_restart_counter = Counter(
+                    name='worker_restart_total',
+                    documentation='Number of times TaskHandler restarted a worker subprocess',
+                    labelnames=['taskType']
+                )
+                logger.debug("Initialized worker_restart_total counter in parent process")
+            except Exception as e:
+                logger.debug("Failed to initialize worker restart counter: %s", e)
 
         # Optional supervision: monitor worker processes and (optionally) restart on failure.
         self.monitor_processes = monitor_processes
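
A note on how this counter surfaces in a scrape: a `prometheus_client` counter created in the parent process only shows up alongside the workers' metrics when multiprocess mode is active, i.e. when `PROMETHEUS_MULTIPROC_DIR` points at the shared metrics directory before `prometheus_client` is imported. A minimal sketch of reading the aggregated value back, assuming that env var is set by the metrics settings (the directory path below is only an example):

```python
# Sketch: aggregate samples written by all processes (parent + workers).
# Assumes PROMETHEUS_MULTIPROC_DIR was set before prometheus_client was imported;
# the path below is hypothetical.
import os
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", "/tmp/conductor_metrics")

from prometheus_client import CollectorRegistry, generate_latest, multiprocess

registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)  # merges the per-process sample files
# The exposition text will include e.g. worker_restart_total{taskType="my_task"} 3.0
print(generate_latest(registry).decode())
```
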
@@ -268,6 +283,8 @@ def __init__(
         self._monitor_thread: Optional[threading.Thread] = None
         self._restart_counts: List[int] = [0 for _ in self.workers]
         self._next_restart_at: List[float] = [0.0 for _ in self.workers]
+        # Lock to protect process list during concurrent access (monitor thread vs main thread)
+        self._process_lock = threading.Lock()
         logger.info("TaskHandler initialized")
 
     def __enter__(self):
@@ -280,8 +297,10 @@ def stop_processes(self) -> None:
         self._monitor_stop_event.set()
         if self._monitor_thread is not None and self._monitor_thread.is_alive():
             self._monitor_thread.join(timeout=2.0)
-        self.__stop_task_runner_processes()
-        self.__stop_metrics_provider_process()
+        # Lock to prevent race conditions with monitor thread
+        with self._process_lock:
+            self.__stop_task_runner_processes()
+            self.__stop_metrics_provider_process()
         logger.info("Stopped worker processes...")
         self.queue.put(None)
         self.logger_process.terminate()
@@ -381,20 +400,22 @@ def __monitor_loop(self) -> None:
     def __check_and_restart_processes(self) -> None:
         if self._monitor_stop_event.is_set():
             return
-        for i, process in enumerate(list(self.task_runner_processes)):
-            if process is None:
-                continue
-            if process.is_alive():
-                continue
-            exitcode = process.exitcode
-            if exitcode is None:
-                continue
-            worker = self.workers[i] if i < len(self.workers) else None
-            worker_name = worker.get_task_definition_name() if worker is not None else f"worker[{i}]"
-            logger.warning("Worker process exited (worker=%s, pid=%s, exitcode=%s)", worker_name, process.pid, exitcode)
-            if not self.restart_on_failure:
-                continue
-            self.__restart_worker_process(i)
+        # Lock to prevent race conditions with stop_processes
+        with self._process_lock:
+            for i, process in enumerate(list(self.task_runner_processes)):
+                if process is None:
+                    continue
+                if process.is_alive():
+                    continue
+                exitcode = process.exitcode
+                if exitcode is None:
+                    continue
+                worker = self.workers[i] if i < len(self.workers) else None
+                worker_name = worker.get_task_definition_name() if worker is not None else f"worker[{i}]"
+                logger.warning("Worker process exited (worker=%s, pid=%s, exitcode=%s)", worker_name, process.pid, exitcode)
+                if not self.restart_on_failure:
+                    continue
+                self.__restart_worker_process(i)
 
     def __restart_worker_process(self, index: int) -> None:
         if self._monitor_stop_event.is_set():
@@ -420,58 +441,52 @@ def __restart_worker_process(self, index: int) -> None:
         attempt = self._restart_counts[index] + 1
 
         # Exponential backoff per-worker to avoid tight crash loops
-        backoff = min(self.restart_backoff_seconds * (2 ** max(self._restart_counts[index], 0)), self.restart_backoff_max_seconds)
+        backoff = min(self.restart_backoff_seconds * (2 ** self._restart_counts[index]), self.restart_backoff_max_seconds)
         self._next_restart_at[index] = now + backoff
 
         try:
             # Reap the old process (avoid accumulating zombies on repeated restarts).
             old_process = self.task_runner_processes[index]
+            old_pid = getattr(old_process, "pid", None)
             try:
                 if old_process is not None and old_process.exitcode is not None:
-                    old_process.join(timeout=0.0)
+                    # Give process a bit more time to clean up
+                    old_process.join(timeout=0.5)
                     try:
                         old_process.close()
-                    except Exception:
-                        pass
-            except Exception:
-                pass
+                        logger.debug("Cleaned up old worker process (worker=%s, pid=%s)", worker.get_task_definition_name(), old_pid)
+                    except Exception as close_err:
+                        logger.debug("Failed to close old worker process (worker=%s, pid=%s): %s",
+                                     worker.get_task_definition_name(), old_pid, close_err)
+            except Exception as join_err:
+                logger.debug("Failed to join old worker process (worker=%s, pid=%s): %s",
+                             worker.get_task_definition_name(), old_pid, join_err)
 
             new_process = self.__build_process_for_worker(worker)
             self.task_runner_processes[index] = new_process
             new_process.start()
             self._restart_counts[index] = attempt
             self.__inc_worker_restart_metric(worker.get_task_definition_name())
             logger.info(
-                "Restarted worker process (worker=%s, attempt=%s, pid=%s, next_backoff=%ss)",
+                "Restarted worker process (worker=%s, attempt=%s, old_pid=%s, new_pid=%s, next_backoff=%ss)",
                 worker.get_task_definition_name(),
                 attempt,
+                old_pid,
                 new_process.pid,
                 backoff
             )
         except Exception as e:
             logger.error("Failed to restart worker process (worker=%s): %s", worker.get_task_definition_name(), e)
 
     def __inc_worker_restart_metric(self, task_type: str) -> None:
-        """Best-effort counter increment for worker subprocess restarts (requires metrics_settings)."""
-        if self._metrics_settings is None:
+        """Best-effort counter increment for worker subprocess restarts."""
+        if self._worker_restart_counter is None:
             return
 
         try:
-            # Avoid instantiating MetricsCollector here: it keeps a global registry which can be problematic
-            # when multiple TaskHandlers/tests use different PROMETHEUS_MULTIPROC_DIR values in one process.
-            from conductor.client.telemetry import metrics_collector as mc
-
-            mc._ensure_prometheus_imported()
-            if self._worker_restart_counter is None:
-                # Use a dedicated registry to avoid duplicate metric registration errors in the default registry.
-                registry = mc.CollectorRegistry()
-                self._worker_restart_counter = mc.Counter(
-                    name=MetricName.WORKER_RESTART,
-                    documentation=MetricDocumentation.WORKER_RESTART,
-                    labelnames=[MetricLabel.TASK_TYPE.value],
-                    registry=registry,
-                )
-            self._worker_restart_counter.labels(task_type).inc()
+            # Increment the prometheus counter directly.
+            # This writes to the shared multiprocess metrics directory.
+            self._worker_restart_counter.labels(taskType=task_type).inc()
         except Exception as e:
             # Metrics should never break worker supervision.
             logger.debug("Failed to increment worker_restart metric: %s", e)
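
For reference, the backoff computed in `__restart_worker_process` doubles with each recorded restart of a worker and is capped by `restart_backoff_max_seconds`. A quick sketch of the resulting schedule, using hypothetical settings (the real values come from the `TaskHandler` constructor arguments):

```python
# Hypothetical settings; the actual values are the restart_backoff_seconds /
# restart_backoff_max_seconds arguments passed to TaskHandler.
restart_backoff_seconds = 10.0
restart_backoff_max_seconds = 300.0

for restart_count in range(7):
    backoff = min(restart_backoff_seconds * (2 ** restart_count), restart_backoff_max_seconds)
    print(f"crash #{restart_count + 1}: wait {backoff:.0f}s before restarting")
# -> 10s, 20s, 40s, 80s, 160s, 300s, 300s (capped at the maximum)
```
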
@@ -496,17 +511,19 @@ def __build_process_for_worker(self, worker: WorkerInterface) -> Process:
 
     def get_worker_process_status(self) -> List[Dict[str, Any]]:
         """Return basic worker process status for health checks / observability."""
-        statuses: List[Dict[str, Any]] = []
-        for i, worker in enumerate(self.workers):
-            process = self.task_runner_processes[i] if i < len(self.task_runner_processes) else None
-            statuses.append({
-                "worker": worker.get_task_definition_name(),
-                "pid": getattr(process, "pid", None),
-                "alive": process.is_alive() if process is not None else False,
-                "exitcode": getattr(process, "exitcode", None),
-                "restart_count": self._restart_counts[i] if i < len(self._restart_counts) else 0,
-            })
-        return statuses
+        # Lock to ensure consistent snapshot of process state
+        with self._process_lock:
+            statuses: List[Dict[str, Any]] = []
+            for i, worker in enumerate(self.workers):
+                process = self.task_runner_processes[i] if i < len(self.task_runner_processes) else None
+                statuses.append({
+                    "worker": worker.get_task_definition_name(),
+                    "pid": getattr(process, "pid", None),
+                    "alive": process.is_alive() if process is not None else False,
+                    "exitcode": getattr(process, "exitcode", None),
+                    "restart_count": self._restart_counts[i] if i < len(self._restart_counts) else 0,
+                })
+            return statuses
 
     def is_healthy(self) -> bool:
         """True if all worker processes are alive."""
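
The status and health methods above are intended for callers outside the handler. A small usage sketch (the `liveness_report` helper is hypothetical; only `is_healthy` and `get_worker_process_status` come from this change):

```python
import json

def liveness_report(handler) -> str:
    """Summarize TaskHandler worker health, e.g. for a monitoring endpoint."""
    if handler.is_healthy():
        return "OK"
    # Per-worker snapshot: worker name, pid, alive, exitcode, restart_count.
    return json.dumps(handler.get_worker_process_status(), indent=2)
```
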
@@ -522,10 +539,9 @@ def __start_task_runner_processes(self):
         n = 0
         for i, task_runner_process in enumerate(self.task_runner_processes):
             task_runner_process.start()
-            print(f'task runner process {task_runner_process.name} started')
             worker = self.workers[i]
             paused_status = "PAUSED" if getattr(worker, "paused", False) else "ACTIVE"
-            logger.debug("Started worker '%s' [%s]", worker.get_task_definition_name(), paused_status)
+            logger.info("Started worker '%s' [%s] (pid=%s)", worker.get_task_definition_name(), paused_status, task_runner_process.pid)
             n = n + 1
         logger.info("Started %s TaskRunner process(es)", n)
 