Commit 4a66ba3

fix: Lazy tokenizer init in StructuredOutputManager to prevent semaphore leak
GGUF models without precomputed merges trigger `build_merges_on_the_fly` in the transformers library, which uses multiprocessing primitives. When this happens in both the APIServer process (for request validation) and the EngineCore subprocess (via StructuredOutputManager), the subprocess leaks a semaphore, causing the server to hang indefinitely.

This change makes tokenizer initialization lazy in StructuredOutputManager:

- The tokenizer is only loaded when grammar_init() is first called
- Most inference requests don't use structured output, so the tokenizer in EngineCore is never loaded
- For requests that do use structured output, the tokenizer is loaded on demand

The fix resolves the following symptoms:

- Server hangs after "resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown"
- Tokenizer merges being built twice (once in APIServer, once in EngineCore)
- GGUF models failing to start even though weights load successfully

Tested with bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M).
1 parent 541a2ef commit 4a66ba3
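For context, the core of the change is a thread-safe lazy-initialization (double-checked locking) pattern. Below is a minimal standalone sketch of that pattern with illustrative names, not the actual vLLM classes; the real change is in the diff further down.

import threading


class LazyResourceHolder:
    """Illustrative sketch only: defer an expensive resource until first use."""

    def __init__(self):
        self._resource = None
        self._initialized = False
        self._init_lock = threading.Lock()

    @property
    def resource(self):
        # Double-checked locking: the unlocked fast path avoids lock
        # contention after initialization; the locked re-check guarantees
        # the expensive setup runs exactly once, even when several threads
        # hit the property for the first time concurrently.
        if not self._initialized:
            with self._init_lock:
                if not self._initialized:
                    self._resource = self._build_resource()
                    self._initialized = True
        return self._resource

    def _build_resource(self):
        # Stand-in for the expensive step (e.g. loading a tokenizer and
        # building merges); in the commit this role is played by _init_tokenizer().
        return object()

Code paths that never touch .resource never pay the construction cost, which is how the EngineCore process now avoids building tokenizer merges when no request uses structured output.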

File tree

1 file changed: +59, -28 lines


vllm/v1/structured_output/__init__.py

Lines changed: 59 additions & 28 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
+import threading
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING

@@ -63,39 +64,66 @@ def __init__(self, vllm_config: VllmConfig):
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)

-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # Tokenizer is loaded lazily to avoid duplicate tokenizer initialization
+        # in multiprocess mode. For GGUF models, this prevents a semaphore leak
+        # that causes server hangs (tokenizer builds merges on the fly, which
+        # uses multiprocessing primitives that don't clean up in subprocesses).
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self._tokenizer_init_lock = threading.Lock()
+        self.executor = None

         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )

+    @property
+    def tokenizer(self):
+        """Lazily initialize tokenizer when first accessed (thread-safe)."""
+        # Double-checked locking pattern for thread-safe lazy initialization
+        if not self._tokenizer_initialized:
+            with self._tokenizer_init_lock:
+                if not self._tokenizer_initialized:
+                    self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self._tokenizer_initialized = True
+            return
+
+        # The default max_workers if not specified is the number of
+        # CPUs * 5, which is way too high since these tasks are CPU-bound,
+        # not I/O bound. We also know we would never dominate CPU usage
+        # with just grammar compilation, so we set it to half the number
+        # of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = init_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = (
+            self.vllm_config.structured_outputs_config.reasoning_parser
+        )
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                reasoning_parser
+            )
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return

@@ -149,6 +177,9 @@ def grammar_init(self, request: Request) -> None:
             raise ValueError(f"Unsupported structured output backend: {backend}")

         if self._use_async_grammar_compilation:
+            # Ensure tokenizer (and executor) is initialized
+            _ = self.tokenizer
+            assert self.executor is not None, "Executor should be initialized with tokenizer"
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
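To check the fix end to end, the commit message points at bartowski/Phi-3.5-mini-instruct-GGUF (Q5_K_M). A hedged reproduction sketch using vLLM's offline LLM entry point, which starts the EngineCore process where the hang previously occurred; the local GGUF path and base-model tokenizer repo below are illustrative assumptions, not part of this commit:

# Sketch only: before this change, engine startup with a GGUF model could hang
# on a leaked multiprocessing semaphore; after it, generation should complete.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./Phi-3.5-mini-instruct-Q5_K_M.gguf",   # assumed local GGUF file
    tokenizer="microsoft/Phi-3.5-mini-instruct",   # HF tokenizer for the base model
)
outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=8))
print(outputs[0].outputs[0].text)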
