@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import multiprocessing
+import threading
 from concurrent.futures import Future, ThreadPoolExecutor
 from typing import TYPE_CHECKING
 
@@ -63,39 +64,66 @@ def __init__(self, vllm_config: VllmConfig): |
         max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8))
         self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers)
 
-        if not self.vllm_config.model_config.skip_tokenizer_init:
-            # The default max_workers if not specified is the number of
-            # CPUs * 5, which is way too high since these tasks are CPU-bound,
-            # not I/O bound. We also know we would never dominate CPU usage
-            # with just grammar compilation, so we set it to half the number
-            # of CPUs.
-            max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
-            self.executor = ThreadPoolExecutor(max_workers=max_workers)
-            self.tokenizer = init_tokenizer_from_config(
-                model_config=self.vllm_config.model_config
-            )
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            reasoning_parser_plugin = (
-                self.vllm_config.structured_outputs_config.reasoning_parser_plugin
-            )
-            if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
-                ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
-
-            reasoning_parser = (
-                self.vllm_config.structured_outputs_config.reasoning_parser
-            )
-            if reasoning_parser:
-                reasoner_cls = ReasoningParserManager.get_reasoning_parser(
-                    reasoning_parser
-                )
-                self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+        # The tokenizer is loaded lazily to avoid duplicate initialization in
+        # multiprocess mode. For GGUF models this also prevents a semaphore
+        # leak that hangs the server: the tokenizer builds merges on the fly
+        # using multiprocessing primitives that aren't cleaned up in subprocesses.
+        self._tokenizer = None
+        self._tokenizer_initialized = False
+        self._tokenizer_init_lock = threading.Lock()
+        self.executor = None
 
         self.enable_in_reasoning = (
             self.vllm_config.structured_outputs_config.enable_in_reasoning
         )
 
+    @property
+    def tokenizer(self):
+        """Lazily initialize the tokenizer on first access (thread-safe)."""
+        # Double-checked locking: skip the lock entirely once initialized.
+        if not self._tokenizer_initialized:
+            with self._tokenizer_init_lock:
+                if not self._tokenizer_initialized:
+                    self._init_tokenizer()
+        return self._tokenizer
+
+    def _init_tokenizer(self):
+        """Initialize the tokenizer and related components on first use."""
+        if self._tokenizer_initialized:
+            return
+
+        if self.vllm_config.model_config.skip_tokenizer_init:
+            self._tokenizer_initialized = True
+            return
+
+        # The default max_workers, if not specified, is the number of
+        # CPUs * 5, which is far too high since these tasks are CPU-bound,
+        # not I/O-bound. We also know grammar compilation alone would
+        # never dominate CPU usage, so we set the pool size to half the
+        # number of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self._tokenizer = init_tokenizer_from_config(
+            model_config=self.vllm_config.model_config
+        )
+
+        reasoning_parser = (
+            self.vllm_config.structured_outputs_config.reasoning_parser
+        )
+        reasoning_parser_plugin = (
+            self.vllm_config.structured_outputs_config.reasoning_parser_plugin
+        )
+        if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3:
+            ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin)
+
+        if reasoning_parser:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                reasoning_parser
+            )
+            self.reasoner = reasoner_cls(tokenizer=self._tokenizer)
+
+        self._tokenizer_initialized = True
+
     def grammar_init(self, request: Request) -> None:
         if request.structured_output_request is None:
             return
@@ -149,6 +177,9 @@ def grammar_init(self, request: Request) -> None: |
             raise ValueError(f"Unsupported structured output backend: {backend}")
 
         if self._use_async_grammar_compilation:
+            # Ensure the tokenizer (and with it the executor) is initialized.
+            _ = self.tokenizer
+            assert self.executor is not None, "executor initialized with tokenizer"
             grammar = self.executor.submit(self._create_grammar, request)
         else:
             grammar = self._create_grammar(request)  # type: ignore[assignment]
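
The heart of this change is the double-checked locking in the `tokenizer` property. For reference, here is a minimal, self-contained sketch of that pattern under the same assumptions (CPython's GIL, with the flag flipped last); `LazyHolder` and `_load` are illustrative names, not vLLM APIs:

```python
import threading


class LazyHolder:
    """Sketch of thread-safe lazy initialization via double-checked
    locking, mirroring the `tokenizer` property in the diff above.
    `_load` is a hypothetical stand-in for the expensive setup."""

    def __init__(self) -> None:
        self._value = None
        self._initialized = False
        self._lock = threading.Lock()

    def _load(self):
        # Placeholder for one-time expensive work (e.g. building a
        # tokenizer); the locking guarantees it runs at most once.
        return object()

    @property
    def value(self):
        # Fast path: once initialized, readers never touch the lock.
        if not self._initialized:
            with self._lock:
                # Re-check under the lock: another thread may have won
                # the race and finished initialization while we waited.
                if not self._initialized:
                    self._value = self._load()
                    # Flip the flag only after the value is fully set,
                    # so the unlocked fast path never observes a
                    # half-built object (safe in CPython under the GIL).
                    self._initialized = True
        return self._value


if __name__ == "__main__":
    holder = LazyHolder()
    results: list[object] = []
    threads = [
        threading.Thread(target=lambda: results.append(holder.value))
        for _ in range(8)
    ]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert all(r is results[0] for r in results)  # initialized exactly once
```

The same flag-last ordering is why `_init_tokenizer` sets `self._tokenizer_initialized = True` as its final statement.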
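
The executor sizing that moved into `_init_tokenizer` is independent of the locking and can be tried in isolation; the fragment below reproduces just that arithmetic (the counts in the trailing comment are illustrative):

```python
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

# Half the CPUs, rounded up, but never fewer than one worker: CPU-bound
# grammar compilation gains nothing from ThreadPoolExecutor's default
# of cpu_count() * 5 threads.
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
executor = ThreadPoolExecutor(max_workers=max_workers)
# e.g. cpu_count() == 8 -> max_workers == 4; cpu_count() == 1 -> 1
```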