6 changes: 5 additions & 1 deletion sdk/src/beta9/abstractions/integrations/vllm.py
@@ -172,7 +172,7 @@ class VLLM(ASGI):
vllm_version (str):
The version of vLLM that will be installed from PyPI. As the configuration of the vLLM engine depends on the version of vLLM, using a non-default vllm_version might require subclassing VLLMArgs in order to add the missing configuration options. Default is version 0.8.4.
huggingface_hub_version (str):
The version of huggingface_hub that will be installed from PyPI. Different versions of vLLM require different versions of huggingface_hub, so using a non-default vLLM version might also require a non-default huggingface_hub version. Default is version 0.30.2.
workers (int):
The number of workers to run in the container. Default is 1.
concurrent_requests (int):
@@ -194,6 +194,8 @@ class VLLM(ASGI):
The secrets to pass to the container. If you need huggingface authentication to download models, you should set HF_TOKEN in the secrets.
autoscaler (Autoscaler):
The autoscaler to use. Default is a queue depth autoscaler.
checkpoint_enabled (bool):
Whether to enable checkpointing for the endpoint. Default is False. If enabled, the app is checkpointed after the on_start function has completed. On the next invocation, each container restores from that checkpoint and resumes execution instead of booting from a cold start.
vllm_args (VLLMArgs):
The arguments for the vLLM model.

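As the vllm_version docstring above notes, a non-default vLLM release may expose engine options that the bundled VLLMArgs does not yet cover, in which case subclassing VLLMArgs is the suggested escape hatch. The sketch below illustrates that pattern under a few assumptions not confirmed by this diff: that VLLMArgs behaves as a dataclass, that the public import path matches this module, and that `enable_chunked_prefill` is a stand-in for whatever option is actually missing; any other required configuration (such as which model to serve) is omitted.

```python
# Illustrative sketch only: extend VLLMArgs with a hypothetical engine option
# needed by a newer vLLM release. The field name and the pinned version are
# made up for the example; other required configuration is elided.
from dataclasses import dataclass

from beta9.abstractions.integrations.vllm import VLLM, VLLMArgs


@dataclass
class ExtendedVLLMArgs(VLLMArgs):
    # Hypothetical option not exposed by the default VLLMArgs.
    enable_chunked_prefill: bool = True


app = VLLM(
    vllm_version="0.9.1",  # illustrative non-default version
    vllm_args=ExtendedVLLMArgs(),
)
```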
@@ -228,6 +230,7 @@ def __init__(
volumes: Optional[List[Union[Volume, CloudBucket]]] = [],
secrets: Optional[List[str]] = None,
autoscaler: Autoscaler = QueueDepthAutoscaler(),
checkpoint_enabled: bool = False,
vllm_args: VLLMArgs = VLLMArgs(),
):
if vllm_args.download_dir == DEFAULT_VLLM_CACHE_DIR:
@@ -261,6 +264,7 @@ def __init__(
volumes=volumes,
secrets=secrets,
autoscaler=autoscaler,
checkpoint_enabled=checkpoint_enabled,
)

self.chat_template_url = vllm_args.chat_template_url
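Taken together, the diff threads a new checkpoint_enabled flag from the VLLM constructor through to the parent ASGI configuration via super().__init__. A minimal usage sketch, assuming the public import path matches this module and that model selection happens through the VLLMArgs defaults, might look like:

```python
# Sketch of calling the updated constructor. The import path and the HF_TOKEN
# secret name follow the docstring above; the remaining values are illustrative.
from beta9.abstractions.integrations.vllm import VLLM, VLLMArgs

app = VLLM(
    workers=1,
    secrets=["HF_TOKEN"],      # needed if the model download requires Hugging Face auth
    checkpoint_enabled=True,   # checkpoint the container once on_start has completed
    vllm_args=VLLMArgs(),
)
```

Since a vLLM container spends most of its cold start loading model weights and initializing the engine, restoring from a checkpoint taken after on_start should let subsequent containers skip that work and begin serving much sooner.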