|
16 | 16 |
|
17 | 17 | import yaml |
18 | 18 | from model_engine_server.common.config import hmi_config |
19 | | -from model_engine_server.core.config import infra_config |
20 | 19 | from model_engine_server.common.dtos.batch_jobs import CreateDockerImageBatchJobResourceRequests |
21 | 20 | from model_engine_server.common.dtos.llms import ( |
22 | 21 | ChatCompletionV2Request, |
|
62 | 61 | from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Request, TaskStatus |
63 | 62 | from model_engine_server.common.resource_limits import validate_resource_requests |
64 | 63 | from model_engine_server.core.auth.authentication_repository import User |
| 64 | +from model_engine_server.core.config import infra_config |
65 | 65 | from model_engine_server.core.configmap import read_config_map |
66 | 66 | from model_engine_server.core.loggers import ( |
67 | 67 | LoggerTagKey, |
@@ -373,7 +373,7 @@ def check_docker_image_exists_for_image_tag( |
373 | 373 | # Skip ECR validation for on-prem deployments - images are in local registry |
374 | 374 | if infra_config().cloud_provider == "onprem": |
375 | 375 | return |
376 | | - |
| 376 | + |
377 | 377 | if not self.docker_repository.image_exists( |
378 | 378 | image_tag=framework_image_tag, |
379 | 379 | repository_name=repository_name, |
@@ -638,9 +638,11 @@ def load_model_weights_sub_commands_s3( |
638 | 638 | file_selection_str = '--include "*.model" --include "*.model.v*" --include "*.json" --include "*.safetensors" --include "*.txt" --exclude "optimizer*"' |
639 | 639 | if trust_remote_code: |
640 | 640 | file_selection_str += ' --include "*.py"' |
641 | | - |
| 641 | + |
642 | 642 | # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
643 | | - endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 643 | + endpoint_flag = ( |
| 644 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 645 | + ) |
644 | 646 | subcommands.append( |
645 | 647 | f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
646 | 648 | ) |
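
For context, the shell substitution above simply drops the flag when S3_ENDPOINT_URL is unset. Below is a minimal Python sketch of the equivalent logic (the MinIO address is hypothetical); the same conditional flag is reused in the TRT-LLM weight-loading path in the next hunk.

import os

def s5cmd_endpoint_args() -> list[str]:
    # Python equivalent of the shell $(if ...) substitution above: emit the
    # --endpoint-url flag only when S3_ENDPOINT_URL is set (MinIO / on-prem
    # S3-compatible storage); otherwise fall back to the default AWS S3 endpoint.
    endpoint_url = os.environ.get("S3_ENDPOINT_URL")
    return ["--endpoint-url", endpoint_url] if endpoint_url else []

# With S3_ENDPOINT_URL=http://minio.local:9000 (hypothetical address):
#   ["--endpoint-url", "http://minio.local:9000"]
# With the variable unset: []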
@@ -695,7 +697,9 @@ def load_model_files_sub_commands_trt_llm( |
695 | 697 | """ |
696 | 698 | if checkpoint_path.startswith("s3://"): |
697 | 699 | # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
698 | | - endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 700 | + endpoint_flag = ( |
| 701 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 702 | + ) |
699 | 703 | subcommands = [ |
700 | 704 | f"./s5cmd {endpoint_flag} --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
701 | 705 | ] |
@@ -1028,8 +1032,9 @@ async def create_vllm_bundle( |
1028 | 1032 | protocol="http", |
1029 | 1033 | readiness_initial_delay_seconds=10, |
1030 | 1034 | healthcheck_route="/health", |
1031 | | - predict_route="/predict", |
1032 | | - streaming_predict_route="/stream", |
| 1035 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1036 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1037 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1033 | 1038 | routes=[ |
1034 | 1039 | OPENAI_CHAT_COMPLETION_PATH, |
1035 | 1040 | OPENAI_COMPLETION_PATH, |
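
With predict_route and streaming_predict_route now pointing at the OpenAI-compatible completions path, requests to the bundle follow the standard OpenAI completions schema rather than the old /predict payload. A hedged sketch follows (host, model name, and prompt are illustrative, not taken from this PR); the multinode bundle in the next hunk is wired the same way.

import requests

# Non-streaming completion against the new predict_route ("/v1/completions").
# Setting "stream": True streams over the same route, matching
# streaming_predict_route above.
resp = requests.post(
    "http://my-llm-endpoint.internal/v1/completions",  # hypothetical endpoint host
    json={
        "model": "my-model",       # hypothetical deployed model name
        "prompt": "Hello, world",
        "max_tokens": 16,
        "stream": False,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["text"])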
@@ -1110,8 +1115,9 @@ async def create_vllm_multinode_bundle( |
1110 | 1115 | protocol="http", |
1111 | 1116 | readiness_initial_delay_seconds=10, |
1112 | 1117 | healthcheck_route="/health", |
1113 | | - predict_route="/predict", |
1114 | | - streaming_predict_route="/stream", |
| 1118 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1119 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1120 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1115 | 1121 | routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], |
1116 | 1122 | env=common_vllm_envs, |
1117 | 1123 | worker_command=worker_command, |
@@ -1912,18 +1918,42 @@ def model_output_to_completion_output( |
1912 | 1918 |
|
1913 | 1919 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
1914 | 1920 | tokens = None |
1915 | | - if with_token_probs: |
1916 | | - tokens = [ |
1917 | | - TokenOutput( |
1918 | | - token=model_output["tokens"][index], |
1919 | | - log_prob=list(t.values())[0], |
1920 | | - ) |
1921 | | - for index, t in enumerate(model_output["log_probs"]) |
1922 | | - ] |
| 1921 | + # Handle OpenAI-compatible format (vLLM 0.5+) vs legacy format |
| 1922 | + if "choices" in model_output and model_output["choices"]: |
| 1923 | + # OpenAI-compatible format: {"choices": [{"text": "...", ...}], "usage": {...}} |
| 1924 | + choice = model_output["choices"][0] |
| 1925 | + text = choice.get("text", "") |
| 1926 | + usage = model_output.get("usage", {}) |
| 1927 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 1928 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 1929 | + # OpenAI format logprobs are in choice.logprobs |
| 1930 | + if with_token_probs and choice.get("logprobs"): |
| 1931 | + logprobs = choice["logprobs"] |
| 1932 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 1933 | + tokens = [ |
| 1934 | + TokenOutput( |
| 1935 | + token=logprobs["tokens"][i], |
| 1936 | + log_prob=logprobs["token_logprobs"][i] or 0.0, |
| 1937 | + ) |
| 1938 | + for i in range(len(logprobs["tokens"])) |
| 1939 | + ] |
| 1940 | + else: |
| 1941 | + # Legacy format: {"text": "...", "count_prompt_tokens": ..., ...} |
| 1942 | + text = model_output["text"] |
| 1943 | + num_prompt_tokens = model_output["count_prompt_tokens"] |
| 1944 | + num_completion_tokens = model_output["count_output_tokens"] |
| 1945 | + if with_token_probs and model_output.get("log_probs"): |
| 1946 | + tokens = [ |
| 1947 | + TokenOutput( |
| 1948 | + token=model_output["tokens"][index], |
| 1949 | + log_prob=list(t.values())[0], |
| 1950 | + ) |
| 1951 | + for index, t in enumerate(model_output["log_probs"]) |
| 1952 | + ] |
1923 | 1953 | return CompletionOutput( |
1924 | | - text=model_output["text"], |
1925 | | - num_prompt_tokens=model_output["count_prompt_tokens"], |
1926 | | - num_completion_tokens=model_output["count_output_tokens"], |
| 1954 | + text=text, |
| 1955 | + num_prompt_tokens=num_prompt_tokens, |
| 1956 | + num_completion_tokens=num_completion_tokens, |
1927 | 1957 | tokens=tokens, |
1928 | 1958 | ) |
1929 | 1959 | elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
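
To make the VLLM branch above concrete, these are the two response shapes it distinguishes and the single CompletionOutput they both map to (field values are illustrative).

# OpenAI-compatible response (vLLM 0.5+): text and logprobs live under
# "choices", token counts under "usage".
openai_style = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
        }
    ],
    "usage": {"prompt_tokens": 3, "completion_tokens": 1},
}

# Legacy vLLM server response: flat fields with count_* token counts and a
# list of per-token {token: logprob} dicts.
legacy_style = {
    "text": " world",
    "count_prompt_tokens": 3,
    "count_output_tokens": 1,
    "tokens": [" world"],
    "log_probs": [{" world": -0.12}],
}

# Either shape yields CompletionOutput(text=" world", num_prompt_tokens=3,
# num_completion_tokens=1, tokens=[TokenOutput(token=" world", log_prob=-0.12)]).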
@@ -2663,20 +2693,43 @@ async def _response_chunk_generator( |
2663 | 2693 | # VLLM |
2664 | 2694 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
2665 | 2695 | token = None |
2666 | | - if request.return_token_log_probs: |
2667 | | - token = TokenOutput( |
2668 | | - token=result["result"]["text"], |
2669 | | - log_prob=list(result["result"]["log_probs"].values())[0], |
2670 | | - ) |
2671 | | - finished = result["result"]["finished"] |
2672 | | - num_prompt_tokens = result["result"]["count_prompt_tokens"] |
| 2696 | + vllm_output: dict = result["result"] |
| 2697 | + # Handle OpenAI-compatible streaming format (vLLM 0.5+) vs legacy format |
| 2698 | + if "choices" in vllm_output and vllm_output["choices"]: |
| 2699 | + # OpenAI streaming format: {"choices": [{"text": "...", "finish_reason": ...}], ...} |
| 2700 | + choice = vllm_output["choices"][0] |
| 2701 | + text = choice.get("text", "") |
| 2702 | + finished = choice.get("finish_reason") is not None |
| 2703 | + usage = vllm_output.get("usage", {}) |
| 2704 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 2705 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 2706 | + if request.return_token_log_probs and choice.get("logprobs"): |
| 2707 | + logprobs = choice["logprobs"] |
| 2708 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 2709 | + # Get the last token from the logprobs |
| 2710 | + idx = len(logprobs["tokens"]) - 1 |
| 2711 | + token = TokenOutput( |
| 2712 | + token=logprobs["tokens"][idx], |
| 2713 | + log_prob=logprobs["token_logprobs"][idx] or 0.0, |
| 2714 | + ) |
| 2715 | + else: |
| 2716 | + # Legacy format: {"text": "...", "finished": ..., ...} |
| 2717 | + text = vllm_output["text"] |
| 2718 | + finished = vllm_output["finished"] |
| 2719 | + num_prompt_tokens = vllm_output["count_prompt_tokens"] |
| 2720 | + num_completion_tokens = vllm_output["count_output_tokens"] |
| 2721 | + if request.return_token_log_probs and vllm_output.get("log_probs"): |
| 2722 | + token = TokenOutput( |
| 2723 | + token=vllm_output["text"], |
| 2724 | + log_prob=list(vllm_output["log_probs"].values())[0], |
| 2725 | + ) |
2673 | 2726 | yield CompletionStreamV1Response( |
2674 | 2727 | request_id=request_id, |
2675 | 2728 | output=CompletionStreamOutput( |
2676 | | - text=result["result"]["text"], |
| 2729 | + text=text, |
2677 | 2730 | finished=finished, |
2678 | 2731 | num_prompt_tokens=num_prompt_tokens if finished else None, |
2679 | | - num_completion_tokens=result["result"]["count_output_tokens"], |
| 2732 | + num_completion_tokens=num_completion_tokens, |
2680 | 2733 | token=token, |
2681 | 2734 | ), |
2682 | 2735 | ) |
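
Similarly, the streaming branch distinguishes two chunk shapes; a sketch of each (values illustrative) shows where text, finished, and the token counts come from.

# OpenAI-compatible streaming chunk (vLLM 0.5+): there is no explicit
# "finished" flag, so completion is inferred from finish_reason, which stays
# None until the final chunk (e.g. "stop" or "length").
openai_chunk = {
    "choices": [
        {
            "text": " world",
            "finish_reason": None,
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
        }
    ],
    "usage": {"prompt_tokens": 3, "completion_tokens": 1},
}

# Legacy streaming chunk: explicit finished flag and count_* fields; log_probs
# is a single {token: logprob} mapping for the emitted token.
legacy_chunk = {
    "text": " world",
    "finished": False,
    "count_prompt_tokens": 3,
    "count_output_tokens": 1,
    "log_probs": {" world": -0.12},
}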
|