|
16 | 16 |
|
17 | 17 | import yaml |
18 | 18 | from model_engine_server.common.config import hmi_config |
19 | | -from model_engine_server.core.config import infra_config |
20 | 19 | from model_engine_server.common.dtos.batch_jobs import CreateDockerImageBatchJobResourceRequests |
21 | 20 | from model_engine_server.common.dtos.llms import ( |
22 | 21 | ChatCompletionV2Request, |
|
62 | 61 | from model_engine_server.common.dtos.tasks import SyncEndpointPredictV1Request, TaskStatus |
63 | 62 | from model_engine_server.common.resource_limits import validate_resource_requests |
64 | 63 | from model_engine_server.core.auth.authentication_repository import User |
| 64 | +from model_engine_server.core.config import infra_config |
65 | 65 | from model_engine_server.core.configmap import read_config_map |
66 | 66 | from model_engine_server.core.loggers import ( |
67 | 67 | LoggerTagKey, |
@@ -373,7 +373,7 @@ def check_docker_image_exists_for_image_tag( |
373 | 373 | # Skip ECR validation for on-prem deployments - images are in local registry |
374 | 374 | if infra_config().cloud_provider == "onprem": |
375 | 375 | return |
376 | | - |
| 376 | + |
377 | 377 | if not self.docker_repository.image_exists( |
378 | 378 | image_tag=framework_image_tag, |
379 | 379 | repository_name=repository_name, |
@@ -638,9 +638,11 @@ def load_model_weights_sub_commands_s3( |
638 | 638 | file_selection_str = '--include "*.model" --include "*.model.v*" --include "*.json" --include "*.safetensors" --include "*.txt" --exclude "optimizer*"' |
639 | 639 | if trust_remote_code: |
640 | 640 | file_selection_str += ' --include "*.py"' |
641 | | - |
| 641 | + |
642 | 642 | # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
643 | | - endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 643 | + endpoint_flag = ( |
| 644 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 645 | + ) |
644 | 646 | subcommands.append( |
645 | 647 | f"{s5cmd} {endpoint_flag} --numworkers 512 cp --concurrency 10 {file_selection_str} {os.path.join(checkpoint_path, '*')} {final_weights_folder}" |
646 | 648 | ) |
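
For context, the shell substitution above simply drops the flag when S3_ENDPOINT_URL is unset. Below is a minimal Python sketch of the equivalent logic (the MinIO address is hypothetical); the same conditional flag is reused in the TRT-LLM weight-loading path in the next hunk.

import os

def s5cmd_endpoint_args() -> list[str]:
    # Python equivalent of the shell $(if ...) substitution above: emit the
    # --endpoint-url flag only when S3_ENDPOINT_URL is set (MinIO / on-prem
    # S3-compatible storage); otherwise fall back to the default AWS S3 endpoint.
    endpoint_url = os.environ.get("S3_ENDPOINT_URL")
    return ["--endpoint-url", endpoint_url] if endpoint_url else []

# With S3_ENDPOINT_URL=http://minio.local:9000 (hypothetical address):
#   ["--endpoint-url", "http://minio.local:9000"]
# With the variable unset: []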
@@ -695,7 +697,9 @@ def load_model_files_sub_commands_trt_llm( |
695 | 697 | """ |
696 | 698 | if checkpoint_path.startswith("s3://"): |
697 | 699 | # Support for MinIO/on-prem S3-compatible storage via S3_ENDPOINT_URL env var |
698 | | - endpoint_flag = '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 700 | + endpoint_flag = ( |
| 701 | + '$(if [ -n "$S3_ENDPOINT_URL" ]; then echo "--endpoint-url $S3_ENDPOINT_URL"; fi)' |
| 702 | + ) |
699 | 703 | subcommands = [ |
700 | 704 | f"./s5cmd {endpoint_flag} --numworkers 512 cp --concurrency 50 {os.path.join(checkpoint_path, '*')} ./" |
701 | 705 | ] |
@@ -1028,8 +1032,9 @@ async def create_vllm_bundle( |
1028 | 1032 | protocol="http", |
1029 | 1033 | readiness_initial_delay_seconds=10, |
1030 | 1034 | healthcheck_route="/health", |
1031 | | - predict_route="/predict", |
1032 | | - streaming_predict_route="/stream", |
| 1035 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1036 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1037 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1033 | 1038 | routes=[ |
1034 | 1039 | OPENAI_CHAT_COMPLETION_PATH, |
1035 | 1040 | OPENAI_COMPLETION_PATH, |
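
With predict_route and streaming_predict_route now pointing at the OpenAI-compatible completions path, requests to the bundle follow the standard OpenAI completions schema rather than the old /predict payload. A hedged sketch follows (host, model name, and prompt are illustrative, not taken from this PR); the multinode bundle in the next hunk is wired the same way.

import requests

# Non-streaming completion against the new predict_route ("/v1/completions").
# Setting "stream": True streams over the same route, matching
# streaming_predict_route above.
resp = requests.post(
    "http://my-llm-endpoint.internal/v1/completions",  # hypothetical endpoint host
    json={
        "model": "my-model",       # hypothetical deployed model name
        "prompt": "Hello, world",
        "max_tokens": 16,
        "stream": False,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["text"])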
@@ -1110,8 +1115,9 @@ async def create_vllm_multinode_bundle( |
1110 | 1115 | protocol="http", |
1111 | 1116 | readiness_initial_delay_seconds=10, |
1112 | 1117 | healthcheck_route="/health", |
1113 | | - predict_route="/predict", |
1114 | | - streaming_predict_route="/stream", |
| 1118 | + # vLLM 0.5+ uses OpenAI-compatible endpoints |
| 1119 | + predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" |
| 1120 | + streaming_predict_route=OPENAI_COMPLETION_PATH, # "/v1/completions" (streaming via same endpoint) |
1115 | 1121 | routes=[OPENAI_CHAT_COMPLETION_PATH, OPENAI_COMPLETION_PATH], |
1116 | 1122 | env=common_vllm_envs, |
1117 | 1123 | worker_command=worker_command, |
@@ -1912,18 +1918,42 @@ def model_output_to_completion_output( |
1912 | 1918 |
|
1913 | 1919 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
1914 | 1920 | tokens = None |
1915 | | - if with_token_probs: |
1916 | | - tokens = [ |
1917 | | - TokenOutput( |
1918 | | - token=model_output["tokens"][index], |
1919 | | - log_prob=list(t.values())[0], |
1920 | | - ) |
1921 | | - for index, t in enumerate(model_output["log_probs"]) |
1922 | | - ] |
| 1921 | + # Handle OpenAI-compatible format (vLLM 0.5+) vs legacy format |
| 1922 | + if "choices" in model_output and model_output["choices"]: |
| 1923 | + # OpenAI-compatible format: {"choices": [{"text": "...", ...}], "usage": {...}} |
| 1924 | + choice = model_output["choices"][0] |
| 1925 | + text = choice.get("text", "") |
| 1926 | + usage = model_output.get("usage", {}) |
| 1927 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 1928 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 1929 | + # OpenAI format logprobs are in choice.logprobs |
| 1930 | + if with_token_probs and choice.get("logprobs"): |
| 1931 | + logprobs = choice["logprobs"] |
| 1932 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 1933 | + tokens = [ |
| 1934 | + TokenOutput( |
| 1935 | + token=logprobs["tokens"][i], |
| 1936 | + log_prob=logprobs["token_logprobs"][i] or 0.0, |
| 1937 | + ) |
| 1938 | + for i in range(len(logprobs["tokens"])) |
| 1939 | + ] |
| 1940 | + else: |
| 1941 | + # Legacy format: {"text": "...", "count_prompt_tokens": ..., ...} |
| 1942 | + text = model_output["text"] |
| 1943 | + num_prompt_tokens = model_output["count_prompt_tokens"] |
| 1944 | + num_completion_tokens = model_output["count_output_tokens"] |
| 1945 | + if with_token_probs and model_output.get("log_probs"): |
| 1946 | + tokens = [ |
| 1947 | + TokenOutput( |
| 1948 | + token=model_output["tokens"][index], |
| 1949 | + log_prob=list(t.values())[0], |
| 1950 | + ) |
| 1951 | + for index, t in enumerate(model_output["log_probs"]) |
| 1952 | + ] |
1923 | 1953 | return CompletionOutput( |
1924 | | - text=model_output["text"], |
1925 | | - num_prompt_tokens=model_output["count_prompt_tokens"], |
1926 | | - num_completion_tokens=model_output["count_output_tokens"], |
| 1954 | + text=text, |
| 1955 | + num_prompt_tokens=num_prompt_tokens, |
| 1956 | + num_completion_tokens=num_completion_tokens, |
1927 | 1957 | tokens=tokens, |
1928 | 1958 | ) |
1929 | 1959 | elif model_content.inference_framework == LLMInferenceFramework.LIGHTLLM: |
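
To make the VLLM branch above concrete, these are the two response shapes it distinguishes and the single CompletionOutput they both map to (field values are illustrative).

# OpenAI-compatible response (vLLM 0.5+): text and logprobs live under
# "choices", token counts under "usage".
openai_style = {
    "choices": [
        {
            "text": " world",
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
        }
    ],
    "usage": {"prompt_tokens": 3, "completion_tokens": 1},
}

# Legacy vLLM server response: flat fields with count_* token counts and a
# list of per-token {token: logprob} dicts.
legacy_style = {
    "text": " world",
    "count_prompt_tokens": 3,
    "count_output_tokens": 1,
    "tokens": [" world"],
    "log_probs": [{" world": -0.12}],
}

# Either shape yields CompletionOutput(text=" world", num_prompt_tokens=3,
# num_completion_tokens=1, tokens=[TokenOutput(token=" world", log_prob=-0.12)]).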
@@ -2663,20 +2693,43 @@ async def _response_chunk_generator( |
2663 | 2693 | # VLLM |
2664 | 2694 | elif model_content.inference_framework == LLMInferenceFramework.VLLM: |
2665 | 2695 | token = None |
2666 | | - if request.return_token_log_probs: |
2667 | | - token = TokenOutput( |
2668 | | - token=result["result"]["text"], |
2669 | | - log_prob=list(result["result"]["log_probs"].values())[0], |
2670 | | - ) |
2671 | | - finished = result["result"]["finished"] |
2672 | | - num_prompt_tokens = result["result"]["count_prompt_tokens"] |
| 2696 | + vllm_output: dict = result["result"] |
| 2697 | + # Handle OpenAI-compatible streaming format (vLLM 0.5+) vs legacy format |
| 2698 | + if "choices" in vllm_output and vllm_output["choices"]: |
| 2699 | + # OpenAI streaming format: {"choices": [{"text": "...", "finish_reason": ...}], ...} |
| 2700 | + choice = vllm_output["choices"][0] |
| 2701 | + text = choice.get("text", "") |
| 2702 | + finished = choice.get("finish_reason") is not None |
| 2703 | + usage = vllm_output.get("usage", {}) |
| 2704 | + num_prompt_tokens = usage.get("prompt_tokens", 0) |
| 2705 | + num_completion_tokens = usage.get("completion_tokens", 0) |
| 2706 | + if request.return_token_log_probs and choice.get("logprobs"): |
| 2707 | + logprobs = choice["logprobs"] |
| 2708 | + if logprobs.get("tokens") and logprobs.get("token_logprobs"): |
| 2709 | + # Get the last token from the logprobs |
| 2710 | + idx = len(logprobs["tokens"]) - 1 |
| 2711 | + token = TokenOutput( |
| 2712 | + token=logprobs["tokens"][idx], |
| 2713 | + log_prob=logprobs["token_logprobs"][idx] or 0.0, |
| 2714 | + ) |
| 2715 | + else: |
| 2716 | + # Legacy format: {"text": "...", "finished": ..., ...} |
| 2717 | + text = vllm_output["text"] |
| 2718 | + finished = vllm_output["finished"] |
| 2719 | + num_prompt_tokens = vllm_output["count_prompt_tokens"] |
| 2720 | + num_completion_tokens = vllm_output["count_output_tokens"] |
| 2721 | + if request.return_token_log_probs and vllm_output.get("log_probs"): |
| 2722 | + token = TokenOutput( |
| 2723 | + token=vllm_output["text"], |
| 2724 | + log_prob=list(vllm_output["log_probs"].values())[0], |
| 2725 | + ) |
2673 | 2726 | yield CompletionStreamV1Response( |
2674 | 2727 | request_id=request_id, |
2675 | 2728 | output=CompletionStreamOutput( |
2676 | | - text=result["result"]["text"], |
| 2729 | + text=text, |
2677 | 2730 | finished=finished, |
2678 | 2731 | num_prompt_tokens=num_prompt_tokens if finished else None, |
2679 | | - num_completion_tokens=result["result"]["count_output_tokens"], |
| 2732 | + num_completion_tokens=num_completion_tokens, |
2680 | 2733 | token=token, |
2681 | 2734 | ), |
2682 | 2735 | ) |
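
Similarly, the streaming branch distinguishes two chunk shapes; a sketch of each (values illustrative) shows where text, finished, and the token counts come from.

# OpenAI-compatible streaming chunk (vLLM 0.5+): there is no explicit
# "finished" flag, so completion is inferred from finish_reason, which stays
# None until the final chunk (e.g. "stop" or "length").
openai_chunk = {
    "choices": [
        {
            "text": " world",
            "finish_reason": None,
            "logprobs": {"tokens": [" world"], "token_logprobs": [-0.12]},
        }
    ],
    "usage": {"prompt_tokens": 3, "completion_tokens": 1},
}

# Legacy streaming chunk: explicit finished flag and count_* fields; log_probs
# is a single {token: logprob} mapping for the emitted token.
legacy_chunk = {
    "text": " world",
    "finished": False,
    "count_prompt_tokens": 3,
    "count_output_tokens": 1,
    "log_probs": {" world": -0.12},
}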
|