Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions core/helm-charts/vllm/xeon-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,39 @@ modelConfigs:
tensor_parallel_size: "{{ .Values.tensor_parallel_size }}"
pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}"

# Qwen/Qwen3-Coder-30B-A3B-Instruct — MoE tool-calling model (~80 GB RAM)
"Qwen/Qwen3-Coder-30B-A3B-Instruct":
  configMapValues:
    # GiB of host RAM reserved for the CPU KV cache.
    VLLM_CPU_KVCACHE_SPACE: "10"
    # RPC timeout (ms) between engine processes; generous for slow CPU startup.
    VLLM_RPC_TIMEOUT: "100000"
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: "1"
    VLLM_ENGINE_ITERATION_TIMEOUT_S: "120"
    VLLM_CPU_NUM_OF_RESERVED_CPU: "0"
    VLLM_CPU_SGL_KERNEL: "1"
    # Disable the Xet-backed Hugging Face Hub download path.
    HF_HUB_DISABLE_XET: "1"
    LOGNAME: "vllm"
  # Block-style sequence (one arg per line) instead of a multi-line flow
  # sequence, and dashed flag spelling throughout — vLLM's argument parser
  # treats "--enable_chunked_prefill" and "--enable-chunked-prefill" as the
  # same flag, so normalizing to dashes matches the other args here.
  extraCmdArgs:
    - "--block-size"
    - "128"
    - "--dtype"
    - "bfloat16"
    - "--max-model-len"
    - "32768"
    - "--distributed-executor-backend"
    - "mp"
    - "--enable-chunked-prefill"
    - "--enforce-eager"
    - "--max-num-batched-tokens"
    - "2048"
    - "--max-num-seqs"
    - "8"
    - "--enable-auto-tool-choice"
    - "--tool-call-parser"
    - "qwen3_coder"
  tensor_parallel_size: "{{ .Values.tensor_parallel_size }}"
  pipeline_parallel_size: "{{ .Values.pipeline_parallel_size }}"

defaultModelConfigs:
configMapValues:
VLLM_CPU_KVCACHE_SPACE: "40"
Expand Down
12 changes: 10 additions & 2 deletions core/lib/models/model-selection.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,10 @@ model_selection(){
echo "24. deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
echo "25. Qwen/Qwen3-1.7B"
echo "26. Qwen/Qwen3-4B-Instruct-2507"
echo "27. Qwen/Qwen3-Coder-30B-A3B-Instruct"
# -r keeps read from interpreting backslashes in user input (ShellCheck SC2162)
read -rp "Enter the number of the CPU model you want to deploy/remove: " cpu_model
# Validate input: must be exactly one of the CPU menu numbers listed above
if ! [[ "$cpu_model" =~ ^(21|22|23|24|25|26|27)$ ]]; then
  echo "Error: Invalid model selected ($cpu_model). Exiting." >&2
  exit 1
fi
Expand Down Expand Up @@ -242,14 +243,21 @@ get_model_names() {
fi
model_names+=("cpu-qwen3-4b")
;;
27)
  # Menu choice 27 = Qwen3-Coder-30B, a CPU-only entry; reject if the caller
  # asked for a GPU deployment/removal.
  if [ "$cpu_or_gpu" = "g" ]; then
    echo "Error: CPU model identifier provided for GPU deployment/removal." >&2
    exit 1
  fi
  # Short identifier consumed by the deploy playbook's install/uninstall tags.
  model_names+=("cpu-qwen3-coder-30b")
  ;;
"llama-8b"|"llama-70b"|"codellama-34b"|"mixtral-8x-7b"|"mistral-7b"|"tei"|"tei-rerank"|"falcon3-7b"|"deepseek-r1-distill-qwen-32b"|"deepseek-r1-distill-llama8b"|"llama3-405b"|"llama-3-3-70b"|"llama-4-scout-17b"|"qwen-2-5-32b")
if [ "$cpu_or_gpu" = "c" ]; then
echo "Error: GPU model identifier provided for CPU deployment/removal." >&2
exit 1
fi
model_names+=("$model")
;;
"cpu-llama-8b"|"cpu-deepseek-r1-distill-qwen-32b"|"cpu-deepseek-r1-distill-llama8b"|"cpu-qwen3-1-7b"|"cpu-llama-3-2-3b"|"cpu-qwen3-4b")
"cpu-llama-8b"|"cpu-deepseek-r1-distill-qwen-32b"|"cpu-deepseek-r1-distill-llama8b"|"cpu-qwen3-1-7b"|"cpu-llama-3-2-3b"|"cpu-qwen3-4b"|"cpu-qwen3-coder-30b")
if [ "$cpu_or_gpu" = "g" ]; then
echo "Error: CPU model identifier provided for GPU deployment/removal." >&2
exit 1
Expand Down
104 changes: 104 additions & 0 deletions core/playbooks/deploy-inference-models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2582,6 +2582,110 @@
- "'cpu-qwen3-4b' in (model_name_list | regex_replace(',', ' ') | split())"


# Look up an existing Helm release for the CPU Qwen3-Coder-30B deployment;
# "helm list --short" prints the release name, or nothing when absent.
- name: Check if CPU Qwen3-Coder-30B is installed
  ansible.builtin.command:
    cmd: "helm list --filter vllm-qwen3-coder-30b-cpu --short"
  register: helm_release_installed
  # Pure query: it never mutates cluster state, so don't report "changed".
  changed_when: false
  ignore_errors: true
  run_once: true
  tags: uninstall-cpu-qwen3-coder-30b
# Remove the Helm release, but only when uninstall mode was requested, this
# model is in the requested list, and the preceding check actually found a
# matching release.
- name: Uninstall Qwen3-Coder-30B
  ansible.builtin.command:
    cmd: helm uninstall vllm-qwen3-coder-30b-cpu
  run_once: true
  tags: uninstall-cpu-qwen3-coder-30b
  when:
    - model_name_list is defined
    - "'cpu-qwen3-coder-30b' in (model_name_list | regex_replace(',', ' ') | split())"
    - uninstall_true == 'true'
    - helm_release_installed.stdout | length > 0

# Deploy the Qwen3-Coder-30B vLLM chart on CPU nodes. Gated at the bottom so
# it only runs for CPU playbooks, in install mode, when 'cpu-qwen3-coder-30b'
# is in the requested model list.
- name: Deploy CPU Qwen3-Coder-30B LLM model
  block:
    # Clear any stale Ingress objects from a previous install so the --force
    # upgrade below can recreate them cleanly.
    - name: Delete Ingress resource Qwen3-Coder-30B from default namespace
      tags: install-cpu-qwen3-coder-30b
      ignore_errors: yes
      kubernetes.core.k8s:
        kind: Ingress
        namespace: default
        name: vllm-qwen3-coder-30b-cpu-ingress
        state: absent
    - name: Delete Ingress resource Qwen3-Coder-30B from auth-apisix namespace
      tags: install-cpu-qwen3-coder-30b
      ignore_errors: yes
      kubernetes.core.k8s:
        kind: Ingress
        namespace: auth-apisix
        name: vllm-qwen3-coder-30b-cpu-ingress
        state: absent
    # The command string is Jinja-rendered before execution; each {% if %}
    # section appends platform-specific --set flags (balloon/CPU sizing,
    # APISIX, OpenShift route vs. ingress, EKS cert, Keycloak OIDC).
    # NOTE(review): the rendered command embeds hugging_face_token and the
    # result is registered — consider no_log: true so the token is not
    # written to task output; confirm with the team before changing.
    - name: Deploy CPU based LLM model Qwen3-Coder-30B Installation
      ansible.builtin.command: >-
        helm upgrade --install vllm-qwen3-coder-30b-cpu "{{ remote_helm_charts_base }}/vllm"
        --values "{{ xeon_values_file }}"
        --set LLM_MODEL_ID="Qwen/Qwen3-Coder-30B-A3B-Instruct"
        --set global.monitoring="{{ vllm_metrics_enabled }}"
        --set svcmonitor.enabled="{{ vllm_metrics_enabled }}"
        --set global.HUGGINGFACEHUB_API_TOKEN={{ hugging_face_token }}
        {% if cpu_playbook == 'true' %}
        --set cpu_balloon_annotation="vllm-balloon"
        --set podLabels.name="vllm"
        --set cpu="{{ optimal_balloon_config.workload_cpus | default(8) }}"
        --set memory="{{ optimal_memory_gb | default(8) }}Gi"
        --set tensor_parallel_size={{ tensor_parallel_size | default(1) }}
        --set pipeline_parallel_size={{ pipeline_parallel_size | default(1) }}
        {% endif %}
        {% if apisix_enabled %}
        --set apisix.enabled={{ apisix_enabled }}
        --set platform={{ kubernetes_platform }}
        {% endif %}
        {% if kubernetes_platform == 'openshift' %}
        --set route.enabled=true
        --set route.host={{ secret_name }}
        --set route.tls.termination=edge
        --set route.tls.insecureEdgeTerminationPolicy=Redirect
        --set ingress.enabled=false
        {% elif ingress_enabled %}
        --set ingress.enabled={{ ingress_enabled }}
        --set ingress.host={{ secret_name }}
        --set ingress.secretname={{ secret_name }}
        --set route.enabled=false
        {% if kubernetes_platform == "eks" %}
        --set aws_certificate_arn={{ aws_certificate_arn | default('') }}
        {% endif %}
        {% endif %}
        {% if deploy_keycloak == 'yes' and apisix_enabled %}
        --set oidc.client_id={{ keycloak_client_id | default('') }}
        --set oidc.client_secret={{ client_secret | default('') }}
        {% endif %}
        {{ helm_proxy_args | default('') }}
        --force
      register: helm_upgrade_install_model_deployment_cpu_qwen3_coder_30b
      failed_when: helm_upgrade_install_model_deployment_cpu_qwen3_coder_30b.rc != 0
    # Register the model with the GenAI gateway (LiteLLM-style entry), only
    # when both the model-install tag and the gateway tag were requested.
    - name: Register Xeon/CPU Qwen3-Coder-30B model
      import_tasks: register-model-genai-gateway.yml
      vars:
        reg_model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct"
        reg_litellm_model: "openai/Qwen/Qwen3-Coder-30B-A3B-Instruct"
        reg_custom_llm_provider: "openai"
        # In-cluster service URL exposing the OpenAI-compatible API.
        reg_api_base: "http://vllm-qwen3-coder-30b-cpu-service.default/v1"
        reg_input_cost_per_token: 0.001
        reg_output_cost_per_token: 0.002
      tags:
        - install-cpu-qwen3-coder-30b
        - install-genai-gateway
      run_once: true
      when:
        - "'install-cpu-qwen3-coder-30b' in ansible_run_tags"
        - "'install-genai-gateway' in ansible_run_tags"
  run_once: true
  tags: install-cpu-qwen3-coder-30b
  when:
    - model_name_list is defined
    - cpu_playbook == 'true'
    - install_true == 'true'
    - "'cpu-qwen3-coder-30b' in (model_name_list | regex_replace(',', ' ') | split())"



- name: List of Models to be Installed
tags: always
Expand Down
2 changes: 2 additions & 0 deletions sample_solutions/AgenticCodeExecution/.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# === Domain selection (retail, airline, stocks, banking, triage) ===
# MCP_DOMAIN=retail
24 changes: 24 additions & 0 deletions sample_solutions/AgenticCodeExecution/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Python bytecode/cache
__pycache__/
*.py[cod]

# Environment files: ignore variant env files (.env.local, .env.dev, ...)
# but keep the example template and the base .env; the negations also
# re-include these if a broader pattern elsewhere ignores them.
.env.*
!.env.example
!.env

# Prompt-builder generated artifacts (recreated by the optimization runs)
prompt-builder/data/retail/runs/
prompt-builder/data/retail/optimization_report.json
prompt-builder/data/retail/retail_optimized_system_prompt.txt
prompt-builder/data/retail/retail_first10_validation.json
prompt-builder/data/retail/sections/section1_generic.txt
prompt-builder/data/retail/sections/section2_seed.txt
prompt-builder/data/retail/sections/section3_policy.txt

# tau2-bench database files (downloaded separately, see README)
examples/airline/data/db.json
examples/retail/data/db.json

# Runtime session data
examples/session_dbs/
Loading