75 changes: 56 additions & 19 deletions demos/code_local_assistant/README.md
@@ -20,16 +20,8 @@ set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context
ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB of disk space, and 19GB+ of VRAM on the GPU is recommended.
:::

:::{tab-item} OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
```bat
mkdir c:\models
set MOE_USE_MICRO_GEMM_PREFILL=0 # temporary workaround to improve accuracy with long context
ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU.
> **Note:** An int8 variant is also available: `OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov`. It offers higher accuracy but requires 34GB+ of VRAM on the GPU.
:::

:::{tab-item} OpenVINO/gpt-oss-20b-int4-ov
@@ -57,6 +49,14 @@ ovms --model_repository_path c:\models --source_model OpenVINO/Qwen3-8B-int4-cw-
```
> **Note:** First model initialization might take a long time. With the compilation cache, subsequent model loads will be fast.
:::
:::{tab-item} Junrui2021/Qwen3-VL-8B-Instruct-int4
:sync: Junrui2021/Qwen3-VL-8B-Instruct-int4
```bat
mkdir c:\models
ovms --model_repository_path c:\models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --task text_generation --target_device GPU --pipeline_type VLM_CB --rest_port 8000 --cache_dir .ovcache --model_name Qwen3-VL-8B-Instruct
```
> **Note:** This is a Vision Language Model (VLM) that supports image inputs. For deployment, 7GB+ of VRAM on the GPU is recommended.
:::
::::

### Linux: via Docker
@@ -71,17 +71,8 @@ docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):
--model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int4-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB of disk space, and 19GB+ of VRAM on the GPU is recommended.
:::

:::{tab-item} OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
:sync: OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov
```bash
mkdir -p models
docker run -d -p 8000:8000 --rm -e MOE_USE_MICRO_GEMM_PREFILL=0 --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
openvino/model_server:weekly \
--model_repository_path /models --source_model OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov --task text_generation --target_device GPU --tool_parser qwen3coder --rest_port 8000 --model_name Qwen3-Coder-30B-A3B-Instruct
```
> **Note:** For deployment, the model requires ~16GB disk space and recommended 34GB+ of VRAM on the GPU.
> **Note:** An int8 variant is also available: `OpenVINO/Qwen3-Coder-30B-A3B-Instruct-int8-ov`. It offers higher accuracy but requires 34GB+ of VRAM on the GPU.
:::

:::{tab-item} OpenVINO/gpt-oss-20B-int4-ov
@@ -115,6 +106,16 @@ docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/model
```
> **Note:** First model initialization might take a long time. With the compilation cache, subsequent model loads will be fast.
:::
:::{tab-item} Junrui2021/Qwen3-VL-8B-Instruct-int4
:sync: Junrui2021/Qwen3-VL-8B-Instruct-int4
```bash
mkdir -p models
docker run -d -p 8000:8000 --rm --user $(id -u):$(id -g) -v $(pwd)/models:/models/:rw --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) \
openvino/model_server:weekly \
--model_repository_path /models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --task text_generation --target_device GPU --pipeline_type VLM_CB --rest_port 8000 --model_name Qwen3-VL-8B-Instruct
```
> **Note:** This is a Vision Language Model (VLM) that supports image inputs. For deployment, 7GB+ of VRAM on the GPU is recommended.
:::
::::


@@ -300,8 +301,40 @@ context:
- provider: folder
- provider: codebase
```
:::
:::{tab-item} Qwen3-VL-8B-Instruct
:sync: Qwen3-VL-8B-Instruct
```
name: Local Assistant
version: 1.0.0
schema: v1
models:
- name: OVMS Qwen3-VL-8B-Instruct
provider: openai
model: Qwen3-VL-8B-Instruct
apiKey: unused
apiBase: http://localhost:8000/v3
roles:
- chat
- edit
- apply
capabilities:
- tool_use
- image_input
context:
- provider: code
- provider: docs
- provider: diff
- provider: terminal
- provider: problems
- provider: folder
- provider: codebase
```
:::
::::

> **Note:** For Vision Language Models (VLM) like Qwen3-VL-8B-Instruct, add `image_input` to the `capabilities` list in the Continue config. This enables the image modality, allowing you to send images in chat messages for the model to analyze.
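With `image_input` enabled, the client sends images to the OpenAI-compatible endpoint as `image_url` content parts alongside text. As a minimal sketch of what such a chat-completions request body looks like — the helper names and the sample bytes here are illustrative, not part of the demo:

```python
import base64

def image_part_from_bytes(data: bytes, mime: str = "image/png") -> dict:
    """Wrap raw image bytes as an OpenAI-style image_url content part (data URI)."""
    b64 = base64.b64encode(data).decode("utf-8")
    return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}

def build_image_chat_payload(model: str, text: str, image_bytes: bytes) -> dict:
    """Build a chat/completions request body with one text part and one image part."""
    return {
        "model": model,
        "messages": [{
            "role": "user",
            "content": [
                {"type": "text", "text": text},
                image_part_from_bytes(image_bytes),
            ],
        }],
    }

# Illustrative call with placeholder bytes standing in for a real PNG file.
payload = build_image_chat_payload("Qwen3-VL-8B-Instruct", "Describe this image.", b"\x89PNG...")
```

The resulting `payload` can be POSTed to the `/v3/chat/completions` endpoint of the server deployed above.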

> **Note:** For more information about this config, see [configuration reference](https://docs.continue.dev/reference#models).

## Chatting, code editing and autocompletion in action
@@ -339,6 +372,10 @@ Example use cases for tools:

![glob](./glob.png)

* Image input

![vision](./image_input.png)

* Extending VRAM allocation to iGPU to enable loading bigger models

![xram](./vram.png)
Binary file added demos/code_local_assistant/image_input.png
47 changes: 28 additions & 19 deletions demos/continuous_batching/agentic_ai/README.md
@@ -39,29 +39,32 @@ pip install openai-agents openai

## Start OVMS

This deployment procedure assumes the model was pulled or exported using the procedure above. The exceptions are models from the OpenVINO organization that support tools correctly with the default template, like "OpenVINO/Qwen3-8B-int4-ov": they can be deployed in a single command that pulls the model and starts the server.
This deployment procedure assumes the model was pulled or exported using the procedure above. The exceptions are models from the OpenVINO organization that support tools correctly with the default template, like "OpenVINO/Qwen3-4B-int4-ov": they can be deployed in a single command that pulls the model and starts the server.


### Deploying on Windows with GPU
Assuming you have unpacked model server package with python enabled version, make sure to run `setupvars` script
as mentioned in [deployment guide](../../../docs/deploying_server_baremetal.md), in every new shell that will start OpenVINO Model Server.

::::{tab-set}
:::{tab-item} Qwen3-8B
:sync: Qwen3-8B
:::{tab-item} Qwen3-VL-8B
:sync: Qwen3-VL-8B
Pull and start OVMS:
```bat
ovms.exe --rest_port 8000 --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --cache_dir .cache
ovms.exe --rest_port 8000 --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --model_repository_path c:\models --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --cache_dir .cache
```

Use MCP server:
Use the MCP server, with an additional image input:

![poland](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg)

```bat
python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
python openai_agent.py --query "What is the current weather in the location depicted in the image?" --image https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg --model Junrui2021/Qwen3-VL-8B-Instruct-int4 --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
```

Example output:
```text
The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is at -2.9°C. Wind is blowing from the NE at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover. Visibility is 24.1 km.
The current weather in Gdansk, the location depicted in the image, is partly cloudy with a temperature of 5.7°C (feels like 2.7°C). The relative humidity is 83%, and the wind is blowing from the NNE at 9.8 km/h with gusts up to 23.4 km/h. The atmospheric pressure is 1022.3 hPa, with 72% cloud cover and a moderate UV index of 3.6. Visibility is 20.6 km.
```
:::
:::{tab-item} Qwen3-4B
@@ -192,23 +195,26 @@ The current weather in Tokyo is overcast with a temperature of 9.4°C (feels lik
### Deploying in a docker container on CPU

::::{tab-set}
:::{tab-item} Qwen3-8B
:sync: Qwen3-8B
:::{tab-item} Qwen3-VL-8B
:sync: Qwen3-VL-8B
Pull and start OVMS:
```bash
mkdir -p ${HOME}/models
docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models openvino/model_server:weekly \
--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --task text_generation
--rest_port 8000 --model_repository_path /models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --tool_parser hermes3 --task text_generation --pipeline_type VLM_CB
```

Use MCP server:
Use the MCP server, with an additional image input:

![poland](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg)

```bash
python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
python openai_agent.py --query "What is the current weather in the location depicted in the image?" --image https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg --model Junrui2021/Qwen3-VL-8B-Instruct-int4 --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
```

Example output:
```text
The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa, with 84% cloud cover and visibility of 24.1 km.
The current weather in Gdansk, the location depicted in the image, is partly cloudy with a temperature of 5.7°C (feels like 2.7°C). The relative humidity is 83%, and the wind is blowing from the NNE at 9.8 km/h with gusts up to 23.4 km/h. The atmospheric pressure is 1022.3 hPa, with 72% cloud cover and a moderate UV index of 3.6. Visibility is 20.6 km.
```
:::
:::{tab-item} Qwen3-4B
@@ -309,23 +315,26 @@ to `docker run` command, use the image with GPU support. Export the models with
It can be applied using the commands below:

::::{tab-set}
:::{tab-item} Qwen3-8B
:sync: Qwen3-8B
:::{tab-item} Qwen3-VL-8B
:sync: Qwen3-VL-8B
Pull and start OVMS:
```bash
mkdir -p ${HOME}/models
docker run -d --user $(id -u):$(id -g) --rm -p 8000:8000 -v ${HOME}/models:/models --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) openvino/model_server:weekly \
--rest_port 8000 --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --tool_parser hermes3 --target_device GPU --task text_generation
--rest_port 8000 --model_repository_path /models --source_model Junrui2021/Qwen3-VL-8B-Instruct-int4 --tool_parser hermes3 --target_device GPU --task text_generation --pipeline_type VLM_CB --allowed_media_domains images.pexels.com
```

Use MCP server:
Use the MCP server, with an additional image input:

![poland](https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg)

```bash
python openai_agent.py --query "What is the current weather in Tokyo?" --model OpenVINO/Qwen3-8B-int4-ov --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
python openai_agent.py --query "What is the current weather in the location depicted in the image?" --image https://images.pexels.com/photos/20015887/pexels-photo-20015887.jpeg --model Junrui2021/Qwen3-VL-8B-Instruct-int4 --base-url http://localhost:8000/v3 --mcp-server-url http://localhost:8080/sse --mcp-server weather
```

Example output:
```text
The current weather in Tokyo is overcast with a temperature of 9.4°C (feels like 6.4°C). The relative humidity is at 42%, and the dew point is -2.9°C. Wind is blowing from the northeast at 3.6 km/h with gusts up to 24.8 km/h. The atmospheric pressure is 1018.9 hPa with 84% cloud cover, and visibility is 24.1 km.
The current weather in Gdansk, the location depicted in the image, is partly cloudy with a temperature of 5.7°C (feels like 2.7°C). The relative humidity is 83%, and the wind is blowing from the NNE at 9.8 km/h with gusts up to 23.4 km/h. The atmospheric pressure is 1022.3 hPa, with 72% cloud cover and a moderate UV index of 3.6. Visibility is 20.6 km.
```
:::
:::{tab-item} Qwen3-4B
40 changes: 38 additions & 2 deletions demos/continuous_batching/agentic_ai/openai_agent.py
@@ -17,6 +17,8 @@
from __future__ import annotations

import asyncio
import base64
import mimetypes
import os
import platform
import sys
@@ -49,6 +51,33 @@

RunConfig.tracing_disabled = False # Enable tracing for this example


def _image_url_from_path(path: str) -> str:
"""Return a data-URI for a local file or pass through an HTTP(S) URL."""
if path.startswith(("http://", "https://")):
return path
mime_type = mimetypes.guess_type(path)[0] or "image/png"
with open(path, "rb") as f:
data = base64.b64encode(f.read()).decode("utf-8")
return f"data:{mime_type};base64,{data}"


def build_multimodal_input(query: str, image_paths: list[str]) -> list[dict]:
"""Build a Responses-API-style multimodal user message with text and images.

The OpenAI Agents SDK expects content parts typed as ``input_text`` /
``input_image`` (Responses API format), *not* the Chat Completions
``text`` / ``image_url`` format.
"""
content: list[dict] = [{"type": "input_text", "text": query}]
for img in image_paths:
content.append({
"type": "input_image",
"image_url": _image_url_from_path(img),
})
return [{"role": "user", "content": content}]
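As a quick illustration of the format described in the docstring above, the following sketch restates the two helpers so it runs standalone and shows the message produced for an HTTP URL, which passes through unencoded (`https://example.com/photo.jpg` is a made-up URL):

```python
import base64
import mimetypes

# Restated from the diff above so this sketch is self-contained.
def _image_url_from_path(path: str) -> str:
    """Return a data-URI for a local file or pass through an HTTP(S) URL."""
    if path.startswith(("http://", "https://")):
        return path
    mime_type = mimetypes.guess_type(path)[0] or "image/png"
    with open(path, "rb") as f:
        data = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{data}"

def build_multimodal_input(query: str, image_paths: list[str]) -> list[dict]:
    """Build a Responses-API-style user message mixing input_text and input_image parts."""
    content: list[dict] = [{"type": "input_text", "text": query}]
    for img in image_paths:
        content.append({"type": "input_image", "image_url": _image_url_from_path(img)})
    return [{"role": "user", "content": content}]

# A single user message; the HTTP URL is passed through, not base64-encoded.
msgs = build_multimodal_input("What is in this photo?", ["https://example.com/photo.jpg"])
```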


def check_if_tool_calls_present(result) -> bool:
if hasattr(result, 'new_items') and result.new_items:
for item in result.new_items:
@@ -99,6 +128,8 @@ async def run(query, agent, OVMS_MODEL_PROVIDER, stream: bool = False):
parser.add_argument("--mcp-server", type=str, choices=["all", "weather", "fs"], default="all", help="Which MCP server(s) to use: all, weather, or fs")
parser.add_argument("--tool-choice", type=str, default="auto", choices=["auto", "required"], help="Tool choice for the agent")
parser.add_argument("--enable-thinking", action="store_true", help="Enable agent thinking (default: False)")
parser.add_argument("--image", type=str, nargs="+", default=[], metavar="PATH_OR_URL",
help="One or more image file paths or URLs to include with the prompt")
args = parser.parse_args()
mcp_servers = []
if args.mcp_server in ["all", "weather"]:
@@ -137,8 +168,13 @@ def get_model(self, _) -> Model:
model_settings=ModelSettings(tool_choice=args.tool_choice, temperature=0.0, max_tokens=1000, extra_body={"chat_template_kwargs": {"enable_thinking": args.enable_thinking}}),
)
loop = asyncio.new_event_loop()

is_tool_call_present = loop.run_until_complete(run(args.query, agent, OVMS_MODEL_PROVIDER, args.stream))

if args.image:
agent_input = build_multimodal_input(args.query, args.image)
else:
agent_input = args.query

is_tool_call_present = loop.run_until_complete(run(agent_input, agent, OVMS_MODEL_PROVIDER, args.stream))

# for testing purposes, exit codes are dependent on whether a tool call was present in the agent's reasoning process
if is_tool_call_present:
2 changes: 1 addition & 1 deletion demos/integration_with_OpenWebUI/README.md
@@ -57,7 +57,7 @@ curl http://localhost:8000/v3/chat/completions -H "Content-Type: application/jso
Install Open WebUI:

```bash
pip install --no-cache-dir open-webui
pip install --no-cache-dir open-webui --extra-index-url "https://download.pytorch.org/whl/cpu"
```

Running Open WebUI: