apache · akshayjadiyanv · May 26, 2026 · May 27, 2026 · May 27, 2026
diff --git a/sdks/python/apache_beam/examples/inference/vllm_text_completion.py b/sdks/python/apache_beam/examples/inference/vllm_text_completion.py
@@ -138,6 +138,20 @@ def parse_known_args(argv):
           'Passed to the vLLM OpenAI server as --gpu-memory-utilization '
           '(fraction of total GPU memory for KV cache). Lower this if the '
           'engine fails to start with CUDA out of memory.'))
+  parser.add_argument(
+      '--use_dynamo',
+      dest='use_dynamo',
+      action='store_true',
+      help=(
+          'Use embedded NVIDIA Dynamo as the vLLM engine. Requires '
+          'ai-dynamo[vllm] and the etcd binary in the runtime environment. '
+          'See VLLMCompletionsModelHandler for limitations of embedded mode.'))
+  parser.add_argument(
+      '--max_tokens',
+      dest='max_tokens',
+      type=int,
+      default=16,
+      help='Maximum number of tokens to generate for each example.')
   return parser.parse_known_args(argv)
 
 
@@ -178,22 +192,26 @@ def run(
       build_vllm_server_kwargs(known_args))
 
   model_handler = VLLMCompletionsModelHandler(
-      model_name=known_args.model, vllm_server_kwargs=effective_vllm_kwargs)
+      model_name=known_args.model,
+      vllm_server_kwargs=effective_vllm_kwargs,
+      use_dynamo=known_args.use_dynamo)
   input_examples = COMPLETION_EXAMPLES
 
   if known_args.chat:
     model_handler = VLLMChatModelHandler(
         model_name=known_args.model,
         chat_template_path=known_args.chat_template,
-        vllm_server_kwargs=dict(effective_vllm_kwargs))
+        vllm_server_kwargs=dict(effective_vllm_kwargs),
+        use_dynamo=known_args.use_dynamo)
     input_examples = CHAT_EXAMPLES
 
   pipeline = test_pipeline
   if not test_pipeline:
     pipeline = beam.Pipeline(options=pipeline_options)
 
   examples = pipeline | "Create examples" >> beam.Create(input_examples)
-  predictions = examples | "RunInference" >> RunInference(model_handler)
+  predictions = examples | "RunInference" >> RunInference(
+      model_handler, inference_args={'max_tokens': known_args.max_tokens})
   process_output = predictions | "Process Predictions" >> beam.ParDo(
       PostProcessor())
   _ = process_output | "WriteOutput" >> beam.io.WriteToText(

diff --git a/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile.old b/sdks/python/apache_beam/ml/inference/test_resources/vllm.dockerfile.old
@@ -34,14 +34,22 @@ RUN python3 --version
 RUN apt-get install -y curl
 RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12 && pip install --upgrade pip
 
-RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.58.1
-RUN pip install openai vllm
+RUN pip install --no-cache-dir -vvv apache-beam[gcp]==2.71.0
+RUN pip install --no-cache-dir openai vllm ai-dynamo[vllm]
 
 RUN apt install libcairo2-dev pkg-config python3-dev -y
 RUN pip install pycairo
 
+# etcd binary required by embedded NVIDIA Dynamo for runtime discovery.
+ENV ETCD_VERSION=v3.5.13
+RUN curl -L https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-amd64.tar.gz -o /tmp/etcd.tar.gz && \
+    tar xzf /tmp/etcd.tar.gz -C /tmp && \
+    mv /tmp/etcd-${ETCD_VERSION}-linux-amd64/etcd /usr/local/bin/etcd && \
+    chmod +x /usr/local/bin/etcd && \
+    rm -rf /tmp/etcd*
+
 # Copy the Apache Beam worker dependencies from the Beam Python 3.12 SDK image.
-COPY --from=apache/beam_python3.12_sdk:2.58.1 /opt/apache/beam /opt/apache/beam
+COPY --from=apache/beam_python3.12_sdk:2.71.0 /opt/apache/beam /opt/apache/beam
 
 # Set the entrypoint to Apache Beam SDK worker launcher.
 ENTRYPOINT [ "/opt/apache/beam/boot" ]