vllm-project · franciscojavierarceo · May 15, 2026 · maralbahari · May 19, 2026
@@ -6,6 +6,30 @@ description = "Stateful API logic for agentic applications using vLLM"
 license = "Apache-2.0"
 repository = "https://github.com/vllm-project/agentic-api"
 
+[dependencies]
+async-trait = "0.1"
+bytes = "1"
+clap = { version = "4", features = ["derive", "env"] }
+http = "1"
+praxis = { git = "https://github.com/praxis-proxy/praxis.git", rev = "2f7ea31" }
+praxis-proxy-core = { git = "https://github.com/praxis-proxy/praxis.git", rev = "2f7ea31" }
+praxis-proxy-filter = { git = "https://github.com/praxis-proxy/praxis.git", rev = "2f7ea31" }
+reqwest = { version = "0.12", features = ["stream"] }
+serde = { version = "1", features = ["derive"] }
+serde_json = "1"
+serde_yaml = "0.9"
+thiserror = "2"
+tokio = { version = "1", features = ["full"] }
+tracing = "0.1"
+
+[dev-dependencies]
+axum = "0.8"
+futures = "0.3"
+praxis-test-utils = { git = "https://github.com/praxis-proxy/praxis.git", rev = "2f7ea31" }
+reqwest = { version = "0.12", features = ["json"] }
+tokio = { version = "1", features = ["full", "test-util"] }
+tokio-stream = "0.1"
+
 [lints.rust]
 unsafe_code = "forbid"
 

@@ -1,22 +1,51 @@
 # agentic-api
 Stateful API logic for agentic applications using vLLM
 
-A Rust-first project that is migrating agentic gateway functionality from Python into
-native Rust components. The previous Python gateway implementation has been removed.
-Design and migration decisions are tracked in the ADRs under `docs/adr/`.
+A Rust-based gateway that adds stateful, agentic capabilities on top of
+[vLLM](https://github.com/vllm-project/vllm)'s high-throughput inference engine.
+It is built on [Praxis](https://github.com/praxis-proxy/praxis), a composable filter-based
+proxy framework, so each concern (state hydration, tool dispatch, agentic looping) is
+an independent filter wired together via YAML configuration.
+
+Design decisions are tracked in the ADRs under `docs/adr/`.
+
+## Architecture
+
+```
+Client -> [Agentic API (Praxis filters)] -> [vLLM Core]
+                      |
+              [State Store]
+           (Files, Vector Stores,
+           Search, Conversations)
+```
+
+Filters in the pipeline:
+
+| Filter | Role |
+|--------|------|
+| `state_hydration` | Hydrates conversation state via `previous_response_id` |
+| `agentic_loop` | Detects tool calls and re-enters the inference loop |
+| `tool_dispatch` | Executes tool calls (MCP, code interpreter, file search) |
+| `responses_proxy` | Routes requests to vLLM's `/v1/responses` endpoint |
 
 ## Repository layout
 
-- Rust source: `src/`
-- Rust package manifest: `Cargo.toml`
-- Documentation: `docs/`
+- `src/filters/` — Praxis filter implementations
+- `config/agentic-api.yaml` — Default filter pipeline configuration
+- `docs/` — Documentation and ADRs
 
 ## Build
 
 ```bash
 cargo build
 ```
 
+## Run
+
+```bash
+cargo run -- -c config/agentic-api.yaml
+```
+
 ## Test
 
 ```bash

@@ -0,0 +1,22 @@
+admin:
+  address: "127.0.0.1:9901"
+
+listeners:
+  - name: agentic-api
+    address: "0.0.0.0:9000"
+    filter_chains: [agentic]
+
+filter_chains:
+  - name: agentic
+    filters:
+      - filter: state_hydration
+        store_base_url: "http://localhost:8080"
+      - filter: agentic_loop
+        max_iterations: 10
+      - filter: tool_dispatch
+      - filter: responses_proxy
+        vllm_base_url: "http://localhost:8000"
+
+clusters:
+  - name: vllm
+    endpoints: ["127.0.0.1:8000"]
@@ -0,0 +1,76 @@
+# Architecture
+
+## Overview
+
+The vLLM Agentic API is a Rust-based gateway built on [Praxis](https://github.com/praxis-proxy/praxis), a composable filter-based proxy framework. Each gateway concern is an independent Praxis filter, composed into a pipeline via YAML configuration.
+
+```mermaid
+graph TD
+    Client -->|POST /v1/responses| Gateway[Agentic API Gateway]
+    Gateway --> SH[state_hydration filter]
+    SH --> AL[agentic_loop filter]
+    AL --> TD[tool_dispatch filter]
+    TD --> RP[responses_proxy filter]
+    RP -->|native proxy| vLLM[vLLM Core]
+    SH -.->|hydrate state| Store[State Store]
+    TD -.->|execute tools| Tools[MCP / Tool Runtimes]
+    AL -.->|loop on tool calls| AL
+```
+
+**Stateless path:** Requests without `previous_response_id` flow straight through to vLLM Core.
+
+**Stateful path:** The `state_hydration` filter loads conversation history, the request goes to vLLM, and if tool calls are detected, the `agentic_loop` and `tool_dispatch` filters handle execution and re-inference.
+
+## Filter Pipeline
+
+The gateway is a pipeline of [Praxis filters](https://github.com/praxis-proxy/praxis/blob/main/docs/filters.md) — each filter implements the `HttpFilter` trait with hooks for request and response processing.
+
+| Filter | Phase | Role |
+|--------|-------|------|
+| `state_hydration` | Request | Inspects `previous_response_id` and hydrates conversation history from the state store |
+| `agentic_loop` | Response | Detects `function_call` output items in model responses and re-enters the inference loop |
+| `tool_dispatch` | Response | Executes tool calls (MCP servers, code interpreter, file search) |
+| `responses_proxy` | Request | Sets the upstream to vLLM's `/v1/responses` endpoint and injects auth credentials |
+
+Filters are configured and ordered in `config/agentic-api.yaml`:
+
+```yaml
+filter_chains:
+  - name: agentic
+    filters:
+      - filter: state_hydration
+        store_base_url: "http://localhost:8080"
+      - filter: agentic_loop
+        max_iterations: 10
+      - filter: tool_dispatch
+      - filter: responses_proxy
+        vllm_base_url: "http://localhost:8000"
+```
+
+Adding, removing, or reordering filters requires no code changes — just edit the YAML.
+
+## Why Praxis
+
+- **Composable** — Each filter is self-contained with no knowledge of other filters in the pipeline
+- **YAML-configured** — The pipeline can be reconfigured without code changes
+- **Native streaming** — Praxis/Pingora handles SSE streaming natively, delivering tokens to clients in real time
+- **Hot reload** — Filter pipelines can be reloaded from YAML without restarting the server
+- **AI-optimized** — Built-in body inspection (`StreamBuffer` mode), model-to-header routing, and MCP classification
+
+## Key Components
+
+### vLLM Core (Stateless Inference)
+
+The upstream vLLM server implements a stateless version of the Responses API. It handles tokenization, chat templates, and inference. The gateway never duplicates this logic.
+
+### State Store
+
+Provides stateful building blocks: file storage, vector stores, search, and conversation history. The `state_hydration` filter calls into the store to load conversation context before inference.
+
+### Tool Runtimes
+
+Tool calls detected in model output are dispatched by the `tool_dispatch` filter. Supported runtime types include MCP servers, code interpreter, file search, and web search.
+
+## Streaming
+
+SSE streaming is handled natively by Praxis's underlying [Pingora](https://github.com/cloudflare/pingora) proxy engine. The `responses_proxy` filter sets the upstream target and returns `FilterAction::Continue`, letting Pingora forward the response stream directly to the client — no buffering, no reqwest intermediary.
@@ -17,12 +17,25 @@ hide:
 <a class="github-button" href="https://github.com/vllm-project/agentic-api/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
 </p>
 
-vLLM Agentic API provides the stateful APIs needed for real-world agentic applications — managing conversations, tool calls, and multi-turn interactions on top of [vLLM](https://github.com/vllm-project/vllm)'s high-throughput inference engine.
+vLLM Agentic API provides the stateful APIs needed for real-world agentic applications — managing conversations, tool calls, and multi-turn interactions on top of [vLLM](https://github.com/vllm-project/vllm)'s high-throughput inference engine. It is built on [Praxis](https://github.com/praxis-proxy/praxis), a composable filter-based proxy framework.
 
 !!! important
 
     This project is in early development. Follow along and contribute on [GitHub](https://github.com/vllm-project/agentic-api).
 
+## Architecture
+
+The gateway is a pipeline of composable [Praxis filters](https://github.com/praxis-proxy/praxis/blob/main/docs/filters.md) configured via YAML. Each concern — state hydration, tool dispatch, agentic looping, proxying — is an independent filter that can be configured, reordered, or extended without code changes.
+
+| Filter | Role |
+|--------|------|
+| `state_hydration` | Hydrates conversation state via `previous_response_id` |
+| `agentic_loop` | Detects tool calls in model output and re-enters the inference loop |
+| `tool_dispatch` | Executes tool calls (MCP, code interpreter, file search) |
+| `responses_proxy` | Routes requests to vLLM's `/v1/responses` endpoint |
+
+SSE streaming is handled natively by Praxis/Pingora, delivering tokens to clients in real time.
+
 ## Responses API
 
 Our first milestone is implementing the [Responses API](https://platform.openai.com/docs/api-reference/responses), bringing stateful, agentic capabilities to vLLM. We validate our implementation against the [Open Responses](https://www.openresponses.org/) compatibility test suite.

@@ -91,6 +91,7 @@ markdown_extensions:
 
 nav:
   - Home: index.md
+  - Architecture: architecture/index.md
   - API Reference: api/index.md
   - Community: community/index.md
 

@@ -0,0 +1,28 @@
+from openai import OpenAI
+
+client = OpenAI()
+
+# Create a 3-response chain
+resp1 = client.responses.create(model="gpt-4o", input="Remember: the secret word is banana", store=True)
+print(f"resp1: {resp1.id}")
+
+resp2 = client.responses.create(model="gpt-4o", input="Acknowledge the secret word", previous_response_id=resp1.id, store=True)
+print(f"resp2: {resp2.id}")
+
+resp3 = client.responses.create(model="gpt-4o", input="Say the secret word again", previous_response_id=resp2.id, store=True)
+print(f"resp3: {resp3.id} → {resp3.output_text}")
+
+# Delete the middle link
+client.responses.delete(resp2.id)
+print(f"Deleted resp2: {resp2.id}")
+
+# Try to continue from resp3 — does it still work?
+try:
+    resp4 = client.responses.create(model="gpt-4o", input="What was the secret word?", previous_response_id=resp3.id, store=True)
+    print(f"resp4: {resp4.id} → {resp4.output_text}")
+    print("Chain survived deletion → likely shadow conversation")
+except Exception as e:
+    print(f"Chain broke → likely walking the chain: {e}")
+
+# Also dump resp1 for any hidden fields
+print(f"\nFull resp1 dump keys: {list(resp1.model_dump().keys())}")
@@ -0,0 +1,37 @@
+from openai import OpenAI
+
+client = OpenAI()
+
+# Test 1: Delete middle link (resp2)
+print("=== Test 1: Delete middle link ===")
+resp1 = client.responses.create(model="gpt-4o", input="Remember: the secret word is banana", store=True)
+print(f"resp1: {resp1.id}")
+
+resp2 = client.responses.create(model="gpt-4o", input="Acknowledge the secret word", previous_response_id=resp1.id, store=True)
+print(f"resp2: {resp2.id}")
+
+resp3 = client.responses.create(model="gpt-4o", input="Say the secret word again", previous_response_id=resp2.id, store=True)
+print(f"resp3: {resp3.id} → {resp3.output_text}")
+
+client.responses.delete(resp2.id)
+print(f"Deleted resp2 (middle link)")
+
+resp4 = client.responses.create(model="gpt-4o", input="What was the secret word?", previous_response_id=resp3.id, store=True)
+print(f"resp4: {resp4.id} → {resp4.output_text}")
+
+# Test 2: Delete the source of truth (resp1 — the one with "banana")
+print("\n=== Test 2: Delete source of truth ===")
+r1 = client.responses.create(model="gpt-4o", input="Remember: the secret word is mango", store=True)
+print(f"r1: {r1.id}")
+
+r2 = client.responses.create(model="gpt-4o", input="Acknowledge the secret word", previous_response_id=r1.id, store=True)
+print(f"r2: {r2.id}")
+
+r3 = client.responses.create(model="gpt-4o", input="Say the secret word again", previous_response_id=r2.id, store=True)
+print(f"r3: {r3.id} → {r3.output_text}")
+
+client.responses.delete(r1.id)
+print(f"Deleted r1 (source of 'mango')")
+
+r4 = client.responses.create(model="gpt-4o", input="What was the secret word?", previous_response_id=r3.id, store=True)
+print(f"r4: {r4.id} → {r4.output_text}")
@@ -0,0 +1,49 @@
+use clap::Args;
+
+// Runtime settings parsed by clap through `#[derive(Args)]`.
+//
+// NOTE: plain `//` comments are used on purpose. clap's derive macros turn `///`
+// doc comments into `--help` text, so doc comments here would change CLI output.
+#[derive(Debug, Clone, Args)]
+pub struct RuntimeConfig {
+    // Not a CLI flag: `#[arg(skip)]` makes clap leave this at `String::default()`.
+    // Presumably the caller fills it in after parsing — TODO confirm.
+    #[arg(skip)]
+    pub llm_api_base: String,
+
+    // Optional API key, read from `--openai-api-key` or the `OPENAI_API_KEY`
+    // environment variable. `hide_env_values` keeps the value out of help output.
+    #[arg(long, env = "OPENAI_API_KEY", hide_env_values = true)]
+    pub openai_api_key: Option<String>,
+
+    // `--gateway-host`; defaults to "0.0.0.0".
+    #[arg(long, default_value = "0.0.0.0")]
+    pub gateway_host: String,
+
+    // `--gateway-port`; defaults to 9000.
+    #[arg(long, default_value_t = 9000)]
+    pub gateway_port: u16,
+
+    // `--vllm-ready-timeout-s`; defaults to 600.0. The `_s` suffix suggests seconds;
+    // presumably the longest time to wait for vLLM to become ready — TODO confirm.
+    #[arg(long, default_value_t = 600.0)]
+    pub vllm_ready_timeout_s: f64,
+
+    // `--vllm-ready-interval-s`; defaults to 2.0. Presumably the pause between
+    // readiness checks, in seconds — TODO confirm against the code that uses it.
+    #[arg(long, default_value_t = 2.0)]
+    pub vllm_ready_interval_s: f64,
+}
+
+/// Normalizes a base URL by dropping trailing slashes and one trailing `/v1` segment.
+///
+/// Steps, in order:
+/// - every trailing `/` is removed;
+/// - if the result ends with `/v1`, that suffix is removed once;
+/// - any `/` exposed by that removal is removed as well.
+///
+/// A `/v1` that is not at the end (for example `.../v1/responses`) is left alone,
+/// and only the last of several stacked `/v1` segments is stripped.
+///
+/// Returns the normalized URL as a new `String`; the input is not validated.
+#[must_use]
+pub fn normalize_base_url(url: &str) -> String {
+    let trimmed = url.trim_end_matches('/');
+    // Work on borrowed slices and allocate once at the end; `strip_suffix`
+    // removes at most one occurrence and avoids manual length arithmetic.
+    trimmed
+        .strip_suffix("/v1")
+        .map_or(trimmed, |rest| rest.trim_end_matches('/'))
+        .to_owned()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// A single trailing `/v1` is removed, with or without a trailing slash.
+    #[test]
+    fn strip_trailing_v1() {
+        assert_eq!(normalize_base_url("http://host:8000/v1"), "http://host:8000");
+        assert_eq!(normalize_base_url("http://host:8000/v1/"), "http://host:8000");
+    }
+
+    /// URLs without a trailing `/v1` only lose their trailing slash.
+    #[test]
+    fn no_v1_unchanged() {
+        assert_eq!(normalize_base_url("http://host:8000"), "http://host:8000");
+        assert_eq!(normalize_base_url("http://host:8000/"), "http://host:8000");
+    }
+
+    /// Repeated slashes before and after `/v1` are all removed.
+    #[test]
+    fn repeated_trailing_slashes_removed() {
+        assert_eq!(normalize_base_url("http://host:8000//v1//"), "http://host:8000");
+        assert_eq!(normalize_base_url("http://host:8000///"), "http://host:8000");
+    }
+
+    /// Only a `/v1` at the very end is stripped, and only once.
+    #[test]
+    fn only_last_trailing_v1_is_stripped() {
+        assert_eq!(
+            normalize_base_url("http://host:8000/v1/responses"),
+            "http://host:8000/v1/responses"
+        );
+        assert_eq!(normalize_base_url("http://host:8000/api/v1"), "http://host:8000/api");
+        assert_eq!(normalize_base_url("http://host:8000/v1/v1"), "http://host:8000/v1");
+    }
+
+    /// `v1` must be a whole path segment; the empty string stays empty.
+    #[test]
+    fn v1_must_be_whole_segment() {
+        assert_eq!(normalize_base_url("http://host:8000/xv1"), "http://host:8000/xv1");
+        assert_eq!(normalize_base_url(""), "");
+    }
+}
@@ -0,0 +1,22 @@
+use std::io;
+
+/// Error cases defined by this module.
+///
+/// `Display` text comes from the `#[error(...)]` attributes (thiserror); fields
+/// marked `#[source]` or `#[from]` are exposed through `std::error::Error::source`.
+#[derive(Debug, thiserror::Error)]
+pub enum Error {
+    /// Building the HTTP client failed; the underlying `reqwest::Error` is the source.
+    #[error("failed to build HTTP client")]
+    HttpClient(#[source] reqwest::Error),
+
+    /// vLLM was not ready in time. `url` is the address shown in the message and
+    /// `timeout_s` the limit in seconds, rendered without decimals (`{:.0}`).
+    #[error("vLLM not ready within {timeout_s:.0}s at {url}")]
+    VllmTimeout { url: String, timeout_s: f64 },
+
+    /// The vLLM subprocess exited before becoming ready. `status` is a textual
+    /// description of the exit status; its exact format is set by the caller.
+    #[error("vLLM subprocess exited before becoming ready: {status}")]
+    VllmProcessExited { status: String },
+
+    /// An I/O error, converted automatically via `From<io::Error>`. `transparent`
+    /// forwards both `Display` and `source` to the wrapped error.
+    #[error(transparent)]
+    Io(#[from] io::Error),
+
+    /// A header value was rejected; converted automatically from
+    /// `reqwest::header::InvalidHeaderValue`.
+    #[error("invalid header value")]
+    InvalidHeader(#[from] reqwest::header::InvalidHeaderValue),
+
+    /// A configuration problem described by a free-form message, displayed verbatim.
+    #[error("{0}")]
+    Config(String),
+}