beyondoss · jaredLunde · Jun 1, 2026 · Jun 1, 2026
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,8 @@
+target
+.git
+dist
+*.md
+.env
+.env.*
+.claude
+.github
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,77 @@
+# syntax=docker/dockerfile:1.7-labs
+#
+# Multi-stage build for the Beyond AI gateway (`beyond-ai`).
+#
+# Built from this crate's root (it's a single standalone crate — no workspace,
+# all deps come from crates.io):
+#
+#     docker build -t beyond-ai .
+#
+# We use cargo-chef to cache the (heavy: pingora + tokio) dependency build in a
+# layer that's keyed only on the dependency graph, so source-only changes skip
+# the slow dep compile.
+
+# Latest stable 1.x. The crate's MSRV is 1.85, but cargo-chef's own build pulls
+# deps that need a newer rustc, so the image toolchain must lead the MSRV.
+ARG RUST_VERSION=1
+
+# ---------------------------------------------------------------------------
+# Stage 1: chef — base with cargo-chef installed.
+# ---------------------------------------------------------------------------
+FROM rust:${RUST_VERSION}-bookworm AS chef
+# cmake builds the bundled zlib-ng (libz-ng-sys, pulled in by pingora); the base
+# rust image ships a C/C++ toolchain but not cmake.
+RUN apt-get update && apt-get install -y --no-install-recommends cmake \
+    && rm -rf /var/lib/apt/lists/*
+RUN cargo install cargo-chef --locked --version ^0.1
+WORKDIR /app
+
+# ---------------------------------------------------------------------------
+# Stage 2: planner — compute a dependency-only recipe.
+# ---------------------------------------------------------------------------
+FROM chef AS planner
+COPY . .
+RUN cargo chef prepare --recipe-path recipe.json
+
+# ---------------------------------------------------------------------------
+# Stage 3: builder — cook dependencies from the recipe, then build the binary.
+# ---------------------------------------------------------------------------
+FROM chef AS builder
+COPY --from=planner /app/recipe.json recipe.json
+# Cook only dependencies — cached until the recipe (the dependency graph)
+# changes. Cache mounts keep the cargo registry + git db warm across builds.
+# rustls uses ring, which builds C/asm with the toolchain already in the image;
+# no extra apt packages are needed (pure-rustls, no OpenSSL/protobuf).
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/usr/local/cargo/git \
+    cargo chef cook --release --recipe-path recipe.json
+
+# Now copy the full source and build just the gateway binary.
+COPY . .
+RUN --mount=type=cache,target=/usr/local/cargo/registry \
+    --mount=type=cache,target=/usr/local/cargo/git \
+    --mount=type=cache,target=/app/target \
+    cargo build --release --bin beyond-ai \
+    && cp /app/target/release/beyond-ai /usr/local/bin/beyond-ai
+
+# ---------------------------------------------------------------------------
+# Stage 4: runtime — minimal Debian slim with just the binary and the CA
+# certificates needed to verify outbound TLS to the LLM providers.
+# ---------------------------------------------------------------------------
+FROM debian:bookworm-slim AS runtime
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Run as a non-root user.
+RUN useradd --system --uid 10001 --no-create-home --shell /usr/sbin/nologin beyond
+USER beyond
+
+COPY --from=builder /usr/local/bin/beyond-ai /usr/local/bin/beyond-ai
+
+# 8080: data-plane listener (client requests). 9090: admin — /metrics
+# (Prometheus), /livez, /readyz. Both default to 0.0.0.0; override via the
+# mounted config or the AI_LISTEN / AI_METRICS_LISTEN env vars.
+EXPOSE 8080/tcp 9090/tcp
+
+ENTRYPOINT ["/usr/local/bin/beyond-ai"]
diff --git a/src/deny.rs b/src/deny.rs
@@ -151,5 +151,9 @@ mod tests {
     fn spend_is_402_fraud_is_403() {
         assert_eq!(DenyReason::Spend.http_status(), 402);
         assert_eq!(DenyReason::Fraud.http_status(), 403);
+        // Unknown is fail-safe: an unrecognized reason still denies, and maps to 403 like fraud
+        // (not 402) — so a control-plane reason we don't yet parse can't be mistaken for a billing
+        // block or, worse, leak through as an allow.
+        assert_eq!(DenyReason::Unknown.http_status(), 403);
     }
 }
diff --git a/src/proxy.rs b/src/proxy.rs
@@ -940,4 +940,33 @@ mod tests {
         let ok = "a".repeat(MAX_MODEL_LEN);
         assert_eq!(sanitize_model(ok.clone()), ok);
     }
+
+    #[test]
+    fn dialect_for_path_selects_anthropic_only_for_messages() {
+        // The dialect drives usage parsing *and* stream injection: misclassifying an Anthropic
+        // `/v1/messages` request as OpenAI mis-meters its tokens. The rule is a `/v1/messages`
+        // prefix ⇒ Anthropic; everything else (chat completions, embeddings, the bare root) is
+        // OpenAI-dialect. This locks that mapping so a refactor can't silently flip it.
+        assert_eq!(dialect_for_path("/v1/messages"), Dialect::Anthropic);
+        assert_eq!(dialect_for_path("/v1/messages/batches"), Dialect::Anthropic);
+        assert_eq!(dialect_for_path("/v1/chat/completions"), Dialect::OpenAI);
+        assert_eq!(dialect_for_path("/v1/embeddings"), Dialect::OpenAI);
+        assert_eq!(dialect_for_path("/"), Dialect::OpenAI);
+    }
+
+    #[test]
+    fn is_streamable_path_matches_generation_suffixes_across_prefixes() {
+        // Only chat-completions / responses get buffered for `stream_options.include_usage`
+        // injection. The check is by *suffix* so it holds whatever mount prefix the provider uses;
+        // a mismatch here either skips injection on a streamable path (lost usage) or needlessly
+        // buffers a non-streaming one.
+        assert!(is_streamable_path("/v1/chat/completions"));
+        assert!(is_streamable_path("/openai/v1/chat/completions"));
+        assert!(is_streamable_path("/inference/v1/chat/completions"));
+        assert!(is_streamable_path("/v1/responses"));
+        // Non-streaming endpoints must not be buffered.
+        assert!(!is_streamable_path("/v1/embeddings"));
+        assert!(!is_streamable_path("/v1/messages"));
+        assert!(!is_streamable_path("/v1/models"));
+    }
 }
diff --git a/src/usage.rs b/src/usage.rs
@@ -263,7 +263,44 @@ mod tests {
 
     #[test]
     fn no_usage_returns_none() {
-        assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none());
-        assert!(anthropic_body(b"{}").map(|u| u.input_tokens).unwrap_or(0) == 0);
+        // Absent `usage` and unparseable bodies must both meter as `None` — never a silent zero-token
+        // row that bills nothing while *looking* like a successful meter. A provider dropping `usage`
+        // (an error 200, a wire-version change) or returning non-JSON must surface as "no fact", which
+        // the proxy logs/alerts on, rather than a phantom 0-token success.
+
+        // --- non-streaming bodies ---
+        assert!(
+            openai_body(br#"{"choices":[{"message":{"content":"hi"}}]}"#).is_none(),
+            "openai body without a `usage` block has nothing to meter"
+        );
+        assert!(
+            openai_body(b"not json at all").is_none(),
+            "malformed openai body must not panic or meter zeros"
+        );
+        assert!(
+            anthropic_body(br#"{"content":[{"type":"text","text":"hi"}]}"#).is_none(),
+            "anthropic body without a `usage` block has nothing to meter"
+        );
+        assert!(
+            anthropic_body(b"{ broken").is_none(),
+            "malformed anthropic body must not panic or meter zeros"
+        );
+
+        // --- streaming: well-formed SSE that simply never carries a usage event ---
+        assert!(
+            openai_stream(
+                b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: [DONE]\n\n"
+            )
+            .is_none(),
+            "an openai stream with content but no usage chunk meters nothing"
+        );
+        assert!(
+            anthropic_stream(
+                b"event: content_block_delta\n\
+                  data: {\"type\":\"content_block_delta\",\"delta\":{\"text\":\"hi\"}}\n\n"
+            )
+            .is_none(),
+            "an anthropic stream with no usage-bearing event meters nothing"
+        );
     }
 }
diff --git a/tests/e2e.rs b/tests/e2e.rs
@@ -505,6 +505,64 @@ async fn per_credential_rate_limit_returns_429() {
     wait_for_metric(&gw, "ai_rejections_total", "rate_limit", 1.0).await;
 }
 
+#[tokio::test]
+async fn byo_global_rate_limit_caps_distinct_tokens() {
+    // The global BYO aggregate cap is the primary egress-IP protection: a flood of *distinct* junk
+    // BYO tokens (each its own per-credential bucket, so the per-credential tier never trips) would
+    // otherwise open junk-auth connections to providers from our egress IPs and get them banned.
+    // This proves the shared bucket bounds that aggregate. Per-credential is disabled so the only
+    // ceiling in play is the global BYO one — and a regression that inverts the `!managed` guard or
+    // skips the tier would slip past every other test.
+    let nats = Nats::start().await;
+    let (pubkey, _sk) = test_keypair(43);
+    let mock = MockUpstream::start(Mode::Json).await;
+    let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
+        .rate_limit_rps(0) // isolate: per-credential tier off
+        .byo_rate_limit_rps(3) // global BYO ceiling
+        .start()
+        .await;
+    let client = reqwest::Client::new();
+
+    // Warm with a *managed* path is pointless here (managed is exempt); warm with a distinct BYO
+    // token whose single hit can't itself exhaust a 3-rps bucket mid-readiness.
+    {
+        let (c, u) = (client.clone(), gw.url());
+        wait_for_status(200, move || {
+            let (c, u) = (c.clone(), u.clone());
+            async move { post_status(&c, &u, "sk-byo-warmup", body_for("gpt-4o")).await }
+        })
+        .await;
+    }
+
+    // Each request uses a *different* BYO token, so per-credential keying would never trip — only the
+    // shared BYO bucket can. The first few are served; once the aggregate ceiling is crossed the rest
+    // are throttled.
+    let mut saw_200 = false;
+    let mut saw_429 = false;
+    for i in 0..50 {
+        match post_status(
+            &client,
+            &gw.url(),
+            &format!("sk-byo-distinct-{i}"),
+            body_for("gpt-4o"),
+        )
+        .await
+        {
+            200 => saw_200 = true,
+            429 => saw_429 = true,
+            other => panic!("unexpected status under BYO global rate limit: {other}"),
+        }
+    }
+    assert!(saw_200, "requests under the BYO ceiling must be served");
+    assert!(
+        saw_429,
+        "a flood of distinct BYO tokens past the global ceiling must yield 429"
+    );
+    // The dedicated BYO-global reason label — distinct from the per-credential `rate_limit` — must be
+    // what fired, so an operator can tell which knob tripped.
+    wait_for_metric(&gw, "ai_rejections_total", "rate_limit_byo_global", 1.0).await;
+}
+
 #[tokio::test]
 async fn managed_key_via_x_api_key_header_is_accepted() {
     // Anthropic SDKs present the key in `x-api-key`, not `Authorization: Bearer`. A managed virtual
-Original file line number
+Diff line change
@@ -0,0 +1,8 @@
+    target
+    .git
+    dist
+    *.md
+    .env
+    .env.*
+    .claude
+    .github