Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
target
.git
dist
*.md
.env
.env.*
.claude
.github
77 changes: 77 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# syntax=docker/dockerfile:1.7-labs
#
# Multi-stage build for the Beyond AI gateway (`beyond-ai`).
#
# Built from this crate's root (it's a single standalone crate — no workspace,
# all deps come from crates.io):
#
# docker build -t beyond-ai .
#
# We use cargo-chef to cache the (heavy: pingora + tokio) dependency build in a
# layer that's keyed only on the dependency graph, so source-only changes skip
# the slow dep compile.

# Latest stable 1.x. The crate's MSRV is 1.85, but cargo-chef's own build pulls
# deps that need a newer rustc, so the image toolchain must lead the MSRV.
ARG RUST_VERSION=1

# ---------------------------------------------------------------------------
# Stage 1: chef — base with cargo-chef installed.
# ---------------------------------------------------------------------------
FROM rust:${RUST_VERSION}-bookworm AS chef
# cmake builds the bundled zlib-ng (libz-ng-sys, pulled in by pingora); the base
# rust image ships a C/C++ toolchain but not cmake.
RUN apt-get update && apt-get install -y --no-install-recommends cmake \
&& rm -rf /var/lib/apt/lists/*
RUN cargo install cargo-chef --locked --version ^0.1
WORKDIR /app

# ---------------------------------------------------------------------------
# Stage 2: planner — compute a dependency-only recipe.
# ---------------------------------------------------------------------------
FROM chef AS planner
COPY . .
RUN cargo chef prepare --recipe-path recipe.json

# ---------------------------------------------------------------------------
# Stage 3: builder — cook dependencies from the recipe, then build the binary.
# ---------------------------------------------------------------------------
FROM chef AS builder
COPY --from=planner /app/recipe.json recipe.json
# Cook only dependencies — cached until the recipe (the dependency graph)
# changes. Cache mounts keep the cargo registry + git db warm across builds.
# rustls uses ring, which builds C/asm with the toolchain already in the image;
# no extra apt packages are needed (pure-rustls, no OpenSSL/protobuf).
RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/usr/local/cargo/git \
cargo chef cook --release --recipe-path recipe.json

# Now copy the full source and build just the gateway binary.
COPY . .
RUN --mount=type=cache,target=/usr/local/cargo/registry \
--mount=type=cache,target=/usr/local/cargo/git \
--mount=type=cache,target=/app/target \
cargo build --release --bin beyond-ai \
&& cp /app/target/release/beyond-ai /usr/local/bin/beyond-ai

# ---------------------------------------------------------------------------
# Stage 4: runtime — minimal Debian slim with just the binary and the CA
# certificates needed to verify outbound TLS to the LLM providers.
# ---------------------------------------------------------------------------
FROM debian:bookworm-slim AS runtime
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*

# Run as a non-root user.
RUN useradd --system --uid 10001 --no-create-home --shell /usr/sbin/nologin beyond
USER beyond

COPY --from=builder /usr/local/bin/beyond-ai /usr/local/bin/beyond-ai

# 8080: data-plane listener (client requests). 9090: admin — /metrics
# (Prometheus), /livez, /readyz. Both default to 0.0.0.0; override via the
# mounted config or the AI_LISTEN / AI_METRICS_LISTEN env vars.
EXPOSE 8080/tcp 9090/tcp

ENTRYPOINT ["/usr/local/bin/beyond-ai"]
4 changes: 4 additions & 0 deletions src/deny.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,5 +151,9 @@ mod tests {
fn spend_is_402_fraud_is_403() {
assert_eq!(DenyReason::Spend.http_status(), 402);
assert_eq!(DenyReason::Fraud.http_status(), 403);
// Unknown is fail-safe: an unrecognized reason still denies, and maps to 403 like fraud
// (not 402) — so a control-plane reason we don't yet parse can't be mistaken for a billing
// block or, worse, leak through as an allow.
assert_eq!(DenyReason::Unknown.http_status(), 403);
}
}
29 changes: 29 additions & 0 deletions src/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -940,4 +940,33 @@ mod tests {
let ok = "a".repeat(MAX_MODEL_LEN);
assert_eq!(sanitize_model(ok.clone()), ok);
}

#[test]
fn dialect_for_path_selects_anthropic_only_for_messages() {
// The dialect drives usage parsing *and* stream injection: misclassifying an Anthropic
// `/v1/messages` request as OpenAI mis-meters its tokens. The rule is a `/v1/messages`
// prefix ⇒ Anthropic; everything else (chat completions, embeddings, the bare root) is
// OpenAI-dialect. This locks that mapping so a refactor can't silently flip it.
assert_eq!(dialect_for_path("/v1/messages"), Dialect::Anthropic);
assert_eq!(dialect_for_path("/v1/messages/batches"), Dialect::Anthropic);
assert_eq!(dialect_for_path("/v1/chat/completions"), Dialect::OpenAI);
assert_eq!(dialect_for_path("/v1/embeddings"), Dialect::OpenAI);
assert_eq!(dialect_for_path("/"), Dialect::OpenAI);
}

#[test]
fn is_streamable_path_matches_generation_suffixes_across_prefixes() {
// Only chat-completions / responses get buffered for `stream_options.include_usage`
// injection. The check is by *suffix* so it holds whatever mount prefix the provider uses;
// a mismatch here either skips injection on a streamable path (lost usage) or needlessly
// buffers a non-streaming one.
assert!(is_streamable_path("/v1/chat/completions"));
assert!(is_streamable_path("/openai/v1/chat/completions"));
assert!(is_streamable_path("/inference/v1/chat/completions"));
assert!(is_streamable_path("/v1/responses"));
// Non-streaming endpoints must not be buffered.
assert!(!is_streamable_path("/v1/embeddings"));
assert!(!is_streamable_path("/v1/messages"));
assert!(!is_streamable_path("/v1/models"));
}
}
41 changes: 39 additions & 2 deletions src/usage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,44 @@ mod tests {

#[test]
fn no_usage_returns_none() {
assert!(openai_stream(b"data: {\"choices\":[]}\n\n").is_none());
assert!(anthropic_body(b"{}").map(|u| u.input_tokens).unwrap_or(0) == 0);
// Absent `usage` and unparseable bodies must both meter as `None` — never a silent zero-token
// row that bills nothing while *looking* like a successful meter. A provider dropping `usage`
// (an error 200, a wire-version change) or returning non-JSON must surface as "no fact", which
// the proxy logs/alerts on, rather than a phantom 0-token success.

// --- non-streaming bodies ---
assert!(
openai_body(br#"{"choices":[{"message":{"content":"hi"}}]}"#).is_none(),
"openai body without a `usage` block has nothing to meter"
);
assert!(
openai_body(b"not json at all").is_none(),
"malformed openai body must not panic or meter zeros"
);
assert!(
anthropic_body(br#"{"content":[{"type":"text","text":"hi"}]}"#).is_none(),
"anthropic body without a `usage` block has nothing to meter"
);
assert!(
anthropic_body(b"{ broken").is_none(),
"malformed anthropic body must not panic or meter zeros"
);

// --- streaming: well-formed SSE that simply never carries a usage event ---
assert!(
openai_stream(
b"data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\ndata: [DONE]\n\n"
)
.is_none(),
"an openai stream with content but no usage chunk meters nothing"
);
assert!(
anthropic_stream(
b"event: content_block_delta\n\
data: {\"type\":\"content_block_delta\",\"delta\":{\"text\":\"hi\"}}\n\n"
)
.is_none(),
"an anthropic stream with no usage-bearing event meters nothing"
);
}
}
58 changes: 58 additions & 0 deletions tests/e2e.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,64 @@ async fn per_credential_rate_limit_returns_429() {
wait_for_metric(&gw, "ai_rejections_total", "rate_limit", 1.0).await;
}

#[tokio::test]
async fn byo_global_rate_limit_caps_distinct_tokens() {
// The global BYO aggregate cap is the primary egress-IP protection: a flood of *distinct* junk
// BYO tokens (each its own per-credential bucket, so the per-credential tier never trips) would
// otherwise open junk-auth connections to providers from our egress IPs and get them banned.
// This proves the shared bucket bounds that aggregate. Per-credential is disabled so the only
// ceiling in play is the global BYO one — and a regression that inverts the `!managed` guard or
// skips the tier would slip past every other test.
let nats = Nats::start().await;
let (pubkey, _sk) = test_keypair(43);
let mock = MockUpstream::start(Mode::Json).await;
let gw = Gateway::builder(nats.port, &mock.authority(), &b64(&pubkey))
.rate_limit_rps(0) // isolate: per-credential tier off
.byo_rate_limit_rps(3) // global BYO ceiling
.start()
.await;
let client = reqwest::Client::new();

// Warm with a *managed* path is pointless here (managed is exempt); warm with a distinct BYO
// token whose single hit can't itself exhaust a 3-rps bucket mid-readiness.
{
let (c, u) = (client.clone(), gw.url());
wait_for_status(200, move || {
let (c, u) = (c.clone(), u.clone());
async move { post_status(&c, &u, "sk-byo-warmup", body_for("gpt-4o")).await }
})
.await;
}

// Each request uses a *different* BYO token, so per-credential keying would never trip — only the
// shared BYO bucket can. The first few are served; once the aggregate ceiling is crossed the rest
// are throttled.
let mut saw_200 = false;
let mut saw_429 = false;
for i in 0..50 {
match post_status(
&client,
&gw.url(),
&format!("sk-byo-distinct-{i}"),
body_for("gpt-4o"),
)
.await
{
200 => saw_200 = true,
429 => saw_429 = true,
other => panic!("unexpected status under BYO global rate limit: {other}"),
}
}
assert!(saw_200, "requests under the BYO ceiling must be served");
assert!(
saw_429,
"a flood of distinct BYO tokens past the global ceiling must yield 429"
);
// The dedicated BYO-global reason label — distinct from the per-credential `rate_limit` — must be
// what fired, so an operator can tell which knob tripped.
wait_for_metric(&gw, "ai_rejections_total", "rate_limit_byo_global", 1.0).await;
}

#[tokio::test]
async fn managed_key_via_x_api_key_header_is_accepted() {
// Anthropic SDKs present the key in `x-api-key`, not `Authorization: Bearer`. A managed virtual
Expand Down
Loading