Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions bridge.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ func newInterceptionProcessor(p provider.Provider, cbs *circuitbreaker.ProviderC
asyncRecorder.WithProvider(p.Name())
asyncRecorder.WithModel(interceptor.Model())
asyncRecorder.WithInitiatorID(actor.ID)
asyncRecorder.WithClient(string(client))
interceptor.Setup(logger, asyncRecorder, mcpProxy)

if err := rec.RecordInterception(ctx, &recorder.InterceptionRecord{
Expand Down Expand Up @@ -231,13 +232,13 @@ func newInterceptionProcessor(p provider.Provider, cbs *circuitbreaker.ProviderC
return interceptor.ProcessRequest(rw, r)
}); err != nil {
if m != nil {
m.InterceptionCount.WithLabelValues(p.Name(), interceptor.Model(), metrics.InterceptionCountStatusFailed, route, r.Method, actor.ID).Add(1)
m.InterceptionCount.WithLabelValues(p.Name(), interceptor.Model(), metrics.InterceptionCountStatusFailed, route, r.Method, actor.ID, string(client)).Add(1)
}
span.SetStatus(codes.Error, fmt.Sprintf("interception failed: %v", err))
log.Warn(ctx, "interception failed", slog.Error(err))
} else {
if m != nil {
m.InterceptionCount.WithLabelValues(p.Name(), interceptor.Model(), metrics.InterceptionCountStatusCompleted, route, r.Method, actor.ID).Add(1)
m.InterceptionCount.WithLabelValues(p.Name(), interceptor.Model(), metrics.InterceptionCountStatusCompleted, route, r.Method, actor.ID, string(client)).Add(1)
}
log.Debug(ctx, "interception ended")
}
Expand Down
71 changes: 67 additions & 4 deletions internal/integrationtest/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ func TestMetrics_Interception(t *testing.T) {
name string
fixture []byte
path string
headers http.Header
expectStatus string
expectModel string
expectRoute string
expectProvider string
expectClient aibridge.Client
allowOverflow bool // error fixtures may cause retries
}{
{
Expand All @@ -39,72 +41,98 @@ func TestMetrics_Interception(t *testing.T) {
expectModel: "claude-sonnet-4-0",
expectRoute: "/v1/messages",
expectProvider: config.ProviderAnthropic,
expectClient: aibridge.ClientUnknown,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: could be worth adding a couple more test cases with different user agents (e.g. Copilot, Cursor) to cover label propagation beyond just Claude Code and Unknown.

},
{
name: "ant_error",
fixture: fixtures.AntNonStreamError,
path: pathAnthropicMessages,
headers: http.Header{"User-Agent": []string{"kilo-code/1.2.3"}},
expectStatus: metrics.InterceptionCountStatusFailed,
expectModel: "claude-sonnet-4-0",
expectRoute: "/v1/messages",
expectProvider: config.ProviderAnthropic,
expectClient: aibridge.ClientKilo,
allowOverflow: true,
},
{
name: "ant_simple_claude_code",
fixture: fixtures.AntSimple,
path: pathAnthropicMessages,
headers: http.Header{"User-Agent": []string{"claude-code/1.0.0"}},
expectStatus: metrics.InterceptionCountStatusCompleted,
expectModel: "claude-sonnet-4-0",
expectRoute: "/v1/messages",
expectProvider: config.ProviderAnthropic,
expectClient: aibridge.ClientClaudeCode,
},
{
name: "oai_chat_simple",
fixture: fixtures.OaiChatSimple,
path: pathOpenAIChatCompletions,
headers: http.Header{"User-Agent": []string{"copilot/1.0.0"}},
expectStatus: metrics.InterceptionCountStatusCompleted,
expectModel: "gpt-4.1",
expectRoute: "/v1/chat/completions",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientCopilotCLI,
},
{
name: "oai_chat_error",
fixture: fixtures.OaiChatNonStreamError,
path: pathOpenAIChatCompletions,
headers: http.Header{"User-Agent": []string{"githubcopilotchat/0.30.0"}},
expectStatus: metrics.InterceptionCountStatusFailed,
expectModel: "gpt-4.1",
expectRoute: "/v1/chat/completions",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientCopilotVSC,
allowOverflow: true,
},
{
name: "oai_responses_blocking_simple",
fixture: fixtures.OaiResponsesBlockingSimple,
path: pathOpenAIResponses,
headers: http.Header{"X-Cursor-Client-Version": []string{"0.50.0"}},
expectStatus: metrics.InterceptionCountStatusCompleted,
expectModel: "gpt-4o-mini",
expectRoute: "/v1/responses",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientCursor,
},
{
name: "oai_responses_blocking_error",
fixture: fixtures.OaiResponsesBlockingHttpErr,
path: pathOpenAIResponses,
headers: http.Header{"User-Agent": []string{"codex/1.0.0"}},
expectStatus: metrics.InterceptionCountStatusFailed,
expectModel: "gpt-4o-mini",
expectRoute: "/v1/responses",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientCodex,
allowOverflow: true,
},
{
name: "oai_responses_streaming_simple",
fixture: fixtures.OaiResponsesStreamingSimple,
path: pathOpenAIResponses,
headers: http.Header{"User-Agent": []string{"zed/0.200.0"}},
expectStatus: metrics.InterceptionCountStatusCompleted,
expectModel: "gpt-4o-mini",
expectRoute: "/v1/responses",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientZed,
},
{
name: "oai_responses_streaming_error",
fixture: fixtures.OaiResponsesStreamingHttpErr,
path: pathOpenAIResponses,
headers: http.Header{"Originator": []string{"roo-code"}},
expectStatus: metrics.InterceptionCountStatusFailed,
expectModel: "gpt-4o-mini",
expectRoute: "/v1/responses",
expectProvider: config.ProviderOpenAI,
expectClient: aibridge.ClientRoo,
allowOverflow: true,
},
}
Expand All @@ -125,12 +153,12 @@ func TestMetrics_Interception(t *testing.T) {
withMetrics(m),
)

resp := bridgeServer.makeRequest(t, http.MethodPost, tc.path, fix.Request())
resp := bridgeServer.makeRequest(t, http.MethodPost, tc.path, fix.Request(), tc.headers)
_, err := io.ReadAll(resp.Body)
require.NoError(t, err)

count := promtest.ToFloat64(m.InterceptionCount.WithLabelValues(
tc.expectProvider, tc.expectModel, tc.expectStatus, tc.expectRoute, "POST", defaultActorID))
tc.expectProvider, tc.expectModel, tc.expectStatus, tc.expectRoute, "POST", defaultActorID, string(tc.expectClient)))
require.Equal(t, 1.0, count)
require.Equal(t, 1, promtest.CollectAndCount(m.InterceptionDuration))
require.Equal(t, 1, promtest.CollectAndCount(m.InterceptionCount))
Expand Down Expand Up @@ -229,16 +257,51 @@ func TestMetrics_PromptCount(t *testing.T) {
withMetrics(m),
)

resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIChatCompletions, fix.Request())
resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIChatCompletions, fix.Request(), http.Header{"User-Agent": []string{"claude-code/1.0.0"}})
require.Equal(t, http.StatusOK, resp.StatusCode)
_, err := io.ReadAll(resp.Body)
require.NoError(t, err)

prompts := promtest.ToFloat64(m.PromptCount.WithLabelValues(
config.ProviderOpenAI, "gpt-4.1", defaultActorID))
config.ProviderOpenAI, "gpt-4.1", defaultActorID, string(aibridge.ClientClaudeCode)))
require.Equal(t, 1.0, prompts)
}

// TestMetrics_TokenUseCount verifies that token-usage metrics carry the new
// client label and report the expected per-type counts (input, output, and
// the ExtraTokenTypes) for a blocking OpenAI Responses interception.
func TestMetrics_TokenUseCount(t *testing.T) {
	t.Parallel()

	ctx, cancel := context.WithTimeout(t.Context(), time.Second*30)
	t.Cleanup(cancel)

	fix := fixtures.Parse(t, fixtures.OaiResponsesBlockingCachedInputTokens)
	upstream := newMockUpstream(t, ctx, newFixtureResponse(fix))

	m := aibridge.NewMetrics(prometheus.NewRegistry())
	bridgeServer := newBridgeTestServer(t, ctx, upstream.URL,
		withMetrics(m),
	)

	// The User-Agent identifies the calling agent; it should surface as the
	// "client" label on the token metrics below.
	resp := bridgeServer.makeRequest(t, http.MethodPost, pathOpenAIResponses, fix.Request(),
		http.Header{"User-Agent": []string{"claude-code/1.0.0"}})
	require.Equal(t, http.StatusOK, resp.StatusCode)
	// Drain the body so the interception completes. Fail loudly on read
	// errors, matching the other tests in this file instead of discarding them.
	_, err := io.ReadAll(resp.Body)
	require.NoError(t, err)

	clientLabel := string(aibridge.ClientClaudeCode)
	// Token metrics are recorded asynchronously; wait for them to appear.
	require.Eventually(t, func() bool {
		return promtest.ToFloat64(m.TokenUseCount.WithLabelValues(
			config.ProviderOpenAI, "gpt-4.1", "input", defaultActorID, clientLabel)) > 0
	}, time.Second*10, time.Millisecond*50)

	require.Equal(t, 129.0, promtest.ToFloat64(m.TokenUseCount.WithLabelValues(config.ProviderOpenAI, "gpt-4.1", "input", defaultActorID, clientLabel))) // 12033 - 11904 (cached)
	require.Equal(t, 44.0, promtest.ToFloat64(m.TokenUseCount.WithLabelValues(config.ProviderOpenAI, "gpt-4.1", "output", defaultActorID, clientLabel)))

	// ExtraTokenTypes
	require.Equal(t, 11904.0, promtest.ToFloat64(m.TokenUseCount.WithLabelValues(config.ProviderOpenAI, "gpt-4.1", "input_cached", defaultActorID, clientLabel)))
	require.Equal(t, 0.0, promtest.ToFloat64(m.TokenUseCount.WithLabelValues(config.ProviderOpenAI, "gpt-4.1", "output_reasoning", defaultActorID, clientLabel)))
	require.Equal(t, 12077.0, promtest.ToFloat64(m.TokenUseCount.WithLabelValues(config.ProviderOpenAI, "gpt-4.1", "total_tokens", defaultActorID, clientLabel)))
}

func TestMetrics_NonInjectedToolUseCount(t *testing.T) {
t.Parallel()

Expand Down
28 changes: 14 additions & 14 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,20 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
return &Metrics{
// Interception-related metrics.

// Pessimistic cardinality: 2 providers, 5 models, 2 statuses, 2 routes, 3 methods = up to 120 PER INITIATOR.
// Pessimistic cardinality: 3 providers, 5 models, 2 statuses, 3 routes, 3 methods, 10 clients = up to 2700 PER INITIATOR.
InterceptionCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "interceptions",
Name: "total",
Help: "The count of intercepted requests.",
}, append(baseLabels, "status", "route", "method", "initiator_id")),
// Pessimistic cardinality: 2 providers, 5 models, 2 routes = up to 20.
}, append(baseLabels, "status", "route", "method", "initiator_id", "client")),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not directly related to the changes in this PR, but is using initiator_id as a label a good idea? cc @dannykopping
It's an unbounded value and the main factor of label cardinality (as the comments themselves mention). For 10k users, InterceptionCount alone could produce up to 18M time series. Maybe this is something that was already discussed, and we are still far from those numbers, but just pointing this out.

The Prometheus docs explicitly recommend against using user IDs as labels.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We decided that the benefit of operators being able to quickly identify which initiator(s) were overloading the system outweighed the possible cardinality problem. If cardinality becomes a problem they can just drop that label.

// Pessimistic cardinality: 3 providers, 5 models, 3 routes = up to 45.
// NOTE: route is not unbounded because this is only for intercepted routes.
InterceptionsInflight: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Subsystem: "interceptions",
Name: "inflight",
Help: "The number of intercepted requests which are being processed.",
}, append(baseLabels, "route")),
// Pessimistic cardinality: 2 providers, 5 models, 7 buckets + 3 extra series (count, sum, +Inf) = up to 100.
// Pessimistic cardinality: 3 providers, 5 models, 7 buckets + 3 extra series (count, sum, +Inf) = up to 150.
InterceptionDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
Subsystem: "interceptions",
Name: "duration_seconds",
Expand All @@ -67,7 +67,7 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
Buckets: []float64{0.5, 2, 5, 15, 30, 60, 120},
}, baseLabels),

// Pessimistic cardinality: 2 providers, 10 routes, 3 methods = up to 60.
// Pessimistic cardinality: 3 providers, 10 routes, 3 methods = up to 90.
// NOTE: route is not unbounded because PassthroughRoutes (see provider.go) is a static list.
PassthroughCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "passthrough",
Expand All @@ -77,31 +77,31 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {

// Prompt-related metrics.

// Pessimistic cardinality: 2 providers, 5 models = up to 10 PER INITIATOR.
// Pessimistic cardinality: 3 providers, 5 models, 10 clients = up to 150 PER INITIATOR.
PromptCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "prompts",
Name: "total",
Help: "The number of prompts issued by users (initiators).",
}, append(baseLabels, "initiator_id")),
}, append(baseLabels, "initiator_id", "client")),

// Token-related metrics.

// Pessimistic cardinality: 2 providers, 5 models, 10 types = up to 100 PER INITIATOR.
// Pessimistic cardinality: 3 providers, 5 models, 10 types, 10 clients = up to 1500 PER INITIATOR.
TokenUseCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "tokens",
Name: "total",
Help: "The number of tokens used by intercepted requests.",
}, append(baseLabels, "type", "initiator_id")),
}, append(baseLabels, "type", "initiator_id", "client")),

// Tool-related metrics.

// Pessimistic cardinality: 2 providers, 5 models, 3 servers, 30 tools = up to 900.
// Pessimistic cardinality: 3 providers, 5 models, 3 servers, 30 tools = up to 1350.
InjectedToolUseCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "injected_tool_invocations",
Name: "total",
Help: "The number of times an injected MCP tool was invoked by aibridge.",
}, append(baseLabels, "server", "name")),
// Pessimistic cardinality: 2 providers, 5 models, 30 tools = up to 300.
// Pessimistic cardinality: 3 providers, 5 models, 30 tools = up to 450.
NonInjectedToolUseCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "non_injected_tool_selections",
Name: "total",
Expand All @@ -110,19 +110,19 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {

// Circuit breaker metrics.

// Pessimistic cardinality: 2 providers, 2 endpoints, 5 models = up to 20.
// Pessimistic cardinality: 3 providers, 2 endpoints, 5 models = up to 30.
CircuitBreakerState: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
Subsystem: "circuit_breaker",
Name: "state",
Help: "Current state of the circuit breaker (0=closed, 0.5=half-open, 1=open).",
}, []string{"provider", "endpoint", "model"}),
// Pessimistic cardinality: 2 providers, 2 endpoints, 5 models = up to 20.
// Pessimistic cardinality: 3 providers, 2 endpoints, 5 models = up to 30.
CircuitBreakerTrips: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "circuit_breaker",
Name: "trips_total",
Help: "Total number of times the circuit breaker transitioned to open state.",
}, []string{"provider", "endpoint", "model"}),
// Pessimistic cardinality: 2 providers, 2 endpoints, 5 models = up to 20.
// Pessimistic cardinality: 3 providers, 2 endpoints, 5 models = up to 30.
CircuitBreakerRejects: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
Subsystem: "circuit_breaker",
Name: "rejects_total",
Expand Down
17 changes: 12 additions & 5 deletions recorder/recorder.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,10 @@ type AsyncRecorder struct {
timeout time.Duration
metrics *metrics.Metrics

provider, model, initiatorID string
provider string
model string
initiatorID string
client string

wg sync.WaitGroup
}
Expand All @@ -158,6 +161,10 @@ func (a *AsyncRecorder) WithInitiatorID(initiatorID string) {
a.initiatorID = initiatorID
}

// WithClient sets the client label attached to metrics recorded by this
// recorder (used as the "client" label value for prompt and token counters).
// NOTE(review): presumably the string is the detected coding-agent identifier
// (e.g. "claude-code") derived from request headers — confirm against callers.
func (a *AsyncRecorder) WithClient(client string) {
	a.client = client
}

// RecordInterception must NOT be called asynchronously.
// If an interception cannot be recorded, the whole request should fail.
func (a *AsyncRecorder) RecordInterception(ctx context.Context, req *InterceptionRecord) error {
Expand Down Expand Up @@ -193,7 +200,7 @@ func (a *AsyncRecorder) RecordPromptUsage(ctx context.Context, req *PromptUsageR
}

if a.metrics != nil && req.Prompt != "" { // TODO: will be irrelevant once https://github.com/coder/aibridge/issues/55 is fixed.
a.metrics.PromptCount.WithLabelValues(a.provider, a.model, a.initiatorID).Add(1)
a.metrics.PromptCount.WithLabelValues(a.provider, a.model, a.initiatorID, a.client).Add(1)
}
}()

Expand All @@ -213,10 +220,10 @@ func (a *AsyncRecorder) RecordTokenUsage(ctx context.Context, req *TokenUsageRec
}

if a.metrics != nil {
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, "input", a.initiatorID).Add(float64(req.Input))
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, "output", a.initiatorID).Add(float64(req.Output))
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, "input", a.initiatorID, a.client).Add(float64(req.Input))
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, "output", a.initiatorID, a.client).Add(float64(req.Output))
for k, v := range req.ExtraTokenTypes {
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, k, a.initiatorID).Add(float64(v))
a.metrics.TokenUseCount.WithLabelValues(a.provider, a.model, k, a.initiatorID, a.client).Add(float64(v))
}
}
}()
Expand Down