braintrustdata · Andrew Kent (realark) · Apr 30, 2026
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/api/BraintrustOpenApiClient.java b/braintrust-sdk/src/main/java/dev/braintrust/api/BraintrustOpenApiClient.java
@@ -145,12 +145,18 @@ public BtqlQueryResponse btqlQuery(String query) {
                     getHttpClient()
                             .send(requestBuilder.build(), HttpResponse.BodyHandlers.ofString());
 
-            if (response.statusCode() / 100 != 2) {
-                throw new RuntimeException(
-                        "BTQL query failed with status "
-                                + response.statusCode()
-                                + ": "
-                                + response.body());
+            if (response.statusCode() == 429) {
+                throw new BtqlRateLimitException(
+                        response.statusCode(),
+                        "BTQL rate limit exceeded",
+                        response.headers(),
+                        response.body());
+            } else if (response.statusCode() / 100 != 2) {
+                throw new ApiException(
+                        response.statusCode(),
+                        "BTQL query failed",
+                        response.headers(),
+                        response.body());
             }
 
             return MAPPER.readValue(response.body(), BtqlQueryResponse.class);
@@ -163,7 +169,38 @@ public BtqlQueryResponse btqlQuery(String query) {
 
     public record OrgInfo(String id, String name) {}
 
-    public record BtqlQueryResponse(List<Map<String, Object>> data) {}
+    /**
+     * Response from a {@code POST /btql} query.
+     *
+     * <p>Freshness is determined by comparing {@link FreshnessState#lastProcessedXactId()} to
+     * {@link FreshnessState#lastConsideredXactId()}: when both are non-null and equal, the query
+     * has caught up to all ingested data and the result is fresh.
+     *
+     * <p>The {@link RealtimeState#type()} field indicates whether realtime indexing is still active
+     * ({@code "on"}) or has timed out ({@code "exhausted_timeout"}).
+     */
+    public record BtqlQueryResponse(
+            List<Map<String, Object>> data,
+            @JsonProperty("freshness_state") FreshnessState freshnessState,
+            @JsonProperty("realtime_state") RealtimeState realtimeState) {
+
+        /** Returns {@code true} when the query result has caught up to all ingested data. */
+        public boolean isFresh() {
+            if (freshnessState == null) {
+                return false;
+            }
+            var processed = freshnessState.lastProcessedXactId();
+            var considered = freshnessState.lastConsideredXactId();
+            return processed != null && processed.equals(considered);
+        }
+    }
+
+    public record FreshnessState(
+            @JsonProperty("last_processed_xact_id") String lastProcessedXactId,
+            @JsonProperty("last_considered_xact_id") String lastConsideredXactId) {}
+
+    /** Real-time indexing state for a BTQL query. */
+    public record RealtimeState(@JsonProperty("type") String type) {}
 
     private record LoginRequest(String token) {}
 

diff --git a/braintrust-sdk/src/main/java/dev/braintrust/api/BtqlRateLimitException.java b/braintrust-sdk/src/main/java/dev/braintrust/api/BtqlRateLimitException.java
@@ -0,0 +1,12 @@
+package dev.braintrust.api;
+
+import dev.braintrust.openapi.ApiException;
+import java.net.http.HttpHeaders;
+
+/** Thrown when the BTQL endpoint returns HTTP 429 (Too Many Requests). */
+public final class BtqlRateLimitException extends ApiException {
+    BtqlRateLimitException(
+            int code, String message, HttpHeaders responseHeaders, String responseBody) {
+        super(code, message, responseHeaders, responseBody);
+    }
+}
diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
@@ -9,6 +9,7 @@
 import dev.braintrust.openapi.api.ExperimentsApi;
 import dev.braintrust.openapi.model.CreateExperiment;
 import dev.braintrust.openapi.model.Project;
+import dev.braintrust.trace.BrainstoreTrace;
 import dev.braintrust.trace.BraintrustContext;
 import dev.braintrust.trace.BraintrustTracing;
 import io.opentelemetry.api.common.AttributeKey;
@@ -124,6 +125,7 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
         }
         try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
             final TaskResult<INPUT, OUTPUT> taskResult;
+            final String taskSpanId;
             { // run task
                 var taskSpan =
                         tracer.spanBuilder("task")
@@ -132,6 +134,7 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
                                         "braintrust.span_attributes",
                                         toJson(Map.of("type", "task")))
                                 .startSpan();
+                taskSpanId = taskSpan.getSpanContext().getSpanId();
                 try (var unused =
                         BraintrustContext.ofExperiment(experimentId, taskSpan).makeCurrent()) {
                     taskResult = task.apply(datasetCase, parameters);
@@ -155,32 +158,48 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
                 }
                 taskSpan.end();
             }
+
+            // Create a single BrainstoreTrace for this eval case, shared across all scorers.
+            // It fetches spans lazily on first access (only if a TracedScorer actually calls it).
+            // We wait specifically for the task span to appear, which guarantees its children
+            // (LLM spans, tool spans) have also been indexed — since children end before parents.
+            var rootTraceId = rootSpan.getSpanContext().getTraceId();
+            var trace =
+                    BrainstoreTrace.forExperiment(
+                            client, experimentId, rootTraceId, List.of(taskSpanId));
+
             // run scorers - one span per scorer
             for (var scorer : scorers) {
-                runScorer(experimentId, rootSpan, scorer, taskResult);
+                runScorer(experimentId, rootSpan, scorer, taskResult, trace);
             }
         } finally {
             rootSpan.end();
         }
     }
 
     /**
-     * Runs a scorer against a successful task result. If the scorer throws, falls back to {@link
-     * Scorer#scoreForScorerException}.
+     * Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it
+     * receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to
+     * {@link Scorer#scoreForScorerException}.
      */
     private void runScorer(
             String experimentId,
             Span rootSpan,
             Scorer<INPUT, OUTPUT> scorer,
-            TaskResult<INPUT, OUTPUT> taskResult) {
+            TaskResult<INPUT, OUTPUT> taskResult,
+            BrainstoreTrace trace) {
         var scoreSpan =
                 tracer.spanBuilder("score")
                         .setAttribute(PARENT, "experiment_id:" + experimentId)
                         .startSpan();
         try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) {
             List<Score> scores;
             try {
-                scores = scorer.score(taskResult);
+                if (scorer instanceof TracedScorer<INPUT, OUTPUT> tracedScorer) {
+                    scores = tracedScorer.score(taskResult, trace);
+                } else {
+                    scores = scorer.score(taskResult);
+                }
             } catch (Exception e) {
                 scoreSpan.setStatus(StatusCode.ERROR, e.getMessage());
                 scoreSpan.recordException(e);

diff --git a/braintrust-sdk/src/main/java/dev/braintrust/eval/TracedScorer.java b/braintrust-sdk/src/main/java/dev/braintrust/eval/TracedScorer.java
@@ -0,0 +1,41 @@
+package dev.braintrust.eval;
+
+import dev.braintrust.trace.BrainstoreTrace;
+import java.util.List;
+
+/**
+ * A scorer that receives access to the full distributed trace of the task that was evaluated.
+ *
+ * <p>Implement this interface when your scorer needs to examine intermediate LLM calls, tool
+ * invocations, or other spans produced during task execution — not just the final {@code
+ * TaskResult}.
+ *
+ * @param <INPUT> type of the input data
+ * @param <OUTPUT> type of the output data
+ */
+public interface TracedScorer<INPUT, OUTPUT> extends Scorer<INPUT, OUTPUT> {
+
+    /**
+     * Scores the task result using the distributed trace for additional context. Called instead of
+     * {@link Scorer#score(TaskResult)} when a {@link BrainstoreTrace} is available.
+     *
+     * @param taskResult the task output and originating dataset case
+     * @param trace lazy access to the distributed trace spans for this eval case
+     * @return one or more scores, each with a value between 0 and 1 inclusive
+     */
+    List<Score> score(TaskResult<INPUT, OUTPUT> taskResult, BrainstoreTrace trace);
+
+    /**
+     * {@inheritDoc}
+     *
+     * <p>When used inside an {@link Eval}, this overload is never called — {@link
+     * #score(TaskResult, BrainstoreTrace)} is dispatched instead. This default implementation
+     * throws {@link UnsupportedOperationException} to surface any accidental direct calls.
+     */
+    @Override
+    default List<Score> score(TaskResult<INPUT, OUTPUT> taskResult) {
+        throw new UnsupportedOperationException(
+                "traced scorer score method directly called. This is likely an accident. If you"
+                        + " wish to support this, your implementation must override this method.");
+    }
+}