Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,18 @@ public BtqlQueryResponse btqlQuery(String query) {
getHttpClient()
.send(requestBuilder.build(), HttpResponse.BodyHandlers.ofString());

if (response.statusCode() / 100 != 2) {
throw new RuntimeException(
"BTQL query failed with status "
+ response.statusCode()
+ ": "
+ response.body());
if (response.statusCode() == 429) {
throw new BtqlRateLimitException(
response.statusCode(),
"BTQL rate limit exceeded",
response.headers(),
response.body());
} else if (response.statusCode() / 100 != 2) {
throw new ApiException(
response.statusCode(),
"BTQL query failed",
response.headers(),
response.body());
}

return MAPPER.readValue(response.body(), BtqlQueryResponse.class);
Expand All @@ -163,7 +169,38 @@ public BtqlQueryResponse btqlQuery(String query) {

public record OrgInfo(String id, String name) {}

public record BtqlQueryResponse(List<Map<String, Object>> data) {}
/**
* Response from a {@code POST /btql} query.
*
* <p>Freshness is determined by comparing {@link FreshnessState#lastProcessedXactId()} to
* {@link FreshnessState#lastConsideredXactId()}: when both are non-null and equal, the query
* has caught up to all ingested data and the result is fresh.
*
* <p>The {@link RealtimeState#type()} field indicates whether realtime indexing is still active
* ({@code "on"}) or has timed out ({@code "exhausted_timeout"}).
*/
public record BtqlQueryResponse(
List<Map<String, Object>> data,
@JsonProperty("freshness_state") FreshnessState freshnessState,
@JsonProperty("realtime_state") RealtimeState realtimeState) {

/** Returns {@code true} when the query result has caught up to all ingested data. */
public boolean isFresh() {
if (freshnessState == null) {
return false;
}
var processed = freshnessState.lastProcessedXactId();
var considered = freshnessState.lastConsideredXactId();
return processed != null && processed.equals(considered);
}
}

public record FreshnessState(
@JsonProperty("last_processed_xact_id") String lastProcessedXactId,
@JsonProperty("last_considered_xact_id") String lastConsideredXactId) {}

/** Real-time indexing state for a BTQL query. */
public record RealtimeState(@JsonProperty("type") String type) {}

private record LoginRequest(String token) {}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package dev.braintrust.api;

import dev.braintrust.openapi.ApiException;
import java.net.http.HttpHeaders;

/** Thrown when the BTQL endpoint returns HTTP 429 (Too Many Requests). */
public final class BtqlRateLimitException extends ApiException {
BtqlRateLimitException(
int code, String message, HttpHeaders responseHeaders, String responseBody) {
super(code, message, responseHeaders, responseBody);
}
}
29 changes: 24 additions & 5 deletions braintrust-sdk/src/main/java/dev/braintrust/eval/Eval.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import dev.braintrust.openapi.api.ExperimentsApi;
import dev.braintrust.openapi.model.CreateExperiment;
import dev.braintrust.openapi.model.Project;
import dev.braintrust.trace.BrainstoreTrace;
import dev.braintrust.trace.BraintrustContext;
import dev.braintrust.trace.BraintrustTracing;
import io.opentelemetry.api.common.AttributeKey;
Expand Down Expand Up @@ -124,6 +125,7 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
}
try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
final TaskResult<INPUT, OUTPUT> taskResult;
final String taskSpanId;
{ // run task
var taskSpan =
tracer.spanBuilder("task")
Expand All @@ -132,6 +134,7 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
"braintrust.span_attributes",
toJson(Map.of("type", "task")))
.startSpan();
taskSpanId = taskSpan.getSpanContext().getSpanId();
try (var unused =
BraintrustContext.ofExperiment(experimentId, taskSpan).makeCurrent()) {
taskResult = task.apply(datasetCase, parameters);
Expand All @@ -155,32 +158,48 @@ private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase
}
taskSpan.end();
}

// Create a single BrainstoreTrace for this eval case, shared across all scorers.
// It fetches spans lazily on first access (only if a TracedScorer actually calls it).
// We wait specifically for the task span to appear, which guarantees its children
// (LLM spans, tool spans) have also been indexed — since children end before parents.
var rootTraceId = rootSpan.getSpanContext().getTraceId();
var trace =
BrainstoreTrace.forExperiment(
client, experimentId, rootTraceId, List.of(taskSpanId));

// run scorers - one span per scorer
for (var scorer : scorers) {
runScorer(experimentId, rootSpan, scorer, taskResult);
runScorer(experimentId, rootSpan, scorer, taskResult, trace);
}
} finally {
rootSpan.end();
}
}

/**
* Runs a scorer against a successful task result. If the scorer throws, falls back to {@link
* Scorer#scoreForScorerException}.
* Runs a scorer against a successful task result. If the scorer is a {@link TracedScorer}, it
* receives the {@link BrainstoreTrace} for the eval case. If the scorer throws, falls back to
* {@link Scorer#scoreForScorerException}.
*/
private void runScorer(
String experimentId,
Span rootSpan,
Scorer<INPUT, OUTPUT> scorer,
TaskResult<INPUT, OUTPUT> taskResult) {
TaskResult<INPUT, OUTPUT> taskResult,
BrainstoreTrace trace) {
var scoreSpan =
tracer.spanBuilder("score")
.setAttribute(PARENT, "experiment_id:" + experimentId)
.startSpan();
try (var unused = BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) {
List<Score> scores;
try {
scores = scorer.score(taskResult);
if (scorer instanceof TracedScorer<INPUT, OUTPUT> tracedScorer) {
scores = tracedScorer.score(taskResult, trace);
} else {
scores = scorer.score(taskResult);
}
} catch (Exception e) {
scoreSpan.setStatus(StatusCode.ERROR, e.getMessage());
scoreSpan.recordException(e);
Expand Down
41 changes: 41 additions & 0 deletions braintrust-sdk/src/main/java/dev/braintrust/eval/TracedScorer.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package dev.braintrust.eval;

import dev.braintrust.trace.BrainstoreTrace;
import java.util.List;

/**
* A scorer that receives access to the full distributed trace of the task that was evaluated.
*
* <p>Implement this interface when your scorer needs to examine intermediate LLM calls, tool
* invocations, or other spans produced during task execution — not just the final {@code
* TaskResult}.
*
* @param <INPUT> type of the input data
* @param <OUTPUT> type of the output data
*/
public interface TracedScorer<INPUT, OUTPUT> extends Scorer<INPUT, OUTPUT> {

/**
* Scores the task result using the distributed trace for additional context. Called instead of
* {@link Scorer#score(TaskResult)} when a {@link BrainstoreTrace} is available.
*
* @param taskResult the task output and originating dataset case
* @param trace lazy access to the distributed trace spans for this eval case
* @return one or more scores, each with a value between 0 and 1 inclusive
*/
List<Score> score(TaskResult<INPUT, OUTPUT> taskResult, BrainstoreTrace trace);

/**
* {@inheritDoc}
*
* <p>When used inside an {@link Eval}, this overload is never called — {@link
* #score(TaskResult, BrainstoreTrace)} is dispatched instead. This default implementation
* throws {@link UnsupportedOperationException} to surface any accidental direct calls.
*/
@Override
default List<Score> score(TaskResult<INPUT, OUTPUT> taskResult) {
throw new UnsupportedOperationException(
"traced scorer score method directly called. This is likely an accident. If you"
+ " wish to support this, your implementation must override this method.");
}
}
Loading
Loading