Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions examples/eval/classifiers.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"

# Example: Classifiers
#
# Classifiers categorize and label eval outputs. Unlike scorers (which return
# numeric 0-1 values), classifiers return structured Classification items —
# each with an :id, an optional :label, and optional :metadata.
#
# Results are stored as a dictionary keyed by classifier name:
#
# { "sentiment" => [{ id: "positive", label: "Positive" }] }
#
# Three patterns are shown:
#
# 1. Block-based (Braintrust::Classifier.new):
# Returns a single Classification hash. Good for concise, one-off classifiers.
#
# 2. Multi-label block-based:
# Returns an Array of Classification hashes — useful when a single
# classifier assigns multiple labels to the same output.
#
# 3. Class-based (include Braintrust::Classifier):
# Define a class with a #call method. Good for reusable classifiers
# that carry their own logic and state.
#
# Classifiers and scorers run independently. You can use both together, or
# use only classifiers when you don't need numeric scores.
#
# Usage:
# bundle exec ruby examples/eval/classifiers.rb

Braintrust.init

# ---------------------------------------------------------------------------
# Test cases: customer support messages
#
# Each case is a Hash carrying only an :input key. No :expected value is
# supplied; the classifiers defined below read only :input and/or the task
# output.
# ---------------------------------------------------------------------------
MESSAGES = [
{input: "Hi! I just wanted to say thank you, the product is amazing!"},
{input: "I've been waiting 2 weeks for my order. This is unacceptable!"},
{input: "How do I reset my password? I can't find the option anywhere."},
{input: "The item arrived damaged. I need a refund immediately."},
{input: "Just checking in — any update on my ticket #4821?"}
]

# ---------------------------------------------------------------------------
# Simulated task: generate a support response (replace with a real LLM call)
# ---------------------------------------------------------------------------
# Returns a canned support reply for the given customer message.
#
# Patterns are tried in order and the first match wins; anything that matches
# none of them gets a generic acknowledgement.
#
# @param message [String] the customer's message
# @return [String] the simulated support response
def generate_response(message)
  canned_replies = [
    [/thank/i, "You're welcome! So glad you're enjoying it."],
    [/waiting|order/i, "I sincerely apologise for the delay. Let me look into this right away."],
    [/password|reset/i, "To reset your password, go to Settings > Account > Reset Password."],
    [/damaged|refund/i, "I'm sorry to hear that. I'll process your refund immediately."]
  ]

  # Regexp#=== is what `case/when` uses; it returns false (rather than raising)
  # for values that are not strings or symbols.
  _pattern, reply = canned_replies.find { |pattern, _| pattern === message }
  reply || "Thanks for reaching out! Let me check on that for you."
end

# ---------------------------------------------------------------------------
# Pattern 1: block-based single-label classifier
#
# Classifies each message into a single intent category.
# Declare only the kwargs you need — extras are filtered automatically.
# ---------------------------------------------------------------------------
intent_classifier = Braintrust::Classifier.new("intent") do |input:|
  # Ordered rules: the first pattern that matches the message decides the intent.
  intent_rules = [
    ["praise", /thank/i],
    ["follow_up", /waiting|order|update/i],
    ["how_to", /password|reset|find/i],
    ["complaint", /damaged|refund/i]
  ]

  # Regexp#=== mirrors `case/when` matching semantics.
  matched_rule = intent_rules.find { |_intent, pattern| pattern === input }
  id = matched_rule ? matched_rule.first : "other"

  # The label is the id made human-readable, e.g. "follow_up" -> "Follow up".
  {name: "intent", id: id, label: id.tr("_", " ").capitalize}
end

# ---------------------------------------------------------------------------
# Pattern 2: block-based multi-label classifier
#
# A single classifier can return an Array to assign multiple labels.
# All items sharing the same :name are grouped into the same results array.
# ---------------------------------------------------------------------------
tone_classifier = Braintrust::Classifier.new("tone") do |input:|
  # Each rule is [id, label, pattern]; every rule whose pattern matches
  # contributes one classification, in the order listed here.
  tone_rules = [
    ["urgent", "Urgent", /immediately|unacceptable|waiting/i],
    ["polite", "Polite", /please|thank|just checking/i],
    ["frustrated", "Frustrated", /unacceptable|damaged|waiting/i]
  ]

  detected = tone_rules.select { |_id, _label, pattern| input.match?(pattern) }
  # Fall back to a single "neutral" label when no tone was detected.
  detected = [["neutral", "Neutral"]] if detected.empty?

  detected.map { |id, label, _pattern| {name: "tone", id: id, label: label} }
end

# ---------------------------------------------------------------------------
# Pattern 3: class-based classifier
#
# Include Braintrust::Classifier and define #call with keyword args.
# The class name is snake_cased to derive the default classifier name
# (ResponseQualityClassifier -> "response_quality_classifier").
# Override #name to customise it.
# ---------------------------------------------------------------------------
# Example class-based classifier: labels the task's response by its apparent
# quality. Categories are checked in this order:
#
# - "no_response":     the output is blank
# - "too_short":       fewer than 5 whitespace-separated words
# - "action_oriented": the output mentions taking action
# - "informational":   anything else
class ResponseQualityClassifier
  include Braintrust::Classifier

  # Phrases signalling that the response commits to doing something.
  ACTION_PATTERN = /immediately|right away|look into/i

  # Overrides the default name that would otherwise be derived from the class name.
  # @return [String]
  def name
    "response_quality"
  end

  # @param input [Object] the case input (not used by this classifier)
  # @param output [Object] the task output; coerced with #to_s so that nil and
  #   other non-String outputs are classified instead of raising
  # @return [Hash] a classification with :name, :id, :label and
  #   :metadata (containing :word_count)
  def call(input:, output:)
    # Coerce once and use the same text for every check. Calling #match? on the
    # raw output would raise NoMethodError for values such as a Hash or Array.
    text = output.to_s
    word_count = text.split.length

    id = if text.strip.empty?
      "no_response"
    elsif word_count < 5
      "too_short"
    elsif text.match?(ACTION_PATTERN)
      "action_oriented"
    else
      "informational"
    end

    {
      name: name,
      id: id,
      label: id.tr("_", " ").capitalize,
      metadata: {word_count: word_count}
    }
  end
end

# ---------------------------------------------------------------------------
# Run the eval — classifiers only (no numeric scores needed here)
#
# `scorers:` is deliberately omitted; only `classifiers:` is passed.
# The list mixes both block-based classifiers and an instance of the
# class-based one.
# ---------------------------------------------------------------------------
Braintrust::Eval.run(
project: "ruby-sdk-examples",
experiment: "classifiers-example",
cases: MESSAGES,
task: ->(input:) { generate_response(input) },
classifiers: [intent_classifier, tone_classifier, ResponseQualityClassifier.new]
)

# NOTE(review): presumably this flushes any buffered spans before the script
# exits — confirm against the OpenTelemetry SDK documentation.
OpenTelemetry.tracer_provider.shutdown
157 changes: 157 additions & 0 deletions lib/braintrust/classifier.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# frozen_string_literal: true

require_relative "internal/callable"

module Braintrust
# Classifier wraps a classification function that categorizes and labels eval outputs.
#
# Unlike scorers (which return numeric 0-1 values), classifiers return structured
# {Classification} items with an id and optional label and metadata.
#
# Use inline with a block (keyword args):
# classifier = Classifier.new("category") { |output:| {name: "category", id: "greeting", label: "Greeting"} }
#
# Or include in a class and define #call with keyword args:
# class CategoryClassifier
# include Braintrust::Classifier
#
# def call(output:)
# {name: "category", id: "greeting", label: "Greeting"}
# end
# end
#
# Classifiers may return a single Classification hash, an Array of them, or nil
# (meaning no classifications for this case).
module Classifier
DEFAULT_NAME = "classifier"

# Hook run by Ruby when a class does +include Classifier+. It additionally
# mixes Callable into that class, so including Classifier alone is enough.
# @param base [Class] the class including Classifier
def self.included(base)
base.include(Callable)
end

# Create a block-based classifier.
#
# Classifier is a module, so this is a factory method rather than a
# constructor: it builds and returns a Classifier::Block wrapping the block.
#
# @param name [String, nil] optional name; nil (or false) falls back to
# DEFAULT_NAME ("classifier")
# @param block [Proc] the classification implementation; declare only the keyword
# args you need. Extra kwargs are filtered out automatically.
#
# Supported kwargs: +input:+, +expected:+, +output:+, +metadata:+, +trace:+, +parameters:+
# @return [Classifier::Block]
# @raise [ArgumentError] if the block has unsupported arity
def self.new(name = nil, &block)
Block.new(name: name || DEFAULT_NAME, &block)
end

# Included into classes that +include Classifier+. Prepends KeywordFilter and
# ClassificationNormalizer so #call receives only declared kwargs and always returns
# Array<Hash>. Also provides a default #name and #call_parameters.
module Callable
# Normalizes the raw return value of #call into Array<Hash>.
# Nested inside Callable because it depends on #name which Callable provides.
module ClassificationNormalizer
# Wraps the next #call in the ancestor chain and normalizes whatever it returns.
# The bare +super+ forwards the same keyword arguments unchanged.
# @param kwargs [Hash] keyword arguments passed through to the wrapped #call
# @return [Array<Hash>] normalized classification hashes with :name, :id, and optional :label, :metadata keys
def call(**kwargs)
normalize_classification_result(super)
end

private

# Coerces whatever a classifier returned into a flat list of classifications.
#
# - nil becomes an empty list (no classifications for this case)
# - a single Hash becomes a one-element list
# - an Array has each element normalized individually
#
# @param result [Hash, Array<Hash>, nil] raw return value from #call
# @return [Array<Hash>] zero or more normalized classification hashes
# @raise [ArgumentError] if result is neither nil, a Hash, nor an Array
#   (or if any individual item is rejected during normalization)
def normalize_classification_result(result)
  return [] if result.nil?
  return [normalize_classification_item(result)] if result.is_a?(Hash)

  if result.is_a?(Array)
    return result.map { |entry| normalize_classification_item(entry) }
  end

  raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{result.inspect}"
end

# Validates the shape of one classification and makes sure it carries a usable :name.
#
# The item is returned as-is when :name is already a non-empty String.
# Otherwise a copy is returned with :name set to this classifier's #name;
# the caller's hash is never mutated. Note that :id is not inspected here.
#
# @param item [Hash] a classification hash
# @return [Hash] the item, with :name defaulted when it was missing, empty, or non-String
# @raise [ArgumentError] if item is not a non-empty Hash
def normalize_classification_item(item)
  if !item.is_a?(Hash) || item.empty?
    raise ArgumentError, "When returning structured classifier results, each classification must be a non-empty object. Got: #{item.inspect}"
  end

  given_name = item[:name]
  return item if given_name.is_a?(String) && !given_name.empty?

  item.merge(name: name)
end
end

# Infrastructure modules prepended onto every classifier class.
# Used both to set up the ancestor chain and to skip past them in
# #call_parameters so KeywordFilter sees the real call signature.
PREPENDED = [Internal::Callable::KeywordFilter, ClassificationNormalizer].freeze

# Hook run by Ruby when Callable is included into a class; prepends every
# module in PREPENDED onto that class.
#
# Modules are prepended one at a time in list order, and each prepend goes to
# the front of the ancestor chain, so the LAST entry in PREPENDED ends up
# outermost and its #call runs first.
# @param base [Class] the class including Callable
def self.included(base)
PREPENDED.each { |mod| base.prepend(mod) }
end

# Default name derived from the class name, converted to snake_case
# (e.g. CategoryClassifier -> "category_classifier",
# HTTPClassifier -> "http_classifier", Top1Classifier -> "top1_classifier").
#
# Only the last segment of a namespaced class name is used. Anonymous classes,
# whose Class#name is nil, fall back to Classifier::DEFAULT_NAME.
# @return [String]
def name
  klass = self.class.name&.split("::")&.last
  return Classifier::DEFAULT_NAME unless klass

  klass
    .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2') # acronym followed by a word: HTTPClassifier -> HTTP_Classifier
    .gsub(/([a-z\d])([A-Z])/, '\1_\2')     # lower-case letter or digit followed by a capital
    .downcase
end

# Reports the parameter list of the classifier's own #call implementation.
#
# Looking up #call on the object first finds the infrastructure wrappers listed
# in PREPENDED, whose signatures are not the user's. This climbs the
# super-method chain past those wrappers and returns the parameters of the
# first #call that is not owned by one of them. If the chain runs out while
# still on a wrapper, that wrapper's parameters are returned.
# Block overrides this to point directly at @block.parameters.
# @return [Array<Array>] parameter list in Method#parameters format
def call_parameters
  current = method(:call)

  while PREPENDED.include?(current.owner)
    parent = current.super_method
    break unless parent

    current = parent
  end

  current.parameters
end
end

# Block-based classifier. Stores a Proc and delegates #call to it.
# Includes Classifier so it satisfies +Classifier ===+ checks.
# Exposes #call_parameters so KeywordFilter can introspect the block's
# declared kwargs rather than Block#call's **kwargs signature.
class Block
include Classifier

# @return [String]
attr_reader :name

# Stores the classifier name and the block that implements it.
#
# @param name [String] classifier name
# @param block [Proc] classification implementation; must use keyword args or zero-arity
# @raise [ArgumentError] if no block is given, or if the block uses positional params
def initialize(name: DEFAULT_NAME, &block)
  # Without this guard a missing block surfaces as NoMethodError on nil
  # instead of the documented ArgumentError.
  raise ArgumentError, "Classifier requires a block" unless block

  accepts_keywords = Internal::Callable::KeywordFilter.has_any_keywords?(block.parameters)
  unless accepts_keywords || block.arity.zero?
    raise ArgumentError, "Classifier block must use keyword args (got arity #{block.arity})"
  end

  @name = name
  @block = block
end

# Invokes the stored block with the given keyword arguments.
#
# This method body returns the block's raw result. Callers still receive
# Array<Hash> because Block includes Classifier, which causes the modules in
# Callable::PREPENDED (KeywordFilter, ClassificationNormalizer) to be
# prepended and wrap this method.
# @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
# @return [Array<Hash>] normalized classification results
def call(**kwargs)
@block.call(**kwargs)
end

# Exposes the block's parameter list so KeywordFilter can filter
# kwargs to match the block's declared keywords.
# Overrides Callable#call_parameters: Block#call itself only declares
# **kwargs, so the stored block's own signature is reported instead.
# @return [Array<Array>] parameter list from Proc#parameters
def call_parameters
@block.parameters
end
end
end
end
35 changes: 24 additions & 11 deletions lib/braintrust/eval.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# frozen_string_literal: true

require_relative "classifier"
require_relative "scorer"
require_relative "task"
require_relative "functions"
Expand Down Expand Up @@ -160,7 +161,10 @@ def scorer(name, callable = nil, &block)
# - String: dataset name (fetches from same project)
# - Hash: {name:, id:, project:, version:, limit:}
# @param task [#call] The task to evaluate (must be callable)
# @param scorers [Array<String, Scorer, #call>] The scorers to use (String names, Scorer objects, or callables)
# @param scorers [Array<String, Scorer, #call>, nil] The scorers to use (String names, Scorer objects, or callables).
# At least one of scorers or classifiers must be provided.
# @param classifiers [Array<Classifier, #call>, nil] The classifiers to use.
# At least one of scorers or classifiers must be provided.
# @param on_progress [#call, nil] Optional callback fired after each test case.
# Receives a Hash: {"data" => output, "scores" => {name => value}} on success,
# or {"error" => message} on failure.
Expand All @@ -177,13 +181,16 @@ def scorer(name, callable = nil, &block)
# @param parent [Hash, nil] Parent span context ({object_type:, object_id:, generation:})
# @param parameters [Hash, nil] Runtime parameters passed to task and scorers as a `parameters:` keyword argument
# @return [Result]
def run(task:, scorers:, project: nil, experiment: nil,
cases: nil, dataset: nil, on_progress: nil,
def run(task:, scorers: nil, classifiers: nil, project: nil,
experiment: nil, cases: nil, dataset: nil, on_progress: nil,
parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
state: nil, tracer_provider: nil, project_id: nil, parent: nil,
parameters: nil)
# Validate required parameters
validate_params!(task: task, scorers: scorers, cases: cases, dataset: dataset)
validate_params!(task: task, scorers: scorers,
classifiers: classifiers, cases: cases, dataset: dataset)
scorers ||= []
classifiers ||= []

experiment_id = nil
project_name = project
Expand Down Expand Up @@ -216,6 +223,7 @@ def run(task:, scorers:, project: nil, experiment: nil,
context = Context.build(
task: task,
scorers: scorers,
classifiers: classifiers,
cases: cases,
experiment_id: experiment_id,
experiment_name: experiment,
Expand Down Expand Up @@ -245,9 +253,19 @@ def print_result(result)

# Validate required parameters
# @raise [ArgumentError] if validation fails
def validate_params!(task:, scorers:, cases:, dataset:)
def validate_params!(task:, scorers:, classifiers:, cases:, dataset:)
raise ArgumentError, "task is required" unless task
raise ArgumentError, "scorers is required" unless scorers

# Validate task is callable before anything else
unless task.respond_to?(:call)
raise ArgumentError, "task must be callable (respond to :call)"
end

has_scorers = scorers && !scorers.empty?
has_classifiers = classifiers && !classifiers.empty?
unless has_scorers || has_classifiers
raise ArgumentError, "at least one of scorers or classifiers is required"
end

# Validate cases and dataset are mutually exclusive
if cases && dataset
Expand All @@ -258,11 +276,6 @@ def validate_params!(task:, scorers:, cases:, dataset:)
unless cases || dataset
raise ArgumentError, "must specify either 'cases' or 'dataset'"
end

# Validate task is callable
unless task.respond_to?(:call)
raise ArgumentError, "task must be callable (respond to :call)"
end
end

# Resolve project by name or ID. Creates if needed.
Expand Down
Loading
Loading