31 changes: 22 additions & 9 deletions .claude/skills/ruby-integration/SKILL.md
@@ -168,12 +168,11 @@ Follow existing example patterns:
**Do in this order:**

- [ ] **Appraisals FIRST**: Add to `Appraisals` file (latest + 2 recent + uninstalled), run `bundle exec appraisal generate`
- [ ] **Tests**: `test/braintrust/trace/your_provider_test.rb`
- [ ] **Integration**: `lib/braintrust/trace/contrib/your_provider.rb`
- [ ] **VCR cassettes**: `test/fixtures/vcr_cassettes/your_provider/` (record as you write tests)
- [ ] **Auto-load**: Add to `lib/braintrust/trace.rb` with `begin/rescue LoadError`
- [ ] **Example**: `examples/your_provider.rb`
- [ ] **Example**: `examples/internal/your_provider.rb` (comprehensive internal example)
- [ ] **Tests**: `test/braintrust/contrib/your_provider/`
- [ ] **Integration**: `lib/braintrust/contrib/your_provider/`
- [ ] **VCR cassettes**: Record with `VCR_MODE=all bundle exec appraisal <name> ruby -Ilib:test test/...` — **never hand-craft cassettes**
- [ ] **Example**: `examples/contrib/your_provider.rb` — run it and verify the permalink works
- [ ] **Example**: `examples/internal/contrib/your_provider/basic.rb` (comprehensive internal example)
- [ ] **Env var**: Add to `.env.example` if needed

## Test Coverage (LLM Providers)
@@ -324,12 +323,26 @@ Use shared `TokenParser.parse_usage_tokens(usage)` in `lib/braintrust/trace/toke

## VCR Cassettes

**CRITICAL: Never hand-craft cassette YAML files.** Cassettes must be recorded from real API responses so they contain authentic request/response data. Hand-crafted cassettes produce tests that pass against fake data but fail against real APIs.

To record cassettes (requires real API keys in env):

```bash
VCR_MODE=all bundle exec rake test # Re-record all
VCR_MODE=new_episodes bundle exec rake test # Record new only
VCR_OFF=true bundle exec rake test # Skip VCR
# Record cassettes for a specific appraisal (preferred — targets only your new tests)
VCR_MODE=all bundle exec appraisal <name> ruby -Ilib:test test/braintrust/contrib/<name>/...

# Re-record all cassettes for an appraisal
VCR_MODE=all bundle exec appraisal <name> rake test

# Record only new cassettes (keep existing)
VCR_MODE=new_episodes bundle exec appraisal <name> rake test

# Skip VCR entirely (useful for local debugging with a real key)
VCR_OFF=true bundle exec rake test
```

An `AGENTS.md` file in `test/fixtures/vcr_cassettes/` explains this to future contributors.
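The env-var handling described above can be sketched as a small standalone helper. This is a hypothetical illustration (the repo's actual test helper may be named and structured differently); `:all`, `:new_episodes`, and `:once` are real VCR record modes, and `nil` here stands for "don't hook VCR at all":

```ruby
# Hypothetical helper (not in the repo) mapping VCR_MODE / VCR_OFF onto
# VCR record modes. :once (play back existing cassettes) is assumed as
# the default; nil means the caller should skip VCR entirely.
def vcr_record_mode(env = ENV)
  return nil if env["VCR_OFF"] == "true"

  case env["VCR_MODE"]
  when "all" then :all                   # re-record every cassette
  when "new_episodes" then :new_episodes # record only missing cassettes
  else :once                             # play back what is already recorded
  end
end
```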

## Reference Files

- Integrations: `lib/braintrust/trace/contrib/{openai,anthropic}.rb`
4 changes: 4 additions & 0 deletions Appraisals
@@ -26,6 +26,10 @@ OPTIONAL_GEMS = {
"1.8" => {constraint: "~> 1.8.0", deps: {}},
"1.9" => {constraint: "~> 1.9.0", deps: {}},
"latest" => {constraint: ">= 1.9", deps: {}}
},
"llm.rb" => {
"4.11" => {constraint: "~> 4.11.0", deps: {}},
"latest" => {constraint: ">= 4.11", deps: {}}
}
}

51 changes: 51 additions & 0 deletions examples/contrib/llm_rb.rb
@@ -0,0 +1,51 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "braintrust"
require "llm"
require "opentelemetry/sdk"

# Example: Basic llm.rb chat with Braintrust tracing
#
# Usage:
# OPENAI_API_KEY=your-key bundle exec appraisal llm.rb ruby examples/contrib/llm_rb.rb

unless ENV["OPENAI_API_KEY"]
puts "Error: OPENAI_API_KEY environment variable is required"
exit 1
end

# Initialize Braintrust (with blocking login)
#
# NOTE: blocking_login is only necessary for this short-lived example.
# In most production apps, you can omit this.
Braintrust.init(blocking_login: true)

llm = LLM.openai(key: ENV["OPENAI_API_KEY"])
ctx = LLM::Context.new(llm)

# Instrument this context instance to produce Braintrust spans
Braintrust.instrument!(:llm_rb, target: ctx)

# Get a tracer and wrap the conversation in a root span
tracer = OpenTelemetry.tracer_provider.tracer("llm-rb-example")

root_span = nil
tracer.in_span("examples/contrib/llm_rb.rb") do |span|
root_span = span

# Each ctx.talk call is automatically traced as a child span
ctx.talk("What is the capital of France?")
ctx.talk("And what is the population of that city?")
end

# Print permalink to view this trace in Braintrust
puts "\nView this trace in Braintrust:"
puts " #{Braintrust::Trace.permalink(root_span)}"

# Shutdown to flush spans to Braintrust
#
# NOTE: shutdown is only necessary for this short-lived example.
# In most production apps, you can omit this.
OpenTelemetry.tracer_provider.shutdown
19 changes: 19 additions & 0 deletions gemfiles/llm.rb.gemfile
@@ -0,0 +1,19 @@
# This file was generated by Appraisal

source "https://rubygems.org"

gem "appraisal", "~> 2.5"
gem "climate_control", "~> 1.2"
gem "kramdown", "~> 2.0"
gem "minitest-reporters", "~> 1.6"
gem "minitest-stub-const", "~> 0.6"
gem "minitest", "~> 5.0"
gem "rake", "~> 13.0"
gem "simplecov", "~> 0.22"
gem "standard", "~> 1.0"
gem "vcr", "~> 6.0"
gem "webmock", "~> 3.0"
gem "yard", "~> 0.9"
gem "llm.rb", ">= 4.11"

gemspec path: "../"
19 changes: 19 additions & 0 deletions gemfiles/llm.rb_4_11.gemfile
@@ -0,0 +1,19 @@
# This file was generated by Appraisal

source "https://rubygems.org"

gem "appraisal", "~> 2.5"
gem "climate_control", "~> 1.2"
gem "kramdown", "~> 2.0"
gem "minitest-reporters", "~> 1.6"
gem "minitest-stub-const", "~> 0.6"
gem "minitest", "~> 5.0"
gem "rake", "~> 13.0"
gem "simplecov", "~> 0.22"
gem "standard", "~> 1.0"
gem "vcr", "~> 6.0"
gem "webmock", "~> 3.0"
gem "yard", "~> 0.9"
gem "llm.rb", "~> 4.11.0"

gemspec path: "../"
18 changes: 18 additions & 0 deletions gemfiles/llm.rb_uninstalled.gemfile
@@ -0,0 +1,18 @@
# This file was generated by Appraisal

source "https://rubygems.org"

gem "appraisal", "~> 2.5"
gem "climate_control", "~> 1.2"
gem "kramdown", "~> 2.0"
gem "minitest-reporters", "~> 1.6"
gem "minitest-stub-const", "~> 0.6"
gem "minitest", "~> 5.0"
gem "rake", "~> 13.0"
gem "simplecov", "~> 0.22"
gem "standard", "~> 1.0"
gem "vcr", "~> 6.0"
gem "webmock", "~> 3.0"
gem "yard", "~> 0.9"

gemspec path: "../"
2 changes: 2 additions & 0 deletions lib/braintrust/contrib.rb
@@ -197,9 +197,11 @@ def tracer_for(target, name: "braintrust")
require_relative "contrib/ruby_openai/integration"
require_relative "contrib/ruby_llm/integration"
require_relative "contrib/anthropic/integration"
require_relative "contrib/llm_rb/integration"

# Register integrations
Braintrust::Contrib::OpenAI::Integration.register!
Braintrust::Contrib::RubyOpenAI::Integration.register!
Braintrust::Contrib::RubyLLM::Integration.register!
Braintrust::Contrib::Anthropic::Integration.register!
Braintrust::Contrib::LlmRb::Integration.register!
39 changes: 39 additions & 0 deletions lib/braintrust/contrib/llm_rb/instrumentation/common.rb
@@ -0,0 +1,39 @@
# frozen_string_literal: true

module Braintrust
module Contrib
module LlmRb
module Instrumentation
# Common utilities for llm.rb instrumentation.
module Common
# Parse LLM::Usage into normalized Braintrust metrics.
# LLM::Usage has: input_tokens, output_tokens, reasoning_tokens, total_tokens
#
# @param usage [LLM::Usage, nil] usage struct from llm.rb response
# @return [Hash<String, Integer>] normalized metrics for Braintrust
def self.parse_usage_tokens(usage)
return {} unless usage

input = usage.respond_to?(:input_tokens) ? usage.input_tokens : nil
output = usage.respond_to?(:output_tokens) ? usage.output_tokens : nil
reasoning = usage.respond_to?(:reasoning_tokens) ? usage.reasoning_tokens : nil
total = usage.respond_to?(:total_tokens) ? usage.total_tokens : nil

metrics = {}
metrics["prompt_tokens"] = input.to_i if input
metrics["completion_tokens"] = output.to_i if output
metrics["completion_reasoning_tokens"] = reasoning.to_i if reasoning && reasoning.to_i > 0

if metrics.key?("prompt_tokens") && metrics.key?("completion_tokens")
metrics["tokens"] = metrics["prompt_tokens"] + metrics["completion_tokens"]
elsif total
metrics["tokens"] = total.to_i
end

metrics
end
end
end
end
end
end
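The normalization rules in `parse_usage_tokens` can be exercised in isolation. A minimal sketch that restates the same logic outside the gem, with a `Usage` struct standing in for `LLM::Usage`:

```ruby
# Stand-in for LLM::Usage with the four fields the parser reads.
Usage = Struct.new(:input_tokens, :output_tokens, :reasoning_tokens, :total_tokens)

# Same normalization rules as Common.parse_usage_tokens, restated standalone.
def parse_usage_tokens(usage)
  return {} unless usage

  metrics = {}
  metrics["prompt_tokens"] = usage.input_tokens.to_i if usage.input_tokens
  metrics["completion_tokens"] = usage.output_tokens.to_i if usage.output_tokens
  if usage.reasoning_tokens && usage.reasoning_tokens.to_i > 0
    metrics["completion_reasoning_tokens"] = usage.reasoning_tokens.to_i
  end

  # Prefer summing prompt + completion; fall back to the provider's total.
  if metrics.key?("prompt_tokens") && metrics.key?("completion_tokens")
    metrics["tokens"] = metrics["prompt_tokens"] + metrics["completion_tokens"]
  elsif usage.total_tokens
    metrics["tokens"] = usage.total_tokens.to_i
  end

  metrics
end

puts parse_usage_tokens(Usage.new(12, 34, 0, 46)).inspect
```

Note the two edge cases the rules encode: a zero `reasoning_tokens` is dropped rather than reported, and `total_tokens` is used only when either input or output counts are missing.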
161 changes: 161 additions & 0 deletions lib/braintrust/contrib/llm_rb/instrumentation/context.rb
@@ -0,0 +1,161 @@
# frozen_string_literal: true

require "opentelemetry/sdk"
require "json"

require_relative "common"
require_relative "../../support/otel"

module Braintrust
module Contrib
module LlmRb
module Instrumentation
# Context instrumentation for llm.rb.
# Wraps LLM::Context#talk to create Braintrust spans for chat completions.
module Context
def self.included(base)
base.prepend(InstanceMethods) unless applied?(base)
end

def self.applied?(base)
base.ancestors.include?(InstanceMethods)
end

module InstanceMethods
# Wrap talk() to trace chat completions.
# Captures input messages, output, token usage, and timing.
# NOTE: super must be called from within this method (not a helper)
# because Ruby's super keyword resolves the method chain at the call site.
def talk(prompt, params = {})
return super unless tracing_enabled?

tracer = Braintrust::Contrib.tracer_for(self)

tracer.in_span("llm_rb.chat") do |span|
# Capture inputs BEFORE calling super (before @messages is updated)
input_messages = build_input_messages(prompt, params)
Support::OTel.set_json_attr(span, "braintrust.input_json", input_messages) if input_messages.any?

metadata = extract_metadata(params)

begin
res = super(prompt, params)

# Capture output from response
output = capture_output(res)
Support::OTel.set_json_attr(span, "braintrust.output_json", output) unless output.empty?

# Update metadata with actual model from response
if res.respond_to?(:model) && res.model
metadata["model"] = res.model
end
Support::OTel.set_json_attr(span, "braintrust.metadata", metadata)

# Capture token metrics
usage = res.respond_to?(:usage) ? res.usage : nil
metrics = Common.parse_usage_tokens(usage)
Support::OTel.set_json_attr(span, "braintrust.metrics", metrics) unless metrics.empty?

res
rescue => e
span.record_exception(e)
span.status = ::OpenTelemetry::Trace::Status.error("llm.rb error: #{e.message}")
raise
end
end
end

private

# Checks if tracing is enabled via Braintrust::Contrib context.
def tracing_enabled?
ctx = Braintrust::Contrib.context_for(self)
ctx&.[](:enabled) != false
end

# Build input messages array from existing history + new prompt.
# Called BEFORE super so we capture the state before @messages is updated.
def build_input_messages(prompt, params)
existing = @messages.to_a.map { |m| format_message_for_input(m) }

new_msgs = if defined?(::LLM::Prompt) && ::LLM::Prompt === prompt
prompt.to_a.map { |m| format_message_for_input(m) }
elsif prompt.is_a?(Array)
prompt.flat_map do |m|
if m.respond_to?(:role)
[format_message_for_input(m)]
else
[{"role" => "user", "content" => m.to_s}]
end
end
else
role = (params[:role] || @params[:role] || @llm.user_role).to_s
[{"role" => role, "content" => prompt.to_s}]
end

existing + new_msgs
end

# Format an LLM::Message into OpenAI-compatible hash.
def format_message_for_input(msg)
return {"role" => "user", "content" => msg.to_s} unless msg.respond_to?(:role)

formatted = {"role" => msg.role.to_s}

content = msg.content
content = content.to_s if content && !content.is_a?(String)
formatted["content"] = content

# Tool calls on assistant messages
if msg.respond_to?(:extra) && (tcs = msg.extra&.tool_calls)&.respond_to?(:any?) && tcs.any?
formatted["tool_calls"] = tcs.map { |tc| format_tool_call_for_input(tc) }
formatted["content"] = nil
end

formatted.compact
end

# Format a tool call into OpenAI-compatible format.
def format_tool_call_for_input(tc)
id = tc.respond_to?(:[]) ? (tc["id"] || tc[:id]) : nil
name = tc.respond_to?(:[]) ? (tc["name"] || tc[:name]) : nil
args = tc.respond_to?(:[]) ? (tc["arguments"] || tc[:arguments]) : nil
args_str = args.is_a?(String) ? args : args.to_json

{
"id" => id,
"type" => "function",
"function" => {
"name" => name,
"arguments" => args_str
}
}.compact
end

# Extract metadata from the context (provider name, model).
def extract_metadata(params)
provider_name = @llm.respond_to?(:name) ? @llm.name.to_s : @llm.class.name.split("::").last.downcase
merged = @params.merge(params)
model = merged[:model]

{
"provider" => "llm_rb",
"llm_provider" => provider_name,
"model" => model
}.compact
end

# Capture output messages from the response.
def capture_output(res)
return [] unless res.respond_to?(:choices)

res.choices.map { |msg| format_message_for_input(msg) }
rescue
[]
end
end
end
end
end
end
end
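The include-triggers-prepend pattern used by `Context.included` can be demonstrated in isolation. A minimal sketch with hypothetical names (`Tracing`, `FakeContext`): including the module actually prepends its `InstanceMethods`, so `super` inside the wrapper reaches the class's original method.

```ruby
# Minimal sketch of the pattern above: the included hook prepends
# InstanceMethods, placing the wrapper ahead of the class in the
# ancestor chain so `super` resolves to the original implementation.
module Tracing
  def self.included(base)
    base.prepend(InstanceMethods) unless base.ancestors.include?(InstanceMethods)
  end

  module InstanceMethods
    def talk(prompt)
      "traced(#{super})" # wrap the original return value
    end
  end
end

class FakeContext
  def talk(prompt)
    "echo:#{prompt}"
  end
  include Tracing
end

puts FakeContext.new.talk("hi") # prints "traced(echo:hi)"
```

This is also why the real `talk` calls `super` directly rather than delegating to a helper: `super` resolves the method chain at its call site.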