7 changes: 7 additions & 0 deletions lib/braintrust/api.rb
@@ -1,6 +1,7 @@
# frozen_string_literal: true

require_relative "api/datasets"
require_relative "api/experiments"
require_relative "api/functions"

module Braintrust
@@ -20,6 +21,12 @@ def datasets
@datasets ||= API::Datasets.new(self)
end

# Access to experiments API
# @return [API::Experiments]
def experiments
@experiments ||= API::Experiments.new(self)
end

# Access to functions API
# @return [API::Functions]
def functions
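The new accessor mirrors the existing datasets and functions helpers. A minimal usage sketch, not part of this diff, assuming an API client built from a configured State (as done later in eval.rb):

# Sketch only: `state` is assumed to be a configured Braintrust::State (api_key, api_url).
api = Braintrust::API.new(state: state)
api.experiments  # => Braintrust::API::Experiments, memoized after the first call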
92 changes: 92 additions & 0 deletions lib/braintrust/api/experiments.rb
@@ -0,0 +1,92 @@
# frozen_string_literal: true

require "net/http"
require "json"
require "uri"
require_relative "../logger"

module Braintrust
class API
# Experiments API namespace
# Provides methods for fetching experiment comparison data
class Experiments
def initialize(api)
@api = api
@state = api.state
end

# Fetch experiment comparison data
# GET /experiment-comparison2
# Returns score and metric summaries, with a comparison against a baseline experiment
# @param experiment_id [String] Current experiment ID
# @param base_experiment_id [String, nil] Baseline experiment ID to compare against.
# If nil, the API auto-selects a baseline from experiment metadata or the most recent experiment.
# @return [Hash] Response with scores, metrics, and comparison info
# - "scores" [Hash] Score summaries keyed by name
# - "metrics" [Hash] Metric summaries keyed by name
# - "comparisonExperimentName" [String, nil] Name of baseline experiment
# - "comparisonExperimentId" [String, nil] ID of baseline experiment
def comparison(experiment_id:, base_experiment_id: nil)
params = {"experiment_id" => experiment_id}
params["base_experiment_id"] = base_experiment_id if base_experiment_id

http_get("/experiment-comparison2", params)
end

private

# Core HTTP request method with logging
# @param method [Symbol] :get or :post
# @param path [String] API path
# @param params [Hash] Query params (for GET)
# @param payload [Hash, nil] JSON payload (for POST)
# @return [Hash] Parsed JSON response
def http_request(method, path, params: {}, payload: nil)
# Build URI - use api_url for this endpoint
base = @state.api_url || "https://api.braintrust.dev"
uri = URI("#{base}#{path}")
uri.query = URI.encode_www_form(params) unless params.empty?

# Create request
request = case method
when :get
Net::HTTP::Get.new(uri)
when :post
req = Net::HTTP::Post.new(uri)
req["Content-Type"] = "application/json"
req.body = JSON.dump(payload) if payload
req
else
raise ArgumentError, "Unsupported HTTP method: #{method}"
end

# Use Bearer token format for API endpoints
request["Authorization"] = "Bearer #{@state.api_key}"

# Execute request with timing
start_time = Time.now
Log.debug("[API] #{method.upcase} #{uri}")

http = Net::HTTP.new(uri.host, uri.port)
http.use_ssl = (uri.scheme == "https")
response = http.request(request)

duration_ms = ((Time.now - start_time) * 1000).round(2)
Log.debug("[API] #{method.upcase} #{uri} -> #{response.code} (#{duration_ms}ms, #{response.body.bytesize} bytes)")

# Handle response
unless response.is_a?(Net::HTTPSuccess)
Log.debug("[API] Error response body: #{response.body}")
raise Error, "HTTP #{response.code} for #{method.upcase} #{uri}: #{response.body}"
end

JSON.parse(response.body)
end

# HTTP GET with query params - returns parsed JSON
def http_get(path, params = {})
http_request(:get, path, params: params)
end
end
end
end
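For reference, a hedged sketch of calling the new endpoint and reading the documented response keys; the experiment IDs are placeholders and the client setup is assumed as above:

# Assumes `api` is a Braintrust::API backed by a configured State.
response = api.experiments.comparison(
  experiment_id: "exp-123",            # placeholder ID
  base_experiment_id: "exp-000"        # optional; omit to let the server pick a baseline
)

response["scores"]                     # => Hash of score summaries keyed by name
response["metrics"]                    # => Hash of metric summaries keyed by name
response["comparisonExperimentName"]   # => baseline experiment name, or nil
response["comparisonExperimentId"]     # => baseline experiment ID, or nil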
207 changes: 194 additions & 13 deletions lib/braintrust/eval.rb
@@ -2,6 +2,8 @@

require_relative "eval/scorer"
require_relative "eval/runner"
require_relative "eval/summary"
require_relative "eval/result"
require_relative "internal/experiments"

require "opentelemetry/sdk"
@@ -199,13 +201,15 @@ def scorer(name, callable = nil, &block)
# @param metadata [Hash] Optional experiment metadata
# @param update [Boolean] If true, allow reusing existing experiment (default: false)
# @param quiet [Boolean] If true, suppress result output (default: false)
# @param send_logs [Boolean] If true (default), create the experiment on the server and send span data.
# If false, run the evaluation locally without sending data to Braintrust.
# @param state [State, nil] Braintrust state (defaults to global state)
# @param tracer_provider [TracerProvider, nil] OpenTelemetry tracer provider (defaults to global)
# @return [Result]
def run(project:, experiment:, task:, scorers:,
cases: nil, dataset: nil,
parallelism: 1, tags: nil, metadata: nil, update: false, quiet: false,
send_logs: true, state: nil, tracer_provider: nil)
# Validate required parameters
validate_params!(project: project, experiment: experiment,
cases: cases, dataset: dataset, task: task, scorers: scorers)
@@ -223,17 +227,48 @@ def run(project:, experiment:, task:, scorers:,
cases = resolve_dataset(dataset, project, state)
end

if send_logs
run_with_server(
project: project, experiment: experiment, task: task, scorers: scorers,
cases: cases, parallelism: parallelism, tags: tags, metadata: metadata,
update: update, quiet: quiet, state: state, tracer_provider: tracer_provider
)
else
run_local(
project: project, experiment: experiment, task: task, scorers: scorers,
cases: cases, parallelism: parallelism, quiet: quiet, state: state
)
end
end

private

# Print result summary to stdout
# @param result [Result] The evaluation result
def print_result(result)
puts result.to_pretty
end

# Run evaluation with server integration (send_logs: true)
# Creates experiment, sends span data, fetches comparison summary
def run_with_server(project:, experiment:, task:, scorers:, cases:,
parallelism:, tags:, metadata:, update:, quiet:, state:, tracer_provider:)
require_relative "api"

# Register project and experiment via API
reg_result = Internal::Experiments.get_or_create(
experiment, project, state: state,
tags: tags, metadata: metadata, update: update
)

experiment_id = reg_result[:experiment_id]
project_id = reg_result[:project_id]
project_name = reg_result[:project_name]

# Generate permalink
permalink = "#{state.app_url}/app/#{state.org_name}/object?object_type=experiment&object_id=#{experiment_id}"

# Instantiate Runner and run evaluation with tracing
runner = Runner.new(
experiment_id: experiment_id,
experiment_name: experiment,
@@ -244,20 +279,166 @@ def run(project:, experiment:, task:, scorers:,
state: state,
tracer_provider: tracer_provider
)
start_time = Time.now
run_result = runner.run(cases, parallelism: parallelism)
duration = Time.now - start_time

# Flush spans before fetching comparison data from API
# This ensures the server has received all span data before we query for results
actual_tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
actual_tracer_provider.force_flush if actual_tracer_provider.respond_to?(:force_flush)

# Fetch comparison summary from API
summary = fetch_comparison_summary(
experiment_id: experiment_id,
experiment_name: experiment,
project_name: project_name,
permalink: permalink,
duration: duration,
errors: run_result.errors,
raw_scores: run_result.scores,
state: state
)

# Create result with summary
result = Result.new(
experiment_id: experiment_id,
experiment_name: experiment,
project_id: project_id,
project_name: project_name,
permalink: permalink,
errors: run_result.errors,
duration: duration,
scores: run_result.scores,
summary: summary
)

print_result(result) unless quiet
result
end

# Run evaluation locally without server (send_logs: false)
# No experiment created, no span data sent, local summary only
def run_local(project:, experiment:, task:, scorers:, cases:,
parallelism:, quiet:, state:)
# Create a no-op tracer provider that doesn't send data
no_op_tracer_provider = OpenTelemetry::SDK::Trace::TracerProvider.new

# Instantiate Runner with no-op tracer (no data sent)
runner = Runner.new(
experiment_id: nil,
experiment_name: experiment,
project_id: nil,
project_name: project,
task: task,
scorers: scorers,
state: state,
tracer_provider: no_op_tracer_provider
)

start_time = Time.now
run_result = runner.run(cases, parallelism: parallelism)
duration = Time.now - start_time

# Build local summary from raw scores
summary = ExperimentSummary.from_raw_scores(
run_result.scores,
{
project_name: project,
experiment_name: experiment,
experiment_id: nil,
experiment_url: nil,
duration: duration,
error_count: run_result.errors.length,
errors: run_result.errors
}
)

# Create result with local summary
result = Result.new(
experiment_id: nil,
experiment_name: experiment,
project_id: nil,
project_name: project,
permalink: nil,
errors: run_result.errors,
duration: duration,
scores: run_result.scores,
summary: summary
)

print_result(result) unless quiet
result
end

# Fetch the comparison summary from the API, falling back to a local summary on failure or an empty response
def fetch_comparison_summary(experiment_id:, experiment_name:, project_name:,
permalink:, duration:, errors:, raw_scores:, state:)
api = API.new(state: state)
local_metadata = {
project_name: project_name,
experiment_name: experiment_name,
experiment_id: experiment_id,
experiment_url: permalink,
duration: duration,
error_count: errors.length,
errors: errors
}

begin
api_response = api.experiments.comparison(experiment_id: experiment_id)

# If API returned empty scores, fall back to local data
if api_response["scores"].nil? || api_response["scores"].empty?
Log.debug("API returned no scores, using local summary")
return ExperimentSummary.from_raw_scores(raw_scores, local_metadata)
end

build_server_summary(api_response, local_metadata)
rescue => e
Log.warn("Failed to fetch comparison summary: #{e.message}. Falling back to local summary.")
ExperimentSummary.from_raw_scores(raw_scores, local_metadata)
end
end

# Build ExperimentSummary from API response
def build_server_summary(api_response, metadata)
# Transform API scores into ScoreSummary objects
scores = (api_response["scores"] || {}).map do |name, data|
[name, ScoreSummary.new(
name: name,
score: data["score"],
diff: data["diff"],
improvements: data["improvements"],
regressions: data["regressions"]
)]
end.to_h

# Transform API metrics into MetricSummary objects
metrics = (api_response["metrics"] || {}).map do |name, data|
[name, MetricSummary.new(
name: name,
metric: data["metric"],
unit: data["unit"] || "",
diff: data["diff"]
)]
end.to_h

# Build comparison info if present
comparison = if api_response["comparisonExperimentName"]
ComparisonInfo.new(
baseline_experiment_id: api_response["comparisonExperimentId"],
baseline_experiment_name: api_response["comparisonExperimentName"]
)
end

ExperimentSummary.new(
scores: scores,
metrics: metrics.empty? ? nil : metrics,
comparison: comparison,
**metadata
)
end

# Validate required parameters
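Finally, a hedged sketch of how the new send_logs flag might be exercised. The Braintrust::Eval.run receiver path, the case shape, and the task/scorer objects are assumptions for illustration, not part of this PR:

# Sketch only: my_task, my_scorer, and my_cases are hypothetical placeholders,
# and the Braintrust::Eval.run receiver path is inferred from the file layout.
result = Braintrust::Eval.run(
  project: "my-project",
  experiment: "local-dry-run",
  task: my_task,           # hypothetical callable under evaluation
  scorers: [my_scorer],    # hypothetical scorer(s)
  cases: my_cases,         # hypothetical list of eval cases
  send_logs: false         # run locally: no experiment created, no span data sent
)
result                     # => Result carrying a locally built ExperimentSummary

# With the default send_logs: true, run registers the experiment, streams span data,
# force-flushes the tracer, then fetches GET /experiment-comparison2 for the summary,
# falling back to a locally built ExperimentSummary if that call fails or returns no scores.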