Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
52d4f53
Add Prometheus metrics tracking for Bamboo CI API interactions
RodrigoMNardi May 26, 2026
da60465
Add Prometheus metrics for HTTP requests and durations
RodrigoMNardi May 26, 2026
49c8817
Enhance Prometheus metrics management by adding cleanup for stale met…
RodrigoMNardi May 27, 2026
dcb11a3
Add Prometheus metrics definitions for Delayed Job, CI, Puma, ActiveR…
RodrigoMNardi May 29, 2026
1681b41
Add Prometheus metrics definitions for Delayed Job, CI, Puma, ActiveR…
RodrigoMNardi May 29, 2026
19aa18e
Add Prometheus metrics tests and update database template configuration
RodrigoMNardi May 29, 2026
04523b3
Refactor Prometheus metrics tests to improve process alive checks and…
RodrigoMNardi May 29, 2026
0be8a3d
Refactor Delayed Job test setup to improve readability and maintainab…
RodrigoMNardi May 29, 2026
da42ab2
Add tests for Prometheus metrics subscription and gauge reset functio…
RodrigoMNardi May 29, 2026
d3718fa
Refactor Prometheus metrics tests to use FileUtils for file deletion …
RodrigoMNardi May 29, 2026
471ef4b
Add tests for CiJob stage handling and metric file cleanup scenarios
RodrigoMNardi May 29, 2026
c3d9621
Add tests for GitHub status update handling and error response scenarios
RodrigoMNardi May 29, 2026
a6dff44
Refactor GitHub app and CI job specs to improve test clarity and ensu…
RodrigoMNardi May 29, 2026
55a9249
Refactor GitHub app and CI job specs to improve test clarity and ensu…
RodrigoMNardi May 29, 2026
205fde6
Merge pull request #125 from RodrigoMNardi/feature/more-prometheus-info
RodrigoMNardi May 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions app/github_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,37 @@

helpers Sinatra::Payload

# Captures per-request state that the `after` filter below turns into metrics.
before do
  @_github_event = request.env['HTTP_X_GITHUB_EVENT']
  # Monotonic clock: the measured duration stays correct even if the wall clock
  # is adjusted (NTP step, manual change) while the request is in flight.
  @_request_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
end

# Records the HTTP request counter and duration for every request and, when the
# request carried an X-GitHub-Event header, a webhook event counter as well.
after do
  # Drop everything up to the first space of 'sinatra.route' (presumably the
  # HTTP verb in "VERB /pattern" - confirm against Sinatra) and keep the rest.
  # NOTE(review): the path_info fallback puts raw request paths into a label,
  # which is unbounded cardinality for unmatched URLs - consider a placeholder.
  route = request.env['sinatra.route']&.split(' ', 2)&.last || request.path_info

  PrometheusMetrics::HTTP_REQUESTS.increment(
    labels: { method: request.request_method, path: route, status: response.status.to_s }
  )

  # The start mark is missing when the `before` filter above did not run for
  # this request; skip the observation instead of raising from the filter.
  if @_request_start
    elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @_request_start
    PrometheusMetrics::HTTP_REQUEST_DURATION.observe(elapsed, labels: { path: route })
  end

  next unless @_github_event

  result = response.status < 400 ? 'processed' : 'error'
  PrometheusMetrics::GITHUB_WEBHOOK_EVENTS.increment(
    labels: { event: @_github_event, result: result }
  )
end

# Counts an unhandled StandardError, labelled with the exception class and the
# first non-empty segment of the request path ('root' when there is none), then
# answers with a generic JSON 500 body.
error StandardError do |e|
  first_segment = request.path_info.split('/').find { |segment| !segment.empty? }
  exception_labels = { class: e.class.name, handler: first_segment || 'root' }
  PrometheusMetrics::APP_EXCEPTIONS.increment(labels: exception_labels)

  content_type :json
  halt 500, { error: 'Internal Server Error' }.to_json
end

class << self
def sinatra_logger_level
GitHubApp::Configuration.instance.reload
Expand Down Expand Up @@ -177,7 +208,7 @@
logger.debug '======= POST DONE ========'
end

def slack_authentication

Check warning on line 211 in app/github_app.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Predicate method names should end with `?`. Raw Output: app/github_app.rb:211:9: C: Naming/PredicateMethod: Predicate method names should end with `?`.
netrc = Netrc.read
user, passwd = netrc['slack_bot.netdef.org']

Expand Down
1 change: 1 addition & 0 deletions config.ru
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require_relative 'app/github_app'
require_relative 'config/delayed_job'
require_relative 'lib/helpers/prometheus_metrics'

PrometheusMetrics.cleanup_stale_metric_files!
PrometheusMetrics.subscribe_query_notifications!

File.write('.session.key', SecureRandom.hex(32))
Expand Down
1 change: 1 addition & 0 deletions database_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ test:
encoding: utf8
pool: 10
timeout: 5000
reaping_frequency: ~

production:
adapter: postgresql
Expand Down
18 changes: 12 additions & 6 deletions lib/bamboo_ci/api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@ module Api
include GitHubApp::Request

def fetch_executions(plan)
get_request(URI("https://127.0.0.1/rest/api/latest/search/jobs/#{plan}"))
PrometheusMetrics.track_bamboo('fetch_executions') do
get_request(URI("https://127.0.0.1/rest/api/latest/search/jobs/#{plan}"))
end
end

def get_status(id)
get_request(URI("https://127.0.0.1/rest/api/latest/result/#{id}?expand=stages.stage.results,artifacts"))
PrometheusMetrics.track_bamboo('get_status') do
get_request(URI("https://127.0.0.1/rest/api/latest/result/#{id}?expand=stages.stage.results,artifacts"))
end
end

def submit_pr_to_ci(check_suite, plan, ci_variables)
Expand All @@ -39,8 +43,9 @@ def submit_pr_to_ci(check_suite, plan, ci_variables)

logger(Logger::DEBUG, "Submission URL:\n #{url}")

# Fetch Request
post_request(URI(url.delete(' ')))
PrometheusMetrics.track_bamboo('submit_plan') do
post_request(URI(url.delete(' ')))
end
end

def custom_variables(check_suite)
Expand All @@ -57,8 +62,9 @@ def add_comment_to_ci(key, comment)

logger(Logger::DEBUG, "Comment Submission URL:\n #{url}")

# Fetch Request
post_request(URI(url.delete(' ')), body: "<comment><content>#{comment}</content></comment>")
PrometheusMetrics.track_bamboo('add_comment') do
post_request(URI(url.delete(' ')), body: "<comment><content>#{comment}</content></comment>")
end
end

def logger(severity, message)
Expand Down
12 changes: 8 additions & 4 deletions lib/bamboo_ci/retry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,21 @@ class Retry

def self.restart(plan_key)
@logger = Logger.new($stdout)
put_request(URI("https://127.0.0.1/rest/api/latest/queue/#{plan_key}?executeAllStages=true"))
PrometheusMetrics.track_bamboo('retry_plan') do
put_request(URI("https://127.0.0.1/rest/api/latest/queue/#{plan_key}?executeAllStages=true"))
end
end

def self.rerun(plan_key)
@logger = Logger.new($stdout)
url = "https://127.0.0.1/rest/api/latest/queue/#{plan_key}?executeAllStages=true&orphanRemoval=true"
resp = put_request(URI(url))
resp = PrometheusMetrics.track_bamboo('rerun_plan') do
put_request(URI(url))
end

@logger.info "URL: #{url} -> (#{resp.code}) - #{resp.body}"
@logger.info "URL: #{url} -> (#{resp&.code}) - #{resp&.body}"

resp.body
resp&.body
end
end
end
8 changes: 6 additions & 2 deletions lib/bamboo_ci/stop_plan.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@ class StopPlan

def self.stop(job_key)
@logger = Logger.new($stdout)
delete_request(URI("https://127.0.0.1/rest/api/latest/queue/#{job_key}"))
PrometheusMetrics.track_bamboo('stop_job') do
delete_request(URI("https://127.0.0.1/rest/api/latest/queue/#{job_key}"))
end
end

def self.build(bamboo_ci_ref)
get_request(URI("https://127.0.0.1/build/admin/stopPlan.action?planResultKey=#{bamboo_ci_ref}"))
PrometheusMetrics.track_bamboo('stop_plan') do
get_request(URI("https://127.0.0.1/build/admin/stopPlan.action?planResultKey=#{bamboo_ci_ref}"))
end
end

def self.comment(check_suite, new_check_suite)
Expand Down
2 changes: 2 additions & 0 deletions lib/github/re_run/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def logger(severity, message)
def start_new_execution(check_suite, plan)
cleanup(check_suite)

PrometheusMetrics::CI_JOB_RETRIES.increment(labels: { reason: 'full' })

bamboo_plan_run = BambooCi::PlanRun.new(check_suite, plan, logger_level: @logger_level)
bamboo_plan_run.ci_variables = ci_vars
bamboo_plan_run.start_plan
Expand Down
2 changes: 2 additions & 0 deletions lib/github/retry/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def github_reaction_feedback_down(comment_id)
def normal_flow
@check_suite.update(retry: @check_suite.retry + 1)

PrometheusMetrics::CI_JOB_RETRIES.increment(labels: { reason: 'partial' })

create_ci_jobs(@check_suite)

BambooCi::Retry.restart(@check_suite.bamboo_ci_ref)
Expand Down
1 change: 1 addition & 0 deletions lib/github_ci_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# Helpers libs
require_relative 'helpers/configuration'
require_relative 'helpers/github_logger'
require_relative 'helpers/prometheus_metrics'
require_relative 'helpers/request'
require_relative 'helpers/sinatra_payload'
require_relative 'helpers/telemetry'
Expand Down
181 changes: 58 additions & 123 deletions lib/helpers/prometheus_metrics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,131 +9,44 @@
# frozen_string_literal: true

require 'prometheus/client'
require 'prometheus/client/data_stores/direct_file_store'
require 'fileutils'

# Configuration and shared constants for the application's Prometheus metrics.
# Loading this part has side effects: it creates METRICS_DIR and replaces the
# Prometheus client's global data store.
module PrometheusMetrics
# Directory given to the file-backed data store; override with PROMETHEUS_METRICS_DIR.
METRICS_DIR = ENV.fetch('PROMETHEUS_METRICS_DIR', '/tmp/prometheus_metrics')
# Runs at load time so the directory exists before the store is configured.
FileUtils.mkdir_p(METRICS_DIR)
# Runs at load time: the client now keeps metric data in files under METRICS_DIR.
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: METRICS_DIR)

# Registry the metric definitions are registered on.
REGISTRY = Prometheus::Client.registry
# NOTE(review): presumably mirrors Delayed::Worker.max_attempts - confirm.
DJ_MAX_ATTEMPTS = 5
DJ_MAX_RUN_TIME = 300 # 5 minutes, matches Delayed::Worker.max_run_time
end

require_relative 'prometheus_metrics/metrics_definitions'
require_relative 'prometheus_metrics/refresh_helpers'

module PrometheusMetrics

Check warning on line 28 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Do not define multiple classes/modules at the top level in a single file. Raw Output: lib/helpers/prometheus_metrics.rb:28:1: C: Style/OneClassPerFile: Do not define multiple classes/modules at the top level in a single file.
# Runs the given block (a Bamboo CI request) and records its outcome and duration.
#
# Errors raised by the block are swallowed on purpose so callers can treat a
# failed request as nil, but they are reported through `warn` instead of
# disappearing. Metric recording is isolated from the request itself: a failure
# while recording never discards the block's result and is never counted as a
# second, bogus 'error' request.
#
# @param operation [String] value for the :operation label (e.g. 'get_status')
# @yield performs the Bamboo request
# @return [Object, nil] the block's return value, or nil when the block raised
def self.track_bamboo(operation)
  # Monotonic clock: durations must not be skewed by wall-clock adjustments.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)

  result = begin
    yield
  rescue StandardError => e
    warn "PrometheusMetrics#track_bamboo(#{operation}) error: #{e.class}: #{e.message}"
    nil
  end
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

  begin
    # A nil result (block raised or returned nil) maps to status 'error'.
    BAMBOO_REQUESTS.increment(labels: { operation: operation, status: bamboo_response_status(result) })
    BAMBOO_DURATION.observe(elapsed, labels: { operation: operation })
  rescue StandardError => e
    warn "PrometheusMetrics#track_bamboo(#{operation}) metrics error: #{e.class}: #{e.message}"
  end

  result
end

# Runs the given block (a Slack notification) and counts it as 'sent' or 'error'.
#
# A nil result or an exception from the block counts as 'error'. Exceptions are
# swallowed on purpose (nil is returned) but reported through `warn`. A failure
# while recording the metric never discards the block's result and is never
# counted as an extra 'error' notification.
#
# @param type [String] value for the :type label
# @yield sends the notification
# @return [Object, nil] the block's return value, or nil when the block raised
def self.track_slack(type)
  result = begin
    yield
  rescue StandardError => e
    warn "PrometheusMetrics#track_slack(#{type}) error: #{e.class}: #{e.message}"
    nil
  end

  begin
    SLACK_NOTIFICATIONS.increment(labels: { type: type, status: result.nil? ? 'error' : 'sent' })
  rescue StandardError => e
    warn "PrometheusMetrics#track_slack(#{type}) metrics error: #{e.class}: #{e.message}"
  end

  result
end

# --- Delayed Job metrics ---

DJ_PENDING = REGISTRY.gauge(
:delayed_jobs_pending,
docstring: 'Delayed jobs waiting to run (run_at <= now, not locked, not failed)',
labels: [:queue]
)
DJ_RUNNING = REGISTRY.gauge(
:delayed_jobs_running,
docstring: 'Delayed jobs currently locked by a worker',
labels: [:queue]
)
DJ_SCHEDULED = REGISTRY.gauge(
:delayed_jobs_scheduled,
docstring: 'Delayed jobs scheduled to run in the future (run_at > now)',
labels: [:queue]
)
DJ_FAILED = REGISTRY.gauge(
:delayed_jobs_failed,
docstring: 'Delayed jobs that have permanently failed (failed_at IS NOT NULL)',
labels: [:queue]
)
DJ_MAX_ATTEMPTS_REACHED = REGISTRY.gauge(
:delayed_jobs_max_attempts_reached,
docstring: 'Delayed jobs that have exhausted all retry attempts',
labels: [:queue]
)
DJ_LOCKED_TOO_LONG = REGISTRY.gauge(
:delayed_jobs_locked_too_long,
docstring: 'Delayed jobs locked longer than max_run_time (5 min), indicating a stuck worker',
labels: [:queue]
)
DJ_TABLE = REGISTRY.gauge(
:delayed_jobs_table,
docstring: 'Unix timestamp of the next scheduled job per queue (0 when no job is scheduled)',
labels: %i[job_id queue job_class job_args run_at]
)

# --- CI domain metrics ---

CI_JOBS = REGISTRY.gauge(
:ci_jobs_total,
docstring: 'CI jobs grouped by status',
labels: [:status]
)
CI_STAGES = REGISTRY.gauge(
:ci_stages_total,
docstring: 'CI stages grouped by status',
labels: [:status]
)

# --- Puma metrics (cluster stats written by master process to tmp/puma_stats.json) ---

PUMA_WORKERS_TOTAL = REGISTRY.gauge(
:puma_workers_total,
docstring: 'Total number of Puma worker processes configured'
)
PUMA_BOOTED_WORKERS = REGISTRY.gauge(
:puma_booted_workers,
docstring: 'Number of Puma worker processes currently booted'
)
PUMA_BACKLOG = REGISTRY.gauge(
:puma_backlog,
docstring: 'Requests waiting for a Puma thread to become available, per worker',
labels: [:worker]
)
PUMA_RUNNING_THREADS = REGISTRY.gauge(
:puma_running_threads,
docstring: 'Threads currently processing requests, per worker',
labels: [:worker]
)
PUMA_POOL_CAPACITY = REGISTRY.gauge(
:puma_pool_capacity,
docstring: 'Threads available for new requests, per worker',
labels: [:worker]
)
PUMA_MAX_THREADS = REGISTRY.gauge(
:puma_max_threads,
docstring: 'Maximum threads configured per worker',
labels: [:worker]
)

# --- ActiveRecord connection pool metrics ---

AR_POOL_SIZE = REGISTRY.gauge(
:activerecord_connection_pool_size,
docstring: 'Maximum number of connections allowed in the ActiveRecord connection pool'
)
AR_POOL_CONNECTIONS = REGISTRY.gauge(
:activerecord_connection_pool_connections,
docstring: 'Current number of connections in the ActiveRecord connection pool'
)
AR_POOL_BUSY = REGISTRY.gauge(
:activerecord_connection_pool_busy,
docstring: 'Connections currently checked out by a thread'
)
AR_POOL_IDLE = REGISTRY.gauge(
:activerecord_connection_pool_idle,
docstring: 'Connections available for checkout'
)
AR_POOL_WAITING = REGISTRY.gauge(
:activerecord_connection_pool_waiting,
docstring: 'Threads waiting to obtain a connection from the pool'
)

# --- ActiveRecord query metrics (populated via ActiveSupport::Notifications) ---

AR_QUERIES = REGISTRY.counter(
:activerecord_queries_total,
docstring: 'Total number of SQL queries executed, by operation type and table',
labels: %i[operation table]
)
AR_QUERY_DURATION = REGISTRY.histogram(
:activerecord_query_duration_seconds,
docstring: 'Duration of SQL queries in seconds, by operation type and table',
labels: %i[operation table]
)

# Call once at startup to begin recording per-query metrics.
def self.subscribe_query_notifications!
ActiveSupport::Notifications.subscribe('sql.active_record') do |*args|
event = ActiveSupport::Notifications::Event.new(*args)
Expand All @@ -146,6 +59,15 @@
end
end

# Deletes metric files in METRICS_DIR whose file name ends in "___<pid>.bin"
# when that pid is neither the current process nor a live process.
#
# Best-effort: each file is handled on its own, so a file that cannot be
# inspected or removed is reported through `warn` and the remaining files are
# still cleaned up. Nothing is raised to the caller.
#
# @return [void]
def self.cleanup_stale_metric_files!
  Dir.glob(File.join(METRICS_DIR, 'metric_*___*.bin')).each do |path|
    begin
      # \z anchors at the true end of the name (unlike $, which also matches before a newline).
      pid = File.basename(path)[/___(\d+)\.bin\z/, 1]&.to_i
      next if pid.nil? || pid == Process.pid || process_alive?(pid)

      File.delete(path)
    rescue Errno::ENOENT
      # Already removed, e.g. by another process running the same cleanup.
      next
    rescue StandardError => e
      warn "PrometheusMetrics#cleanup_stale_metric_files! error: #{path}: #{e.message}"
    end
  end
rescue StandardError => e
  warn "PrometheusMetrics#cleanup_stale_metric_files! error: #{e.message}"
end

def self.refresh!
refresh_delayed_jobs
refresh_scheduled_jobs_detail
Expand All @@ -156,12 +78,27 @@
warn "PrometheusMetrics#refresh! error: #{e.message}"
end

# Maps the result of a Bamboo request to the :status label value.
#
# nil is an 'error'; an object without #code is a 'success'; otherwise the
# integer value of #code decides: below 400 is 'success', 400 and above 'error'.
#
# @param result [Object, nil] whatever the tracked block returned
# @return [String] 'success' or 'error'
def self.bamboo_response_status(result)
  if result.nil?
    'error'
  elsif !result.respond_to?(:code)
    'success'
  elsif result.code.to_i >= 400
    'error'
  else
    'success'
  end
end

# Probes a pid with signal 0, which checks for the process without signalling it.
#
# @param pid [Integer] process id to probe
# @return [Boolean] false only when the probe raises Errno::ESRCH; a successful
#   probe or Errno::EPERM both count as alive
def self.process_alive?(pid)
  Process.kill(0, pid)
rescue Errno::ESRCH, Errno::EPERM => e
  e.is_a?(Errno::EPERM)
else
  true
end

# Returns the upper-cased leading keyword of a SQL statement when it is one of
# SELECT, INSERT, UPDATE or DELETE; nil for anything else (including blank input).
#
# @param sql [#to_s] the SQL text
# @return [String, nil]
def self.extract_sql_operation(sql)
  keyword = sql.to_s.strip.split(/\s/, 2).first
  return if keyword.nil?

  verb = keyword.upcase
  %w[SELECT INSERT UPDATE DELETE].include?(verb) ? verb : nil
end

# Extracts the table/model name from ActiveRecord's event name (e.g. "User Load" => "users").
def self.extract_table_name(name)
return 'unknown' if name.nil? || name.empty?

Expand All @@ -171,7 +108,5 @@
model&.downcase&.gsub('::', '_') || 'unknown'
end

private_class_method :extract_sql_operation, :extract_table_name
private_class_method :bamboo_response_status, :process_alive?, :extract_sql_operation, :extract_table_name
end

require_relative 'prometheus_metrics/refresh_helpers'
Loading
Loading