Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
14 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions app/github_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,37 @@ class GithubApp < Sinatra::Base

helpers Sinatra::Payload

# Capture per-request state that the `after` filter turns into metrics.
before do
  # Value of the X-GitHub-Event request header; nil when the header is absent.
  @_github_event = request.env['HTTP_X_GITHUB_EVENT']
  # Monotonic clock so the measured duration is immune to wall-clock adjustments
  # (NTP steps, manual changes), which could otherwise yield negative or skewed values.
  @_request_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
end

# Record the HTTP request counter and latency histogram for every request and,
# when the request carried a GitHub event header, the webhook outcome counter.
after do
  # Drop everything up to the first space of 'sinatra.route' (expected to be "VERB /pattern").
  # NOTE(review): the fallback to request.path_info uses the raw request path as a label
  # value; unmatched paths can create an unbounded number of series — consider a fixed
  # placeholder.
  route = request.env['sinatra.route']&.split(' ', 2)&.last || request.path_info
  elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - @_request_start

  PrometheusMetrics::HTTP_REQUESTS.increment(
    labels: { method: request.request_method, path: route, status: response.status.to_s }
  )
  PrometheusMetrics::HTTP_REQUEST_DURATION.observe(elapsed, labels: { path: route })

  # Only requests carrying the GitHub event header count as webhook deliveries.
  next unless @_github_event

  # NOTE(review): the event label comes straight from a client-supplied header; confirm it
  # cannot be abused to inflate label cardinality.
  result = response.status < 400 ? 'processed' : 'error'
  PrometheusMetrics::GITHUB_WEBHOOK_EVENTS.increment(
    labels: { event: @_github_event, result: result }
  )
end

# Count every unhandled StandardError, labelled with the exception class and the first
# non-empty segment of the request path ('root' when there is none), then answer with a
# generic JSON 500 body.
error StandardError do |exception|
  first_segment = request.path_info.split('/').find { |segment| !segment.empty? }
  labels = { class: exception.class.name, handler: first_segment || 'root' }
  PrometheusMetrics::APP_EXCEPTIONS.increment(labels: labels)

  content_type :json
  halt 500, { error: 'Internal Server Error' }.to_json
end

class << self
def sinatra_logger_level
GitHubApp::Configuration.instance.reload
Expand Down
1 change: 1 addition & 0 deletions config.ru
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ require_relative 'app/github_app'
require_relative 'config/delayed_job'
require_relative 'lib/helpers/prometheus_metrics'

# Delete metric files whose owning PID is no longer alive before serving requests.
PrometheusMetrics.cleanup_stale_metric_files!
# Subscribe to 'sql.active_record' notifications so per-query metrics are recorded.
PrometheusMetrics.subscribe_query_notifications!

# Write a freshly generated random value (64 hex characters) to .session.key on every boot.
# NOTE(review): the file is created with default permissions — confirm that is acceptable.
File.write('.session.key', SecureRandom.hex(32))
Expand Down
1 change: 1 addition & 0 deletions database_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ test:
encoding: utf8
pool: 10
timeout: 5000
reaping_frequency: ~

production:
adapter: postgresql
Expand Down
18 changes: 12 additions & 6 deletions lib/bamboo_ci/api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,15 @@ module Api
include GitHubApp::Request

# Searches Bamboo for the jobs of +plan+.
#
# The HTTP call is issued exactly once, inside PrometheusMetrics.track_bamboo, so it is
# counted and timed under the 'fetch_executions' operation.
#
# @param plan [String] Bamboo plan key interpolated into the search URL
# @return [Object, nil] whatever track_bamboo returns for the wrapped get_request call
def fetch_executions(plan)
  PrometheusMetrics.track_bamboo('fetch_executions') do
    get_request(URI("https://127.0.0.1/rest/api/latest/search/jobs/#{plan}"))
  end
end

# Fetches the Bamboo result identified by +id+, expanding stage results and artifacts.
#
# The HTTP call is issued exactly once, inside PrometheusMetrics.track_bamboo, so it is
# counted and timed under the 'get_status' operation.
#
# @param id [String] Bamboo result key interpolated into the result URL
# @return [Object, nil] whatever track_bamboo returns for the wrapped get_request call
def get_status(id)
  PrometheusMetrics.track_bamboo('get_status') do
    get_request(URI("https://127.0.0.1/rest/api/latest/result/#{id}?expand=stages.stage.results,artifacts"))
  end
end

def submit_pr_to_ci(check_suite, plan, ci_variables)
Expand All @@ -39,8 +43,9 @@ def submit_pr_to_ci(check_suite, plan, ci_variables)

logger(Logger::DEBUG, "Submission URL:\n #{url}")

# Fetch Request
post_request(URI(url.delete(' ')))
PrometheusMetrics.track_bamboo('submit_plan') do
post_request(URI(url.delete(' ')))
end
end

def custom_variables(check_suite)
Expand All @@ -57,8 +62,9 @@ def add_comment_to_ci(key, comment)

logger(Logger::DEBUG, "Comment Submission URL:\n #{url}")

# Fetch Request
post_request(URI(url.delete(' ')), body: "<comment><content>#{comment}</content></comment>")
PrometheusMetrics.track_bamboo('add_comment') do
post_request(URI(url.delete(' ')), body: "<comment><content>#{comment}</content></comment>")
end
end

def logger(severity, message)
Expand Down
12 changes: 8 additions & 4 deletions lib/bamboo_ci/retry.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,21 @@ class Retry

# Queues +plan_key+ again in Bamboo with all stages executed.
#
# The PUT is sent exactly once, inside PrometheusMetrics.track_bamboo ('retry_plan'); sending
# it a second time outside the tracker would queue the plan twice.
#
# @param plan_key [String] Bamboo plan key interpolated into the queue URL
# @return [Object, nil] whatever track_bamboo returns for the wrapped put_request call
def self.restart(plan_key)
  @logger = Logger.new($stdout)
  PrometheusMetrics.track_bamboo('retry_plan') do
    put_request(URI("https://127.0.0.1/rest/api/latest/queue/#{plan_key}?executeAllStages=true"))
  end
end

# Queues +plan_key+ again in Bamboo with all stages executed and orphan removal enabled.
#
# The PUT is sent exactly once, inside PrometheusMetrics.track_bamboo ('rerun_plan').
# track_bamboo may return nil instead of a response, so the response is only accessed
# through safe navigation.
#
# @param plan_key [String] Bamboo plan key interpolated into the queue URL
# @return [String, nil] the response body, or nil when no response was obtained
def self.rerun(plan_key)
  @logger = Logger.new($stdout)
  url = "https://127.0.0.1/rest/api/latest/queue/#{plan_key}?executeAllStages=true&orphanRemoval=true"
  resp = PrometheusMetrics.track_bamboo('rerun_plan') do
    put_request(URI(url))
  end

  @logger.info "URL: #{url} -> (#{resp&.code}) - #{resp&.body}"

  resp&.body
end
end
end
8 changes: 6 additions & 2 deletions lib/bamboo_ci/stop_plan.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@ class StopPlan

# Removes +job_key+ from the Bamboo queue.
#
# The DELETE is sent exactly once, inside PrometheusMetrics.track_bamboo ('stop_job').
#
# @param job_key [String] Bamboo job key interpolated into the queue URL
# @return [Object, nil] whatever track_bamboo returns for the wrapped delete_request call
def self.stop(job_key)
  @logger = Logger.new($stdout)
  PrometheusMetrics.track_bamboo('stop_job') do
    delete_request(URI("https://127.0.0.1/rest/api/latest/queue/#{job_key}"))
  end
end

# Calls Bamboo's stopPlan action for the plan result +bamboo_ci_ref+.
#
# The GET is sent exactly once, inside PrometheusMetrics.track_bamboo ('stop_plan').
#
# @param bamboo_ci_ref [String] plan result key passed as the planResultKey query parameter
# @return [Object, nil] whatever track_bamboo returns for the wrapped get_request call
def self.build(bamboo_ci_ref)
  PrometheusMetrics.track_bamboo('stop_plan') do
    get_request(URI("https://127.0.0.1/build/admin/stopPlan.action?planResultKey=#{bamboo_ci_ref}"))
  end
end

def self.comment(check_suite, new_check_suite)
Expand Down
2 changes: 2 additions & 0 deletions lib/github/re_run/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def logger(severity, message)
def start_new_execution(check_suite, plan)
cleanup(check_suite)

PrometheusMetrics::CI_JOB_RETRIES.increment(labels: { reason: 'full' })

bamboo_plan_run = BambooCi::PlanRun.new(check_suite, plan, logger_level: @logger_level)
bamboo_plan_run.ci_variables = ci_vars
bamboo_plan_run.start_plan
Expand Down
2 changes: 2 additions & 0 deletions lib/github/retry/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ def github_reaction_feedback_down(comment_id)
def normal_flow
@check_suite.update(retry: @check_suite.retry + 1)

PrometheusMetrics::CI_JOB_RETRIES.increment(labels: { reason: 'partial' })

create_ci_jobs(@check_suite)

BambooCi::Retry.restart(@check_suite.bamboo_ci_ref)
Expand Down
1 change: 1 addition & 0 deletions lib/github_ci_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# Helpers libs
require_relative 'helpers/configuration'
require_relative 'helpers/github_logger'
require_relative 'helpers/prometheus_metrics'
require_relative 'helpers/request'
require_relative 'helpers/sinatra_payload'
require_relative 'helpers/telemetry'
Expand Down
181 changes: 58 additions & 123 deletions lib/helpers/prometheus_metrics.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,131 +9,44 @@
# frozen_string_literal: true

require 'prometheus/client'
require 'prometheus/client/data_stores/direct_file_store'
require 'fileutils'

# Load-time Prometheus client configuration plus constants shared by the metric helpers.
module PrometheusMetrics
# Directory for the file-backed metric store; overridable via PROMETHEUS_METRICS_DIR.
METRICS_DIR = ENV.fetch('PROMETHEUS_METRICS_DIR', '/tmp/prometheus_metrics')
# Make sure the directory exists before the store is pointed at it.
FileUtils.mkdir_p(METRICS_DIR)
# NOTE(review): presumably this must run before any metric is registered (the metric
# definitions are required right after this module) — confirm against the client docs.
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: METRICS_DIR)

# Default client registry.
REGISTRY = Prometheus::Client.registry
# NOTE(review): presumably mirrors the Delayed Job max-attempts setting — confirm.
DJ_MAX_ATTEMPTS = 5
DJ_MAX_RUN_TIME = 300 # 5 minutes, matches Delayed::Worker.max_run_time
end

require_relative 'prometheus_metrics/metrics_definitions'
require_relative 'prometheus_metrics/refresh_helpers'

module PrometheusMetrics

Check warning on line 28 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Do not define multiple classes/modules at the top level in a single file. Raw Output: lib/helpers/prometheus_metrics.rb:28:1: C: Style/OneClassPerFile: Do not define multiple classes/modules at the top level in a single file.
# Runs the given block (a Bamboo API call) and records its outcome and duration.
#
# BAMBOO_REQUESTS is incremented with the status derived by bamboo_response_status, and
# BAMBOO_DURATION observes the elapsed seconds. Any StandardError is swallowed, counted
# with status 'error', and turned into a nil return value, so callers must be prepared
# for nil.
#
# @param operation [String] label identifying the Bamboo operation
# @yield the Bamboo request to execute
# @return [Object, nil] the block's result, or nil when a StandardError was raised
def self.track_bamboo(operation)
  # Monotonic clock: wall-clock time can jump (NTP, manual changes) and distort durations.
  started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  result = yield
  BAMBOO_REQUESTS.increment(labels: { operation: operation, status: bamboo_response_status(result) })
  BAMBOO_DURATION.observe(
    Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at,
    labels: { operation: operation }
  )
  result
rescue StandardError
  BAMBOO_REQUESTS.increment(labels: { operation: operation, status: 'error' })
  BAMBOO_DURATION.observe(
    Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at,
    labels: { operation: operation }
  )
  nil
end

# Runs the given block (a Slack notification) and counts its outcome.
#
# A nil block result or any StandardError is counted as 'error'; anything else as 'sent'.
#
# @param type [String] label identifying the kind of notification
# @yield the notification to send
# @return [Object, nil] the block's result, or nil when a StandardError was raised
def self.track_slack(type)
  outcome = yield
  status = outcome.nil? ? 'error' : 'sent'
  SLACK_NOTIFICATIONS.increment(labels: { type: type, status: status })
  outcome
rescue StandardError
  SLACK_NOTIFICATIONS.increment(labels: { type: type, status: 'error' })
  nil
end

# --- Delayed Job metrics ---

DJ_PENDING = REGISTRY.gauge(
:delayed_jobs_pending,
docstring: 'Delayed jobs waiting to run (run_at <= now, not locked, not failed)',
labels: [:queue]
)
DJ_RUNNING = REGISTRY.gauge(
:delayed_jobs_running,
docstring: 'Delayed jobs currently locked by a worker',
labels: [:queue]
)
DJ_SCHEDULED = REGISTRY.gauge(
:delayed_jobs_scheduled,
docstring: 'Delayed jobs scheduled to run in the future (run_at > now)',
labels: [:queue]
)
DJ_FAILED = REGISTRY.gauge(
:delayed_jobs_failed,
docstring: 'Delayed jobs that have permanently failed (failed_at IS NOT NULL)',
labels: [:queue]
)
DJ_MAX_ATTEMPTS_REACHED = REGISTRY.gauge(
:delayed_jobs_max_attempts_reached,
docstring: 'Delayed jobs that have exhausted all retry attempts',
labels: [:queue]
)
DJ_LOCKED_TOO_LONG = REGISTRY.gauge(
:delayed_jobs_locked_too_long,
docstring: 'Delayed jobs locked longer than max_run_time (5 min), indicating a stuck worker',
labels: [:queue]
)
DJ_TABLE = REGISTRY.gauge(
:delayed_jobs_table,
docstring: 'Unix timestamp of the next scheduled job per queue (0 when no job is scheduled)',
labels: %i[job_id queue job_class job_args run_at]
)

# --- CI domain metrics ---

CI_JOBS = REGISTRY.gauge(
:ci_jobs_total,
docstring: 'CI jobs grouped by status',
labels: [:status]
)
CI_STAGES = REGISTRY.gauge(
:ci_stages_total,
docstring: 'CI stages grouped by status',
labels: [:status]
)

# --- Puma metrics (cluster stats written by master process to tmp/puma_stats.json) ---

PUMA_WORKERS_TOTAL = REGISTRY.gauge(
:puma_workers_total,
docstring: 'Total number of Puma worker processes configured'
)
PUMA_BOOTED_WORKERS = REGISTRY.gauge(
:puma_booted_workers,
docstring: 'Number of Puma worker processes currently booted'
)
PUMA_BACKLOG = REGISTRY.gauge(
:puma_backlog,
docstring: 'Requests waiting for a Puma thread to become available, per worker',
labels: [:worker]
)
PUMA_RUNNING_THREADS = REGISTRY.gauge(
:puma_running_threads,
docstring: 'Threads currently processing requests, per worker',
labels: [:worker]
)
PUMA_POOL_CAPACITY = REGISTRY.gauge(
:puma_pool_capacity,
docstring: 'Threads available for new requests, per worker',
labels: [:worker]
)
PUMA_MAX_THREADS = REGISTRY.gauge(
:puma_max_threads,
docstring: 'Maximum threads configured per worker',
labels: [:worker]
)

# --- ActiveRecord connection pool metrics ---

AR_POOL_SIZE = REGISTRY.gauge(
:activerecord_connection_pool_size,
docstring: 'Maximum number of connections allowed in the ActiveRecord connection pool'
)
AR_POOL_CONNECTIONS = REGISTRY.gauge(
:activerecord_connection_pool_connections,
docstring: 'Current number of connections in the ActiveRecord connection pool'
)
AR_POOL_BUSY = REGISTRY.gauge(
:activerecord_connection_pool_busy,
docstring: 'Connections currently checked out by a thread'
)
AR_POOL_IDLE = REGISTRY.gauge(
:activerecord_connection_pool_idle,
docstring: 'Connections available for checkout'
)
AR_POOL_WAITING = REGISTRY.gauge(
:activerecord_connection_pool_waiting,
docstring: 'Threads waiting to obtain a connection from the pool'
)

# --- ActiveRecord query metrics (populated via ActiveSupport::Notifications) ---

AR_QUERIES = REGISTRY.counter(
:activerecord_queries_total,
docstring: 'Total number of SQL queries executed, by operation type and table',
labels: %i[operation table]
)
AR_QUERY_DURATION = REGISTRY.histogram(
:activerecord_query_duration_seconds,
docstring: 'Duration of SQL queries in seconds, by operation type and table',
labels: %i[operation table]
)

# Call once at startup to begin recording per-query metrics.
def self.subscribe_query_notifications!
ActiveSupport::Notifications.subscribe('sql.active_record') do |*args|
event = ActiveSupport::Notifications::Event.new(*args)
Expand All @@ -146,6 +59,15 @@
end
end

# Deletes metric files in METRICS_DIR that belong to processes which no longer exist.
#
# Candidate files match `metric_*___<pid>.bin`. A file is removed only when its PID differs
# from the current process and process_alive? reports the owner as gone. This is
# best-effort: problems are reported with `warn` and never raised. A failure to delete one
# file (for example because another process removed it first) no longer stops the
# remaining files from being cleaned up.
#
# @return [Object] not meaningful; callers use this for its side effect only
def self.cleanup_stale_metric_files!
  Dir.glob(File.join(METRICS_DIR, 'metric_*___*.bin')).each do |path|
    pid = File.basename(path)[/___(\d+)\.bin$/, 1]&.to_i
    next if pid.nil? || pid == Process.pid || process_alive?(pid)

    begin
      File.delete(path)
    rescue SystemCallError => e
      # Keep going: one undeletable file must not block cleanup of the others.
      warn "PrometheusMetrics#cleanup_stale_metric_files! error: #{e.message}"
    end
  end
rescue StandardError => e
  warn "PrometheusMetrics#cleanup_stale_metric_files! error: #{e.message}"
end

def self.refresh!
refresh_delayed_jobs
refresh_scheduled_jobs_detail
Expand All @@ -156,12 +78,27 @@
warn "PrometheusMetrics#refresh! error: #{e.message}"
end

# Maps the result of a Bamboo call to a metric status label.
#
# nil is an error; objects without a `code` method count as success; otherwise the numeric
# value of `code` decides (below 400 is success).
#
# @param result [Object, nil] value returned by the Bamboo request block
# @return [String] 'success' or 'error'
def self.bamboo_response_status(result)
  if result.nil?
    'error'
  elsif !result.respond_to?(:code) || result.code.to_i < 400
    'success'
  else
    'error'
  end
end

# Reports whether a process with the given PID exists, probing it with signal 0.
#
# @param pid [Integer] process id to probe
# @return [Boolean] false only when the OS reports no such process (ESRCH); a permission
#   error (EPERM) is treated as "alive"
def self.process_alive?(pid)
  begin
    Process.kill(0, pid)
  rescue Errno::EPERM
    return true
  rescue Errno::ESRCH
    return false
  end
  true
end

# Returns the upper-cased leading keyword of +sql+ when it is one of
# SELECT / INSERT / UPDATE / DELETE.
#
# @param sql [#to_s] SQL statement text
# @return [String, nil] the operation name, or nil for any other statement or blank input
def self.extract_sql_operation(sql)
  leading_word, = sql.to_s.strip.split(/\s/, 2)
  return if leading_word.nil?

  keyword = leading_word.upcase
  case keyword
  when 'SELECT', 'INSERT', 'UPDATE', 'DELETE' then keyword
  end
end

# Extracts the table/model name from ActiveRecord's event name (e.g. "User Load" => "users").
def self.extract_table_name(name)
return 'unknown' if name.nil? || name.empty?

Expand All @@ -171,7 +108,5 @@
model&.downcase&.gsub('::', '_') || 'unknown'
end

private_class_method :extract_sql_operation, :extract_table_name
private_class_method :bamboo_response_status, :process_alive?, :extract_sql_operation, :extract_table_name
end

require_relative 'prometheus_metrics/refresh_helpers'
Loading