Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ gem 'rake'
# Delayed Job
gem 'delayed_job_active_record'

# Prometheus metrics
gem 'prometheus-client'

# Code lint
gem 'rubocop', '1.56.1', group: %i[development test]
gem 'rubocop-performance', group: %i[development test]
Expand Down
3 changes: 3 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ GEM
ast (~> 2.4.1)
racc
pg (1.5.7)
prometheus-client (4.2.5)
base64
public_suffix (6.0.1)
puma (6.4.2)
nio4r (~> 2.0)
Expand Down Expand Up @@ -181,6 +183,7 @@ DEPENDENCIES
octokit (~> 9.1)
otr-activerecord (~> 2.3)
pg (~> 1.5, >= 1.5.3)
prometheus-client
puma
rack (= 3.1.7)
rack-test
Expand Down
8 changes: 8 additions & 0 deletions app/github_app.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
require 'json'
require 'sinatra'
require 'octokit'
require 'prometheus/client/formats/text'
require 'netrc'
require 'date'
require 'yaml'
Expand All @@ -38,6 +39,12 @@ def sinatra_logger_level
attr_writer :sinatra_logger_level
end

get '/metrics' do
PrometheusMetrics.refresh!
content_type 'text/plain; version=0.0.4; charset=utf-8'
Prometheus::Client::Formats::Text.marshal(PrometheusMetrics::REGISTRY)
end

get '/telemetry' do
content_type :json

Expand Down Expand Up @@ -121,6 +128,7 @@ def sinatra_logger_level

halt 200, 'PONG!'
when 'pull_request'
logger.info 'Creating new action'
build_plan = Github::BuildPlan.new(payload, logger_level: GithubApp.sinatra_logger_level)
resp = build_plan.create

Expand Down
8 changes: 8 additions & 0 deletions config.ru
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,20 @@ require 'rackup'
require 'rack/handler/puma'
require 'rack/session/cookie'
require 'securerandom'
require 'prometheus/middleware/collector'
require 'prometheus/middleware/exporter'

require_relative 'app/github_app'
require_relative 'config/delayed_job'
require_relative 'lib/helpers/prometheus_metrics'

PrometheusMetrics.subscribe_query_notifications!

File.write('.session.key', SecureRandom.hex(32))

# Refresh delayed_job + CI domain gauges on every Prometheus scrape
use Prometheus::Middleware::Collector, metrics_prefix: 'http'

pids = []
pids << spawn("RACK_ENV=#{ENV.fetch('RACK_ENV', 'development')} rake jobs:work QUEUES=0,1,2,3")
pids << spawn("RACK_ENV=#{ENV.fetch('RACK_ENV', 'development')} rake jobs:work QUEUES=4,5,6")
Expand Down
1 change: 1 addition & 0 deletions config/puma.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
Thread.new do
loop do
Telemetry.instance.update_stats Puma.stats
File.write('tmp/puma_stats.json', Puma.stats_hash.to_json)
sleep 30
end
end
Expand Down
300 changes: 300 additions & 0 deletions lib/helpers/prometheus_metrics.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
# SPDX-License-Identifier: BSD-2-Clause
#
# prometheus_metrics.rb
# Part of NetDEF CI System
#
# Copyright (c) 2026 by
# Network Device Education Foundation, Inc. ("NetDEF")
#
# frozen_string_literal: true

require 'prometheus/client'

module PrometheusMetrics

Check warning on line 13 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Module has too many lines. [235/200] Raw Output: lib/helpers/prometheus_metrics.rb:13:1: C: Metrics/ModuleLength: Module has too many lines. [235/200]
REGISTRY = Prometheus::Client.registry
DJ_MAX_ATTEMPTS = 5
DJ_MAX_RUN_TIME = 300 # 5 minutes, matches Delayed::Worker.max_run_time

# --- Delayed Job metrics ---

DJ_PENDING = REGISTRY.gauge(
:delayed_jobs_pending,
docstring: 'Delayed jobs waiting to run (run_at <= now, not locked, not failed)',
labels: [:queue]
)
DJ_RUNNING = REGISTRY.gauge(
:delayed_jobs_running,
docstring: 'Delayed jobs currently locked by a worker',
labels: [:queue]
)
DJ_SCHEDULED = REGISTRY.gauge(
:delayed_jobs_scheduled,
docstring: 'Delayed jobs scheduled to run in the future (run_at > now)',
labels: [:queue]
)
DJ_FAILED = REGISTRY.gauge(
:delayed_jobs_failed,
docstring: 'Delayed jobs that have permanently failed (failed_at IS NOT NULL)',
labels: [:queue]
)
DJ_MAX_ATTEMPTS_REACHED = REGISTRY.gauge(
:delayed_jobs_max_attempts_reached,
docstring: 'Delayed jobs that have exhausted all retry attempts',
labels: [:queue]
)
DJ_LOCKED_TOO_LONG = REGISTRY.gauge(
:delayed_jobs_locked_too_long,
docstring: 'Delayed jobs locked longer than max_run_time (5 min), indicating a stuck worker',
labels: [:queue]
)
DJ_TABLE = REGISTRY.gauge(
:delayed_jobs_table,
docstring: 'Unix timestamp of the next scheduled job per queue (0 when no job is scheduled)',
labels: %i[job_id queue job_class job_args run_at]
)

# --- CI domain metrics ---

CI_JOBS = REGISTRY.gauge(
:ci_jobs_total,
docstring: 'CI jobs grouped by status',
labels: [:status]
)
CI_STAGES = REGISTRY.gauge(
:ci_stages_total,
docstring: 'CI stages grouped by status',
labels: [:status]
)

# --- Puma metrics (cluster stats written by master process to tmp/puma_stats.json) ---

PUMA_WORKERS_TOTAL = REGISTRY.gauge(
:puma_workers_total,
docstring: 'Total number of Puma worker processes configured'
)
PUMA_BOOTED_WORKERS = REGISTRY.gauge(
:puma_booted_workers,
docstring: 'Number of Puma worker processes currently booted'
)
PUMA_BACKLOG = REGISTRY.gauge(
:puma_backlog,
docstring: 'Requests waiting for a Puma thread to become available, per worker',
labels: [:worker]
)
PUMA_RUNNING_THREADS = REGISTRY.gauge(
:puma_running_threads,
docstring: 'Threads currently processing requests, per worker',
labels: [:worker]
)
PUMA_POOL_CAPACITY = REGISTRY.gauge(
:puma_pool_capacity,
docstring: 'Threads available for new requests, per worker',
labels: [:worker]
)
PUMA_MAX_THREADS = REGISTRY.gauge(
:puma_max_threads,
docstring: 'Maximum threads configured per worker',
labels: [:worker]
)

# --- ActiveRecord connection pool metrics ---

AR_POOL_SIZE = REGISTRY.gauge(
:activerecord_connection_pool_size,
docstring: 'Maximum number of connections allowed in the ActiveRecord connection pool'
)
AR_POOL_CONNECTIONS = REGISTRY.gauge(
:activerecord_connection_pool_connections,
docstring: 'Current number of connections in the ActiveRecord connection pool'
)
AR_POOL_BUSY = REGISTRY.gauge(
:activerecord_connection_pool_busy,
docstring: 'Connections currently checked out by a thread'
)
AR_POOL_IDLE = REGISTRY.gauge(
:activerecord_connection_pool_idle,
docstring: 'Connections available for checkout'
)
AR_POOL_WAITING = REGISTRY.gauge(
:activerecord_connection_pool_waiting,
docstring: 'Threads waiting to obtain a connection from the pool'
)

# --- ActiveRecord query metrics (populated via ActiveSupport::Notifications) ---

AR_QUERIES = REGISTRY.counter(
:activerecord_queries_total,
docstring: 'Total number of SQL queries executed, by operation type and table',
labels: %i[operation table]
)
AR_QUERY_DURATION = REGISTRY.histogram(
:activerecord_query_duration_seconds,
docstring: 'Duration of SQL queries in seconds, by operation type and table',
labels: %i[operation table]
)

# Call once at startup to begin recording per-query metrics.
def self.subscribe_query_notifications!
ActiveSupport::Notifications.subscribe('sql.active_record') do |*args|
event = ActiveSupport::Notifications::Event.new(*args)
operation = extract_sql_operation(event.payload[:sql])
next if operation.nil?

labels = { operation: operation, table: extract_table_name(event.payload[:name]) }
AR_QUERIES.increment(labels: labels)
AR_QUERY_DURATION.observe(event.duration / 1000.0, labels: labels)
end
end

def self.refresh!
refresh_delayed_jobs
refresh_scheduled_jobs_detail
refresh_ci_domain
refresh_connection_pool
refresh_puma
rescue StandardError => e
warn "PrometheusMetrics#refresh! error: #{e.message}"
end

def self.refresh_delayed_jobs

Check warning on line 159 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Method has too many lines. [28/20] (https://rubystyle.guide#short-methods) Raw Output: lib/helpers/prometheus_metrics.rb:159:3: C: Metrics/MethodLength: Method has too many lines. [28/20] (https://rubystyle.guide#short-methods)

Check warning on line 159 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Assignment Branch Condition size for `refresh_delayed_jobs` is too high. [<13, 56, 3> 57.57/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric) Raw Output: lib/helpers/prometheus_metrics.rb:159:3: C: Metrics/AbcSize: Assignment Branch Condition size for `refresh_delayed_jobs` is too high. [<13, 56, 3> 57.57/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric)
[DJ_PENDING, DJ_RUNNING, DJ_SCHEDULED, DJ_FAILED, DJ_MAX_ATTEMPTS_REACHED, DJ_LOCKED_TOO_LONG].each do |gauge|
gauge.values.each_key { |labels| gauge.set(0, labels: labels) }
end

now = Time.now
stuck_threshold = now - DJ_MAX_RUN_TIME

pending = Delayed::Job
.where('run_at <= ? AND locked_at IS NULL AND failed_at IS NULL', now).group(:queue).count

running = Delayed::Job
.where('locked_at IS NOT NULL AND failed_at IS NULL').group(:queue).count

scheduled = Delayed::Job
.where('run_at > ? AND locked_at IS NULL AND failed_at IS NULL', now).group(:queue).count

failed = Delayed::Job
.where('failed_at IS NOT NULL').group(:queue).count

max_att = Delayed::Job
.where('attempts >= ? AND failed_at IS NULL', DJ_MAX_ATTEMPTS).group(:queue).count

stuck = Delayed::Job
.where('locked_at IS NOT NULL AND locked_at < ? AND failed_at IS NULL', stuck_threshold)
.group(:queue).count

all_queues = (pending.keys + running.keys + scheduled.keys + failed.keys + max_att.keys + stuck.keys).uniq

all_queues.each do |queue|
q = { queue: queue.to_s }
DJ_PENDING.set(pending[queue].to_i, labels: q)
DJ_RUNNING.set(running[queue].to_i, labels: q)
DJ_SCHEDULED.set(scheduled[queue].to_i, labels: q)
DJ_FAILED.set(failed[queue].to_i, labels: q)
DJ_MAX_ATTEMPTS_REACHED.set(max_att[queue].to_i, labels: q)
DJ_LOCKED_TOO_LONG.set(stuck[queue].to_i, labels: q)
end
end

def self.refresh_ci_domain
CiJob.unscoped.group(:status).count.each do |status, count|
CI_JOBS.set(count, labels: { status: status.to_s })
end

Stage.unscoped.group(:status).count.each do |status, count|
CI_STAGES.set(count, labels: { status: status.to_s })
end
end

def self.refresh_connection_pool
stat = ActiveRecord::Base.connection_pool.stat
AR_POOL_SIZE.set(stat[:size])
AR_POOL_CONNECTIONS.set(stat[:connections])
AR_POOL_BUSY.set(stat[:busy])
AR_POOL_IDLE.set(stat[:idle])
AR_POOL_WAITING.set(stat[:waiting])
end

def self.refresh_puma

Check warning on line 218 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Assignment Branch Condition size for `refresh_puma` is too high. [<4, 26, 3> 26.48/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric) Raw Output: lib/helpers/prometheus_metrics.rb:218:3: C: Metrics/AbcSize: Assignment Branch Condition size for `refresh_puma` is too high. [<4, 26, 3> 26.48/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric)
stats = JSON.parse(File.read('tmp/puma_stats.json'), symbolize_names: true)
return unless stats.key?(:worker_status)

PUMA_WORKERS_TOTAL.set(stats[:workers].to_i)
PUMA_BOOTED_WORKERS.set(stats[:booted_workers].to_i)

stats[:worker_status].each do |w|
s = w[:last_status]
labels = { worker: w[:index].to_s }
PUMA_BACKLOG.set(s[:backlog].to_i, labels: labels)
PUMA_RUNNING_THREADS.set(s[:running].to_i, labels: labels)
PUMA_POOL_CAPACITY.set(s[:pool_capacity].to_i, labels: labels)
PUMA_MAX_THREADS.set(s[:max_threads].to_i, labels: labels)
end
rescue Errno::ENOENT
# tmp/puma_stats.json not written yet (first 30s after boot)
end

def self.refresh_scheduled_jobs_detail

Check warning on line 237 in lib/helpers/prometheus_metrics.rb

View workflow job for this annotation

GitHub Actions / Rubocop

[rubocop] reported by reviewdog 🐶 Assignment Branch Condition size for `refresh_scheduled_jobs_detail` is too high. [<6, 17, 2> 18.14/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric) Raw Output: lib/helpers/prometheus_metrics.rb:237:3: C: Metrics/AbcSize: Assignment Branch Condition size for `refresh_scheduled_jobs_detail` is too high. [<6, 17, 2> 18.14/17] (https://wiki.c2.com/?AbcMetric, https://en.wikipedia.org/wiki/ABC_Software_Metric)
now = Time.now

DJ_TABLE.values.each_key { |labels| DJ_TABLE.set(0, labels: labels) }

Delayed::Job
.where('run_at > ? AND locked_at IS NULL AND failed_at IS NULL', now)
.select(:id, :queue, :handler, :run_at)
.each do |job|
job_class, job_args = parse_dj_handler(job.handler)
labels =
{
job_id: job.id.to_s,
queue: job.queue.to_s,
job_class: job_class,
job_args: job_args,
run_at: job.run_at
}
DJ_TABLE.set(job.run_at.to_i, labels: labels)
end
end

# Parses a Delayed::PerformableMethod YAML handler without loading arbitrary Ruby objects.
# Returns [class::method string, truncated args string].
def self.parse_dj_handler(handler)
return ['Unknown', ''] unless handler

obj_class = extract_dj_class(handler)
method_name = handler[/method_name: :(\S+)/, 1] || ''
raw_args = handler[/^args:\n(.*?)(?=\n\S|\z)/m, 1].to_s.strip
args_str = raw_args.gsub("\n", ', ').squeeze(' ')
args_str = "#{args_str[0, 77]}..." if args_str.length > 80

["#{obj_class}##{method_name}", args_str]
rescue StandardError
['Unknown', '']
end

def self.extract_dj_class(handler)
handler[%r{object: !ruby/class '([^']+)'}, 1] ||
handler[%r{object: !ruby/object:(\S+)}, 1] ||
handler[%r{!ruby/object:(\S+)}, 1] ||
'Unknown'
end

def self.extract_sql_operation(sql)
op = sql.to_s.strip.split(/\s/, 2).first&.upcase
op if %w[SELECT INSERT UPDATE DELETE].include?(op)
end

# Extracts the table/model name from ActiveRecord's event name (e.g. "User Load" => "users").
def self.extract_table_name(name)
return 'unknown' if name.nil? || name.empty?

model = name.to_s.split.first
return 'other' if %w[SCHEMA EXPLAIN TRANSACTION].include?(model&.upcase)

model&.downcase&.gsub('::', '_') || 'unknown'
end

private_class_method :refresh_delayed_jobs, :refresh_scheduled_jobs_detail, :refresh_ci_domain,
:refresh_connection_pool, :refresh_puma, :extract_sql_operation,
:extract_table_name, :parse_dj_handler, :extract_dj_class
end
2 changes: 1 addition & 1 deletion lib/models/audit_status.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
require 'otr-activerecord'

class AuditStatus < ActiveRecord::Base
enum status: { queued: 0, in_progress: 1, success: 2, refresh: 3, cancelled: -1, failure: -2, skipped: -3 }
enum :status, { queued: 0, in_progress: 1, success: 2, refresh: 3, cancelled: -1, failure: -2, skipped: -3 }

belongs_to :auditable, polymorphic: true
end
Loading
Loading