Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

module VCAP
module CloudController
class MetricsWebserver
class ApiMetricsWebserver
attr_reader :app

def initialize
Expand Down
7 changes: 7 additions & 0 deletions lib/cloud_controller/clock/scheduler.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
require 'clockwork'
require 'cloud_controller/clock/clock'
require 'cloud_controller/clock/job_timeout_calculator'
require 'cloud_controller/standalone_metrics_webserver'

module VCAP::CloudController
class Scheduler
Expand Down Expand Up @@ -35,6 +36,12 @@ def initialize(config)
end

def start
if @config.get(:publish_metrics) || false
StandaloneMetricsWebserver.start_for_bosh_job(@config.get(:prometheus_port) || 9394)
periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater
periodic_updater.setup_updates
end

start_daily_jobs
start_frequent_jobs
start_inline_jobs
Expand Down
3 changes: 3 additions & 0 deletions lib/cloud_controller/config_schemas/clock_schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ class ClockSchema < VCAP::Config
optional(:port) => Integer
},

optional(:publish_metrics) => bool,
optional(:prometheus_port) => Integer,

skip_cert_verify: bool,

optional(:routing_api) => {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ class DeploymentUpdaterSchema < VCAP::Config

stacks_file: String,

optional(:publish_metrics) => bool,
optional(:prometheus_port) => Integer,

skip_cert_verify: bool,

optional(:credhub_api) => {
Expand Down
6 changes: 4 additions & 2 deletions lib/cloud_controller/db.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
require 'cloud_controller/db_migrator'
require 'cloud_controller/db_connection/options_factory'
require 'cloud_controller/db_connection/finalizer'
require 'cloud_controller/execution_context'
require 'sequel/extensions/query_length_logging'
require 'sequel/extensions/request_query_metrics'

Expand Down Expand Up @@ -73,8 +74,9 @@ def self.add_connection_expiration_extension(db, opts)
end

def self.add_connection_metrics_extension(db)
# only add the metrics for api and cc-worker processes. Otherwise e.g. rake db:migrate would also initialize metric updaters, which need additional config
return if Object.const_defined?(:RakeConfig) && RakeConfig.context != :worker
# only add the metrics for api, cc-worker, clock & deployment_updater processes.
# Otherwise, e.g. rake db:migrate would also initialize metric updaters, which need additional config
return if ExecutionContext.from_process_type_env.nil?

db.extension(:connection_metrics)
# so that we gather connection metrics from the beginning
Expand Down
20 changes: 16 additions & 4 deletions lib/cloud_controller/dependency_locator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
require 'cloud_controller/packager/local_bits_packer'
require 'credhub/client'
require 'cloud_controller/metrics/prometheus_updater'
require 'cloud_controller/execution_context'

module CloudController
class DependencyLocator
Expand Down Expand Up @@ -70,12 +71,21 @@ def periodic_updater
))
end

def prometheus_updater
@dependencies[:prometheus_updater] || register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
# Returns the PeriodicUpdater stored under :vitals_periodic_updater. When none is
# stored yet, builds one that is restricted to PeriodicUpdater::VITALS_TASK, logs to
# the 'cc.vitals' logger, and hands it to +register+.
def vitals_periodic_updater
  existing = @dependencies[:vitals_periodic_updater]
  return existing if existing

  metrics = VCAP::CloudController::Metrics
  updater = metrics::PeriodicUpdater.new(
    Time.now.utc,
    log_counter,
    Steno.logger('cc.vitals'),
    statsd_updater,
    prometheus_updater,
    task_list: [metrics::PeriodicUpdater::VITALS_TASK]
  )
  register(:vitals_periodic_updater, updater)
end

def cc_worker_prometheus_updater
@dependencies[:cc_worker_prometheus_updater] || register(:cc_worker_prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new(cc_worker: true))
# Returns the PrometheusUpdater stored under :prometheus_updater, creating one with
# default arguments and passing it to +register+ when none is stored yet.
def prometheus_updater
  existing = @dependencies[:prometheus_updater]
  return existing if existing

  register(:prometheus_updater, VCAP::CloudController::Metrics::PrometheusUpdater.new)
end

def statsd_updater
Expand Down Expand Up @@ -362,6 +372,8 @@ def statsd_client
else
register(:statsd_client, NullStatsdClient.new)
end
rescue VCAP::CloudController::Config::InvalidConfigPath
register(:statsd_client, NullStatsdClient.new)
end

private
Expand Down
6 changes: 6 additions & 0 deletions lib/cloud_controller/deployment_updater/scheduler.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
require 'cloud_controller/deployment_updater/dispatcher'
require 'locket/lock_worker'
require 'locket/lock_runner'
require 'cloud_controller/standalone_metrics_webserver'

module VCAP::CloudController
module DeploymentUpdater
Expand All @@ -9,6 +10,11 @@ class << self
def start
with_error_logging('cc.deployment_updater') do
config = CloudController::DependencyLocator.instance.config
if config.get(:publish_metrics) || false
VCAP::CloudController::StandaloneMetricsWebserver.start_for_bosh_job(config.get(:prometheus_port) || 9395)
periodic_updater = CloudController::DependencyLocator.instance.vitals_periodic_updater
periodic_updater.setup_updates
end
statsd_client = CloudController::DependencyLocator.instance.statsd_client

update_step = proc {
Expand Down
39 changes: 39 additions & 0 deletions lib/cloud_controller/execution_context.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
module VCAP::CloudController
  # Catalogue of the known Cloud Controller process kinds, plus a lookup that maps the
  # PROCESS_TYPE environment variable back to one of them.
  class ExecutionContext
    # Describes one process kind: the value used for PROCESS_TYPE, the associated
    # capi_job_name (presumably the BOSH job name - confirm with callers), and an
    # optional rake context symbol.
    ExecutionInfo = Struct.new(:process_type, :capi_job_name, :rake_context, keyword_init: true) do
      # process_type and capi_job_name are required keywords; rake_context defaults to nil.
      def initialize(process_type:, capi_job_name:, rake_context: nil)
        super
      end

      # Exports this entry's process_type as the PROCESS_TYPE environment variable.
      def set_process_type_env
        ENV['PROCESS_TYPE'] = process_type
      end

      # Assigns rake_context to RakeConfig.context.
      # Raises a RuntimeError when rake_context is nil or RakeConfig is not defined.
      def set_rake_context
        rake_config_usable = !rake_context.nil? && Object.const_defined?(:RakeConfig)
        raise 'RakeConfig is not defined or rake_context argument is nil' unless rake_config_usable

        RakeConfig.context = rake_context
      end
    end

    API_PUMA_MAIN = ExecutionInfo.new(
      process_type: 'main',
      capi_job_name: 'cloud_controller_ng'
    )
    API_PUMA_WORKER = ExecutionInfo.new(
      process_type: 'puma_worker',
      capi_job_name: 'cloud_controller_ng'
    )
    CC_WORKER = ExecutionInfo.new(
      process_type: 'cc-worker',
      capi_job_name: 'cloud_controller_worker',
      rake_context: :worker
    )
    CLOCK = ExecutionInfo.new(
      process_type: 'clock',
      capi_job_name: 'cloud_controller_clock',
      rake_context: :clock
    )
    DEPLOYMENT_UPDATER = ExecutionInfo.new(
      process_type: 'deployment_updater',
      capi_job_name: 'cc_deployment_updater',
      rake_context: :deployment_updater
    )

    ALL_EXECUTION_CONTEXTS = [
      API_PUMA_MAIN,
      API_PUMA_WORKER,
      CC_WORKER,
      CLOCK,
      DEPLOYMENT_UPDATER
    ].freeze

    # Returns the ExecutionInfo whose process_type equals ENV['PROCESS_TYPE'].
    # Without a match the result is API_PUMA_MAIN when ENV['CC_TEST'] is 'true'
    # (test environments may not set PROCESS_TYPE), and nil otherwise.
    def self.from_process_type_env
      wanted = ENV.fetch('PROCESS_TYPE', nil)
      match = ALL_EXECUTION_CONTEXTS.find { |info| info.process_type == wanted }
      return match unless match.nil?

      API_PUMA_MAIN if ENV.fetch('CC_TEST', nil) == 'true'
    end
  end
end
46 changes: 22 additions & 24 deletions lib/cloud_controller/metrics/periodic_updater.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,35 @@

module VCAP::CloudController::Metrics
class PeriodicUpdater
def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater)
# One periodic update job: +method_name+ is the updater method that gets invoked and
# +interval+ is the value handed to Concurrent::TimerTask as execution_interval.
UPDATE_TASK = Struct.new(:method_name, :interval)

USER_COUNT_TASK       = UPDATE_TASK[:update_user_count, 600].freeze
JOB_QUEUE_LENGTH_TASK = UPDATE_TASK[:update_job_queue_length, 30].freeze
JOB_QUEUE_LOAD_TASK   = UPDATE_TASK[:update_job_queue_load, 30].freeze
FAILED_JOB_COUNT_TASK = UPDATE_TASK[:update_failed_job_count, 30].freeze
VITALS_TASK           = UPDATE_TASK[:update_vitals, 30].freeze
LOG_COUNTS_TASK       = UPDATE_TASK[:update_log_counts, 30].freeze
TASK_STATS_TASK       = UPDATE_TASK[:update_task_stats, 30].freeze
DEPLOYING_COUNT_TASK  = UPDATE_TASK[:update_deploying_count, 30].freeze
WEBSERVER_STATS_TASK  = UPDATE_TASK[:update_webserver_stats, 30].freeze

# Default task list: every known task.
ALL_TASKS = [
  USER_COUNT_TASK,
  JOB_QUEUE_LENGTH_TASK,
  JOB_QUEUE_LOAD_TASK,
  FAILED_JOB_COUNT_TASK,
  VITALS_TASK,
  LOG_COUNTS_TASK,
  TASK_STATS_TASK,
  DEPLOYING_COUNT_TASK,
  WEBSERVER_STATS_TASK
].freeze

# Stores the collaborators used by the update_* methods.
#
# start_time         - stored as @start_time
# log_counter        - stored as @log_counter
# logger             - stored as @logger
# statsd_updater     - stored as @statsd_updater
# prometheus_updater - stored as @prometheus_updater
# task_list:         - the UPDATE_TASK entries this instance runs and schedules;
#                      defaults to ALL_TASKS
#
# @known_job_queues is seeded once with the result of Jobs::Queues.local for the
# current config (as a symbol) mapped to 0.
def initialize(start_time, log_counter, logger, statsd_updater, prometheus_updater, task_list: ALL_TASKS)
  @start_time = start_time
  @statsd_updater = statsd_updater
  @prometheus_updater = prometheus_updater
  @log_counter = log_counter
  @logger = logger
  @known_job_queues = { VCAP::CloudController::Jobs::Queues.local(VCAP::CloudController::Config.config).to_sym => 0 }
  @task_list = task_list
end

# Runs every task in @task_list once right away, then creates one
# Concurrent::TimerTask per task (using the task's own interval) and starts them.
#
# The timers are kept in @update_tasks. Each timer run goes through catch_error;
# the initial immediate run does not.
def setup_updates
  @task_list.each { |task| update!(task) }

  @update_tasks = @task_list.map do |task|
    Concurrent::TimerTask.new(execution_interval: task.interval) { catch_error { update!(task) } }
  end
  @update_tasks.each(&:execute)
end

Expand All @@ -35,16 +41,8 @@ def stop_updates
@update_tasks.each(&:shutdown)
end

def update!
update_user_count
update_job_queue_length
update_job_queue_load
update_failed_job_count
update_vitals
update_log_counts
update_task_stats
update_deploying_count
update_webserver_stats
# Runs one update task immediately by invoking the method named by task.method_name
# on this object and returning its result.
def update!(task)
  updater_method = task.method_name
  __send__(updater_method)
end

def catch_error
Expand Down
70 changes: 57 additions & 13 deletions lib/cloud_controller/metrics/prometheus_updater.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
require 'prometheus/client'
require 'prometheus/client/data_stores/direct_file_store'
require 'cloud_controller/execution_context'

module VCAP::CloudController::Metrics
class PrometheusUpdater
Expand Down Expand Up @@ -29,12 +30,6 @@ def self.allow_pid_label
{ type: :histogram, name: :cc_staging_failed_duration_seconds, docstring: 'Durations of failed staging events', buckets: DURATION_BUCKETS },
{ type: :gauge, name: :cc_requests_outstanding_total, docstring: 'Requests outstanding', aggregation: :sum },
{ type: :counter, name: :cc_requests_completed_total, docstring: 'Requests completed' },
{ type: :gauge, name: :cc_vitals_started_at, docstring: 'CloudController Vitals: started_at', aggregation: :most_recent },
{ type: :gauge, name: :cc_vitals_mem_bytes, docstring: 'CloudController Vitals: mem_bytes', aggregation: :most_recent },
{ type: :gauge, name: :cc_vitals_cpu_load_avg, docstring: 'CloudController Vitals: cpu_load_avg', aggregation: :most_recent },
{ type: :gauge, name: :cc_vitals_mem_used_bytes, docstring: 'CloudController Vitals: mem_used_bytes', aggregation: :most_recent },
{ type: :gauge, name: :cc_vitals_mem_free_bytes, docstring: 'CloudController Vitals: mem_free_bytes', aggregation: :most_recent },
{ type: :gauge, name: :cc_vitals_num_cores, docstring: 'CloudController Vitals: num_cores', aggregation: :most_recent },
{ type: :gauge, name: :cc_running_tasks_total, docstring: 'Total running tasks', aggregation: :most_recent },
{ type: :gauge, name: :cc_running_tasks_memory_bytes, docstring: 'Total memory consumed by running tasks', aggregation: :most_recent },
{ type: :gauge, name: :cc_users_total, docstring: 'Number of users', aggregation: :most_recent },
Expand Down Expand Up @@ -67,19 +62,68 @@ def self.allow_pid_label
{ type: :histogram, name: :cc_job_duration_seconds, docstring: 'Job processing time (start to finish)', labels: %i[queue worker], buckets: DELAYED_JOB_METRIC_BUCKETS }
].freeze

def initialize(registry: Prometheus::Client.registry, cc_worker: false)
# Vitals gauges, keyed by metric name with the docstring as value. Every entry is a
# gauge aggregated with :most_recent.
VITAL_METRICS = {
  cc_vitals_started_at: 'CloudController Vitals: started_at',
  cc_vitals_mem_bytes: 'CloudController Vitals: mem_bytes',
  cc_vitals_cpu_load_avg: 'CloudController Vitals: cpu_load_avg',
  cc_vitals_mem_used_bytes: 'CloudController Vitals: mem_used_bytes',
  cc_vitals_mem_free_bytes: 'CloudController Vitals: mem_free_bytes',
  cc_vitals_num_cores: 'CloudController Vitals: num_cores'
}.map do |metric_name, description|
  { type: :gauge, name: metric_name, docstring: description, aggregation: :most_recent }
end.freeze

# Sets up the updater for the current process.
#
# registry: - the Prometheus registry to register metrics on; defaults to
#             Prometheus::Client.registry
#
# The execution context is derived from the environment via
# ExecutionContext.from_process_type_env. Which metric groups get registered is
# decided solely by register_metrics_for_process; nothing is registered
# unconditionally here, so no group is registered twice and processes only expose
# the groups selected for them.
def initialize(registry: Prometheus::Client.registry)
  self.class.allow_pid_label

  @registry = registry
  execution_context = VCAP::CloudController::ExecutionContext.from_process_type_env

  # Register the metrics for this process type, to initialize them for discoverability
  register_metrics_for_process(execution_context)
  initialize_cc_db_connection_pool_timeouts_total(execution_context)
end

private

# rubocop:disable Metrics/CyclomaticComplexity
# Registers the metric groups that belong to the given execution context:
#   CC_WORKER                        -> DB connection pool, delayed job, vitals
#   CLOCK / DEPLOYMENT_UPDATER       -> DB connection pool, vitals
#   API_PUMA_MAIN / API_PUMA_WORKER  -> DB connection pool, delayed job, vitals, API
#                                       metrics, plus Puma metrics when
#                                       is_puma_webserver? is true
# Raises a RuntimeError for any other context (including nil) before registering anything.
def register_metrics_for_process(execution_context)
  contexts = VCAP::CloudController::ExecutionContext
  register_groups = ->(*groups) { groups.each { |group| group.each { |metric| register(metric) } } }

  case execution_context
  when contexts::CC_WORKER
    register_groups.call(DB_CONNECTION_POOL_METRICS, DELAYED_JOB_METRICS, VITAL_METRICS)
  when contexts::CLOCK, contexts::DEPLOYMENT_UPDATER
    register_groups.call(DB_CONNECTION_POOL_METRICS, VITAL_METRICS)
  when contexts::API_PUMA_MAIN, contexts::API_PUMA_WORKER
    register_groups.call(DB_CONNECTION_POOL_METRICS, DELAYED_JOB_METRICS, VITAL_METRICS, METRICS)
    # The webserver check runs only after the common groups are registered
    register_groups.call(PUMA_METRICS) if is_puma_webserver?
  else
    raise 'Could not register Prometheus metrics: Unknown execution context'
  end
end
# rubocop:enable Metrics/CyclomaticComplexity

# Seeds :cc_db_connection_pool_timeouts_total with 0 for the current process type so
# the metric is discoverable even though healthy systems likely never update it.
# For API_PUMA_MAIN the puma_worker label is seeded as well.
def initialize_cc_db_connection_pool_timeouts_total(execution_context)
  # In unit tests, the execution context might not be set - nothing to seed then
  return if execution_context.nil?
  # A metric that was never registered does not need an initial value
  return unless @registry.exist?(:cc_db_connection_pool_timeouts_total)

  contexts = VCAP::CloudController::ExecutionContext
  seed_zero = lambda do |context|
    update_gauge_metric(:cc_db_connection_pool_timeouts_total, 0, labels: { process_type: context.process_type })
  end

  seed_zero.call(execution_context)
  seed_zero.call(contexts::API_PUMA_WORKER) if execution_context == contexts::API_PUMA_MAIN
end

return if cc_worker
public

METRICS.each { |metric| register(metric) }
PUMA_METRICS.each { |metric| register(metric) } if VCAP::CloudController::Config.config&.get(:webserver) == 'puma'
# True when the loaded config's :webserver value is 'puma'. Returns false when no
# config is loaded or when reading the key raises Config::InvalidConfigPath.
def is_puma_webserver?
  configured_webserver = VCAP::CloudController::Config.config&.get(:webserver)
  configured_webserver == 'puma'
rescue VCAP::CloudController::Config::InvalidConfigPath
  false
end

def update_gauge_metric(metric, value, labels: {})
Expand Down
7 changes: 4 additions & 3 deletions lib/cloud_controller/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
require 'cloud_controller/telemetry_logger'
require 'cloud_controller/secrets_fetcher'
require 'cloud_controller/runners/puma_runner'
require 'cloud_controller/metrics_webserver'
require 'cloud_controller/api_metrics_webserver'
require 'cloud_controller/execution_context'
require 'prometheus/client/data_stores/direct_file_store'
require 'prometheus/middleware/exporter'

Expand All @@ -33,7 +34,7 @@ def initialize(argv)
# DB connection metrics have a label to determine whether the process accessing the connection is the
# main or a worker process. We need to set this env variable before `setup_db` otherwise the main process
# will show up twice in the metrics as main and worker.
ENV['PROCESS_TYPE'] = 'main'
VCAP::CloudController::ExecutionContext::API_PUMA_MAIN.set_process_type_env

setup_cloud_controller

Expand Down Expand Up @@ -130,7 +131,7 @@ def setup_metrics
Prometheus::Client.config.data_store = Prometheus::Client::DataStores::DirectFileStore.new(dir: prometheus_dir)

Thread.new do
VCAP::CloudController::MetricsWebserver.new.start(@config)
VCAP::CloudController::ApiMetricsWebserver.new.start(@config)
end
end

Expand Down
Loading
Loading