Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
0314faa
Set up Cgroup, CpuStats, and CpuMetricsCollector structs, and cgroup …
kathiehuang Feb 13, 2026
af53bf0
Add cpu collector into loop with dogstatsd
kathiehuang Feb 13, 2026
9d69162
Fix license
kathiehuang Feb 13, 2026
36171b8
Move metrics_collector into its own crate
kathiehuang Feb 23, 2026
9252609
Submit cpu usage and limit metrics and fix units
kathiehuang Feb 23, 2026
f47c4ff
Test more precise time interval, add instance ID as a tag
kathiehuang Feb 25, 2026
b632c17
Categorize metrics with azure.functions prefix as enhanced metrics
kathiehuang Feb 26, 2026
1bff3a8
Refactor to make CpuMetricsCollector, CpuStats, and metrics submissio…
kathiehuang Feb 26, 2026
fac5fda
Testing different cpu collection methods
kathiehuang Mar 4, 2026
bf1e8a7
Clean up and emit cpu usage and host-level cpu usage metrics
kathiehuang Mar 4, 2026
c43bb32
Clean up and emit cpu usage and host-level cpu usage metrics
kathiehuang Mar 4, 2026
cba2d69
Add tags to metrics
kathiehuang Mar 5, 2026
70aa6fe
Ensure tags match cloud integration metrics
kathiehuang Mar 6, 2026
eec4202
Separate Windows CPU metrics collection into separate PR
kathiehuang Mar 6, 2026
ddfd37f
Separate CPU host usage metrics collection into separate PR
kathiehuang Mar 6, 2026
5953d68
Remove functionname tag
kathiehuang Mar 6, 2026
45317ff
Send enhanced metrics even if custom metrics are turned off
kathiehuang Mar 6, 2026
12bfde2
Pull out building metrics tags into function
kathiehuang Mar 7, 2026
8c4cf5f
Add unit tests
kathiehuang Mar 7, 2026
a7f9f8d
Clean up
kathiehuang Mar 7, 2026
feca14c
Refactor
kathiehuang Mar 7, 2026
c6a55dc
Remove last_collection_time
kathiehuang Mar 7, 2026
dfe28a3
Only send enhanced metrics for Azure Functions
kathiehuang Mar 7, 2026
058ef53
Add back last_collection_time
kathiehuang Mar 7, 2026
36bba17
Only enable enhanced metrics for Azure Functions
kathiehuang Mar 9, 2026
c626c03
Only create CPUMetricsCollector when metrics flusher is successfully …
kathiehuang Mar 9, 2026
071d2f0
Launch metrics flusher as independent task from collector
kathiehuang Mar 9, 2026
f6c2694
Create windows-enhanced-metrics feature for Windows-specific logic
kathiehuang Mar 10, 2026
aae174a
Add unit to collection interval variable
kathiehuang Mar 10, 2026
f443794
Make last_usage_ns an Option and keep CPU total as u64 until f64 is n…
kathiehuang Mar 10, 2026
5f16053
Change collection interval to 1 for precision and remove unneeded logs
kathiehuang Mar 11, 2026
60cdecf
Add comment to clarify shared aggregator between dogstatsd and cpu co…
kathiehuang Mar 11, 2026
f867f6f
Move tag building logic from datadog-serverless-compat to datadog-met…
kathiehuang Mar 11, 2026
2ad5f24
Remove unused dependencies from datadog-trace-agent
kathiehuang Mar 11, 2026
b4a7624
Formatting
kathiehuang Mar 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-datadog-serverless-compat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
retention-days: 3
- if: ${{ inputs.runner == 'windows-2022' }}
shell: bash
run: cargo build --release -p datadog-serverless-compat --features windows-pipes
run: cargo build --release -p datadog-serverless-compat --features windows-pipes,windows-enhanced-metrics
- if: ${{ inputs.runner == 'windows-2022' }}
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # 4.6.2
with:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/cargo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ jobs:
- shell: bash
run: |
if [[ "${{ inputs.runner }}" == "windows-2022" ]]; then
cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes
cargo nextest run --workspace --features datadog-serverless-compat/windows-pipes,datadog-serverless-compat/windows-enhanced-metrics
else
cargo nextest run --workspace
fi
27 changes: 27 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ headers,https://github.com/hyperium/headers,MIT,Sean McArthur <sean@seanmonstar.
headers-core,https://github.com/hyperium/headers,MIT,Sean McArthur <sean@seanmonstar.com>
heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,The heck Authors
heck,https://github.com/withoutboats/heck,MIT OR Apache-2.0,Without Boats <woboats@gmail.com>
hermit-abi,https://github.com/hermit-os/hermit-rs,MIT OR Apache-2.0,Stefan Lankes
hex,https://github.com/KokaKiwi/rust-hex,MIT OR Apache-2.0,KokaKiwi <kokakiwi@kokakiwi.net>
home,https://github.com/rust-lang/cargo,MIT OR Apache-2.0,Brian Anderson <andersrb@gmail.com>
http,https://github.com/hyperium/http,MIT OR Apache-2.0,"Alex Crichton <alex@alexcrichton.com>, Carl Lerche <me@carllerche.com>, Sean McArthur <sean@seanmonstar.com>"
Expand Down Expand Up @@ -119,6 +120,7 @@ multimap,https://github.com/havarnov/multimap,MIT OR Apache-2.0,Håvar Nøvik <h
nix,https://github.com/nix-rust/nix,MIT,The nix-rust Project Developers
nu-ansi-term,https://github.com/nushell/nu-ansi-term,MIT,"ogham@bsago.me, Ryan Scheel (Havvy) <ryan.havvy@gmail.com>, Josh Triplett <josh@joshtriplett.org>, The Nushell Project Developers"
num-traits,https://github.com/rust-num/num-traits,MIT OR Apache-2.0,The Rust Project Developers
num_cpus,https://github.com/seanmonstar/num_cpus,MIT OR Apache-2.0,Sean McArthur <sean@seanmonstar.com>
once_cell,https://github.com/matklad/once_cell,MIT OR Apache-2.0,Aleksey Kladov <aleksey.kladov@gmail.com>
openssl-probe,https://github.com/rustls/openssl-probe,MIT OR Apache-2.0,Alex Crichton <alex@alexcrichton.com>
ordered-float,https://github.com/reem/rust-ordered-float,MIT,"Jonathan Reem <jonathan.reem@gmail.com>, Matt Brubeck <mbrubeck@limpet.net>"
Expand Down
15 changes: 15 additions & 0 deletions crates/datadog-metrics-collector/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
[package]
name = "datadog-metrics-collector"
version = "0.1.0"
edition.workspace = true
license.workspace = true
description = "Collector to read, compute, and submit enhanced metrics in Serverless environments"

[dependencies]
# Shared DogStatsD aggregator and metric types used to submit metrics.
dogstatsd = { path = "../dogstatsd", default-features = true }
# Host CPU count — used to default the CPU limit (see `CpuStats::defaulted_limit`).
num_cpus = "1.16"
tracing = { version = "0.1", default-features = false }
# Azure App Services metadata helpers, pinned to a specific libdatadog revision.
libdd-common = { git = "https://github.com/DataDog/libdatadog", rev = "d52ee90209cb12a28bdda0114535c1a985a29d95", default-features = false }

[features]
# Compiles the Windows-specific CPU stats reader in place of the Linux one.
windows-enhanced-metrics = []
167 changes: 167 additions & 0 deletions crates/datadog-metrics-collector/src/cpu.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

//! CPU metrics collector for Azure Functions
//!
//! This module provides OS-agnostic CPU stats collection, CPU usage
//! and limit computation, and metrics submission to Datadog.
//!
//! All CPU metrics are reported in nanocores (1 core = 1,000,000,000 nanocores).

use dogstatsd::aggregator::AggregatorHandle;
use dogstatsd::metric::{Metric, MetricValue, SortedTags};
use libdd_common::azure_app_services;
use std::env;
use tracing::{debug, error};

const CPU_USAGE_METRIC: &str = "azure.functions.enhanced.cpu.usage";
const CPU_LIMIT_METRIC: &str = "azure.functions.enhanced.cpu.limit";

/// Computed CPU total and limit metrics
/// Computed CPU total and limit metrics
pub struct CpuStats {
    /// Cumulative CPU usage in nanoseconds.
    pub total: u64,
    /// CPU limit in nanocores (1 core = 1,000,000,000 nanocores), if one could be determined.
    pub limit: Option<f64>,
    /// Whether the CPU limit was defaulted to the host CPU count rather than read from a real limit.
    pub defaulted_limit: bool,
}

/// OS-agnostic source of raw CPU statistics.
pub trait CpuStatsReader {
    /// Returns the current CPU stats, or `None` when they cannot be read.
    fn read(&self) -> Option<CpuStats>;
}

/// Collects CPU stats via a platform-specific reader and submits usage/limit
/// metrics to a shared DogStatsD aggregator.
pub struct CpuMetricsCollector {
    // Platform-specific stats source, selected in `new` by the
    // `windows-enhanced-metrics` feature.
    reader: Box<dyn CpuStatsReader>,
    // Handle to the aggregator that metrics are inserted into.
    aggregator: AggregatorHandle,
    // Optional tags attached to every submitted metric.
    tags: Option<SortedTags>,
    // Cumulative CPU usage (ns) observed at the previous collection;
    // `None` until the first successful read.
    last_usage_ns: Option<u64>,
    // Instant of the previous collection, used to turn usage deltas into rates.
    last_collection_time: std::time::Instant,
}

impl CpuMetricsCollector {
/// Creates a new CpuMetricsCollector
///
/// # Arguments
///
/// * `aggregator` - The aggregator handle to submit metrics to
/// * `tags` - Optional tags to attach to all metrics
pub fn new(aggregator: AggregatorHandle, tags: Option<SortedTags>) -> Self {
#[cfg(feature = "windows-enhanced-metrics")]
let reader: Box<dyn CpuStatsReader> = Box::new(crate::windows::WindowsCpuStatsReader);
#[cfg(not(feature = "windows-enhanced-metrics"))]
let reader: Box<dyn CpuStatsReader> = Box::new(crate::linux::LinuxCpuStatsReader);
Self {
reader,
aggregator,
tags,
last_usage_ns: None,
last_collection_time: std::time::Instant::now(),
}
}

/// Collects current CPU stats and submits enhanced CPU metrics.
///
/// Computes the usage rate in nanocores as the delta of cumulative CPU
/// time (nanoseconds) since the previous collection divided by the elapsed
/// wall-clock seconds. The first successful read only records a baseline
/// and submits nothing; a backwards-moving counter re-baselines the same
/// way instead of reporting an underflowed delta.
pub fn collect_and_submit(&mut self) {
    if let Some(cpu_stats) = self.reader.read() {
        // Submit metrics
        let current_usage_ns = cpu_stats.total;
        let now_instant = std::time::Instant::now();

        // Skip first collection
        let Some(last_usage_ns) = self.last_usage_ns else {
            debug!("First CPU collection, skipping interval");
            self.last_usage_ns = Some(current_usage_ns);
            self.last_collection_time = now_instant;
            return;
        };

        // Counter went backwards (e.g. the underlying source was reset):
        // record a new baseline rather than computing a bogus delta.
        if current_usage_ns < last_usage_ns {
            debug!("Current CPU usage is less than last usage, skipping interval");
            self.last_usage_ns = Some(current_usage_ns);
            self.last_collection_time = now_instant;
            return;
        }
        let delta_ns = (current_usage_ns - last_usage_ns) as f64;
        self.last_usage_ns = Some(current_usage_ns);
        let elapsed_secs = now_instant
            .duration_since(self.last_collection_time)
            .as_secs_f64();
        self.last_collection_time = now_instant;

        // Guard against a zero-length interval, which would otherwise yield
        // an infinite/NaN usage rate.
        if elapsed_secs <= 0.0 {
            debug!("Elapsed time since last CPU collection is zero, skipping interval");
            return;
        }

        // Divide nanoseconds delta by elapsed time to get usage rate in nanocores
        let usage_rate_nc = delta_ns / elapsed_secs;

        // Seconds since the Unix epoch for the metric timestamp; falls back
        // to 0 if the system clock reads before the epoch or the value does
        // not fit the target type.
        let now = std::time::UNIX_EPOCH
            .elapsed()
            .map(|d| d.as_secs())
            .unwrap_or(0)
            .try_into()
            .unwrap_or(0);

        let usage_metric = Metric::new(
            CPU_USAGE_METRIC.into(),
            MetricValue::distribution(usage_rate_nc),
            self.tags.clone(),
            Some(now),
        );

        // insert_batch only fails when the aggregator's receiving end has
        // been dropped (the aggregator service is gone), in which case every
        // subsequent insert will also fail. Log and continue — matching the
        // lambda extension's behavior — rather than aborting collection.
        if let Err(e) = self.aggregator.insert_batch(vec![usage_metric]) {
            error!("Failed to insert CPU usage metric: {}", e);
        }

        if let Some(limit) = cpu_stats.limit {
            if cpu_stats.defaulted_limit {
                debug!("CPU limit defaulted to host CPU count");
            }
            let limit_metric = Metric::new(
                CPU_LIMIT_METRIC.into(),
                MetricValue::distribution(limit),
                self.tags.clone(),
                Some(now),
            );
            if let Err(e) = self.aggregator.insert_batch(vec![limit_metric]) {
                error!("Failed to insert CPU limit metric: {}", e);
            }
        }
    } else {
        debug!(
            "Skipping CPU metrics collection - could not find data to generate CPU usage and limit enhanced metrics"
        );
    }
}
}

/// Builds the tag set attached to CPU enhanced metrics.
///
/// Combines Azure App Services metadata (from ddcommon) with values read
/// from environment variables. The `origin` tag is added separately by
/// DogStatsD. Returns `None` when no tag could be built or parsing fails.
pub fn build_cpu_metrics_tags() -> Option<SortedTags> {
    let mut parts: Vec<String> = Vec::new();

    // Azure tags sourced from ddcommon's App Services metadata, skipping
    // placeholder "unknown" values.
    if let Some(metadata) = &*azure_app_services::AAS_METADATA_FUNCTION {
        let candidates = [
            ("resource_id", metadata.get_resource_id()),
            ("resource_group", metadata.get_resource_group()),
            ("subscription_id", metadata.get_subscription_id()),
            ("name", metadata.get_site_name()),
        ];
        parts.extend(
            candidates
                .into_iter()
                .filter(|(_, value)| *value != "unknown")
                .map(|(name, value)| format!("{}:{}", name, value)),
        );
    }

    // Tags sourced from environment variables (not available via ddcommon),
    // skipping unset or empty values.
    let env_sources = [
        ("region", "REGION_NAME"),
        ("plan_tier", "WEBSITE_SKU"),
        ("service", "DD_SERVICE"),
        ("env", "DD_ENV"),
        ("version", "DD_VERSION"),
        ("serverless_compat_version", "DD_SERVERLESS_COMPAT_VERSION"),
    ];
    parts.extend(env_sources.into_iter().filter_map(|(tag_name, env_var)| {
        match env::var(env_var) {
            Ok(val) if !val.is_empty() => Some(format!("{}:{}", tag_name, val)),
            _ => None,
        }
    }));

    if parts.is_empty() {
        None
    } else {
        SortedTags::parse(&parts.join(",")).ok()
    }
}
14 changes: 14 additions & 0 deletions crates/datadog-metrics-collector/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2023-Present Datadog, Inc. https://www.datadoghq.com/
// SPDX-License-Identifier: Apache-2.0

//! Enhanced metrics collectors for serverless environments.

// Production builds deny panicking escape hatches; tests may still use them.
#![cfg_attr(not(test), deny(clippy::panic))]
#![cfg_attr(not(test), deny(clippy::unwrap_used))]
#![cfg_attr(not(test), deny(clippy::expect_used))]
#![cfg_attr(not(test), deny(clippy::todo))]
#![cfg_attr(not(test), deny(clippy::unimplemented))]

pub mod cpu;
// Exactly one platform-specific stats reader is compiled in, selected by the
// `windows-enhanced-metrics` feature; `cpu::CpuMetricsCollector::new` uses it.
#[cfg(not(feature = "windows-enhanced-metrics"))]
pub(crate) mod linux;
#[cfg(feature = "windows-enhanced-metrics")]
pub(crate) mod windows;
Loading
Loading