Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 108 additions & 3 deletions components/support/error/src/error_tracing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,31 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */

use std::{
collections::HashMap,
time::{Duration, Instant},
};

use parking_lot::Mutex;

static RECENT_BREADCRUMBS: Mutex<BreadcrumbRingBuffer> = Mutex::new(BreadcrumbRingBuffer::new());
static GLOBALS: Mutex<Globals> = Mutex::new(Globals::new());

/// Report an error to the application, unless the rate limiter decides to drop it.
///
/// * `type_name` - error type name; also passed to the rate limiter to decide whether this
///   report is sent.
/// * `message` - error message, forwarded in the event.
///
/// When the report is let through, a tracing event is sent to the
/// `app-services-error-reporter::error` target.
pub fn report_error_to_app(type_name: String, message: String) {
    // Check the rate limit and snapshot the breadcrumbs while holding the `GLOBALS` lock, then
    // release it before emitting the event. The mutex is not reentrant, so holding it while
    // the event is handled would deadlock if that handling called back into this module on the
    // same thread (for example to report a breadcrumb).
    let breadcrumbs = {
        let mut globals = GLOBALS.lock();
        if !globals
            .rate_limiter
            .should_send_report(&type_name, Instant::now())
        {
            return;
        }
        globals.breadcrumbs.get_breadcrumbs().join("\n")
    };

    // Report errors by sending a tracing event to the `app-services-error-reporter::error` target.
    //
    // Applications should register for these events and send a glean error ping when they occur.
    //
    // breadcrumbs will be sent in the `breadcrumbs` field as a single string, with each individual
    // breadcrumb joined by newlines.
    tracing_support::error!(target: "app-services-error-reporter::error", message, type_name, breadcrumbs);
}

Expand All @@ -22,10 +35,25 @@ pub fn report_breadcrumb(message: String, module: String, line: u32, column: u32
// - Push it to the `RECENT_BREADCRUMBS` list
// - Send out the `app-services-error-reporter::breadcrumb`. Applications can register for
// these events and log them.
RECENT_BREADCRUMBS.lock().push(message.clone());
GLOBALS.lock().breadcrumbs.push(message.clone());
tracing_support::info!(target: "app-services-error-reporter::breadcrumb", message, module, line, column);
}

/// State shared by the error reporting functions, stored in the `GLOBALS` mutex.
struct Globals {
    /// Recently pushed breadcrumbs; attached to error reports when they are sent.
    breadcrumbs: BreadcrumbRingBuffer,
    /// Decides whether an error report is sent or dropped.
    rate_limiter: RateLimiter,
}

impl Globals {
    /// Builds the initial state. This is `const` so it can be used to initialize a `static`.
    const fn new() -> Self {
        Globals {
            rate_limiter: RateLimiter::new(),
            breadcrumbs: BreadcrumbRingBuffer::new(),
        }
    }
}

/// Ring buffer implementation that we use to store the most recent 20 breadcrumbs
#[derive(Default)]
struct BreadcrumbRingBuffer {
Expand Down Expand Up @@ -73,6 +101,46 @@ fn truncate_breadcrumb(breadcrumb: String) -> String {
breadcrumb[0..split_point].to_string()
}

/// Rate-limits error reports per component to at most 20 / hour
///
/// The component is the part of the error type name before the first `-` (or the whole name if
/// it contains no `-`). All error types that share a component share a single rate limit.
///
/// This uses the simplest algorithm possible: a report is dropped when the last report that was
/// let through for the same component is less than `INTERVAL` old. We could use something like a
/// token bucket to allow for a small burst of errors, but that doesn't seem so useful. In that
/// scenario, the first error report is the one we want to fix.
struct RateLimiter {
    // Time of the last report that was let through, keyed by component.
    //
    // Optional so we can make `new()` const.
    last_report: Option<HashMap<String, Instant>>,
}

impl RateLimiter {
    // Rate limit reports if they're within 3 minutes of each other.
    const INTERVAL: Duration = Duration::from_secs(180);

    /// Creates a rate limiter that has not seen any reports yet.
    const fn new() -> Self {
        Self { last_report: None }
    }

    /// Decides whether a report for `error_type` made at `now` should be sent.
    ///
    /// Returns `true` and records `now` for the component when the report is allowed. Returns
    /// `false`, leaving the recorded time untouched, when the report is rate-limited.
    fn should_send_report(&mut self, error_type: &str, now: Instant) -> bool {
        let component = error_type
            .split_once('-')
            .map_or(error_type, |(component, _)| component);
        let last_report = self.last_report.get_or_insert_with(HashMap::new);

        match last_report.get_mut(component) {
            Some(previous) => {
                // `checked_duration_since` returns `None` when `now` is earlier than the
                // recorded time (non-monotonic clock). Letting an extra event through seems
                // okay in this case, so only a known, too-short gap rate-limits the report.
                let rate_limited = matches!(
                    now.checked_duration_since(*previous),
                    Some(elapsed) if elapsed < Self::INTERVAL
                );
                if rate_limited {
                    return false;
                }
                // Update in place so the key is not re-allocated on every allowed report.
                *previous = now;
            }
            None => {
                last_report.insert(component.to_string(), now);
            }
        }
        true
    }
}

#[cfg(test)]
mod test {
use super::*;
Expand Down Expand Up @@ -212,4 +280,41 @@ mod test {
// fire emoji, which is multiple bytes long.
assert_eq!(truncate_breadcrumb("0".repeat(99) + "🔥").len(), 99);
}

#[test]
fn test_rate_limiter() {
let mut rate_limiter = RateLimiter::new();
let start = Instant::now();
let min = Duration::from_secs(60);
// The first error report is okay
assert!(rate_limiter.should_send_report("test-type", start));
// The report should be rate limited until 3 minutes pass, then we can send another one.
// Subtract time from the instant to simulate time going forward.
assert!(!rate_limiter.should_send_report("test-type", start));
assert!(!rate_limiter.should_send_report("test-type", start + min * 1));
assert!(!rate_limiter.should_send_report("test-type", start + min * 2));
assert!(rate_limiter.should_send_report("test-type", start + min * 3));
assert!(!rate_limiter.should_send_report("test-type", start + min * 4));
assert!(!rate_limiter.should_send_report("test-type", start + min * 5));
assert!(rate_limiter.should_send_report("test-type", start + min * 6));

assert!(rate_limiter.should_send_report("test-type", start + min * 60));
assert!(!rate_limiter.should_send_report("test-type", start + min * 61));
assert!(!rate_limiter.should_send_report("test-type", start + min * 62));
assert!(rate_limiter.should_send_report("test-type", start + min * 63));
}

#[test]
fn test_rate_limiter_type_matching() {
let mut rate_limiter = RateLimiter::new();
let start = Instant::now();
// Cause error error reports to be rate limited
assert!(rate_limiter.should_send_report("componenta-network-error", start));
assert!(!rate_limiter.should_send_report("componenta-network-error", start));
// Other reports from the same component should also be rate limited
assert!(!rate_limiter.should_send_report("componenta-database-error", start));
// But not ones from other components
assert!(rate_limiter.should_send_report("componentb-database-error", start));
assert!(rate_limiter.should_send_report("componentaa-network-error", start));
}
}