Skip to content

Commit 7e713a4

Browse files
lfranckerazvanTechassi
authored
fix: add wall-clock certificate expiry check to webhook TLS rotation (#1175)
* fix: add wall-clock certificate expiry check to webhook TLS rotation The rotation interval uses tokio's monotonic clock, but certificate validity uses wall-clock time. When these diverge (hibernation, VM migration, cgroup freezing), the certificate can expire before rotation. Add a periodic wall-clock check (every 5 minutes) that compares SystemTime::now() against the certificate's not_after field and triggers early rotation if the cert is within 4 hours of expiry. Fixes: #1174 * refactor: replace dual rotation timers with single wall-clock check Remove the monotonic 20h rotation interval and the supplementary wall-clock check. Instead, use a single periodic check (every 5 min) that compares wall-clock time against the certificate's not_after. Also derive the expiry buffer from the certificate lifetime (1/6) so it scales if the lifetime ever changes, and add comments documenting the relationship between lifetime and check interval. * add debug!() when checking for cert rotation * update changelog * refactor(webhook): Clean up constants and interval --------- Co-authored-by: Razvan-Daniel Mihai <84674+razvan@users.noreply.github.com> Co-authored-by: Techassi <git@techassi.dev>
1 parent 7486017 commit 7e713a4

6 files changed

Lines changed: 93 additions & 22 deletions

File tree

Cargo.lock

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ either = "1.13.0"
2929
futures = "0.3.30"
3030
futures-util = "0.3.30"
3131
http = "1.3.1"
32+
humantime = "2.1.0"
3233
indexmap = "2.5.0"
3334
indoc = "2.0.6"
3435
jiff = "0.2.18"

crates/stackable-webhook/CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
44

55
## [Unreleased]
66

7+
### Changed
8+
9+
- Check for certificate rotation every 5 minutes to prevent expiry due to monotonic and wall clock divergence ([#1175]).
10+
11+
[#1175]: https://github.com/stackabletech/operator-rs/pull/1175
12+
713
## [0.9.0] - 2026-02-03
814

915
### Added

crates/stackable-webhook/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ arc-swap.workspace = true
1515
async-trait.workspace = true
1616
axum.workspace = true
1717
futures-util.workspace = true
18+
humantime.workspace = true
1819
hyper-util.workspace = true
1920
hyper.workspace = true
2021
k8s-openapi.workspace = true

crates/stackable-webhook/src/tls/cert_resolver.rs

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use std::sync::Arc;
1+
use std::{sync::Arc, time::SystemTime};
22

33
use arc_swap::ArcSwap;
44
use snafu::{OptionExt, ResultExt, Snafu};
@@ -57,6 +57,11 @@ pub struct CertificateResolver {
5757
/// Using a [`ArcSwap`] (over e.g. [`tokio::sync::RwLock`]), so that we can easily
5858
/// (and performant) bridge between async write and sync read.
5959
current_certified_key: ArcSwap<CertifiedKey>,
60+
61+
/// The wall-clock expiry time (`not_after`) of the current certificate.
62+
/// Used to detect clock drift between monotonic and wall-clock time.
63+
current_not_after: ArcSwap<SystemTime>,
64+
6065
subject_alterative_dns_names: Arc<Vec<String>>,
6166

6267
certificate_tx: mpsc::Sender<Certificate>,
@@ -68,7 +73,7 @@ impl CertificateResolver {
6873
certificate_tx: mpsc::Sender<Certificate>,
6974
) -> Result<Self> {
7075
let subject_alterative_dns_names = Arc::new(subject_alterative_dns_names);
71-
let certified_key = Self::generate_new_certificate_inner(
76+
let (certified_key, not_after) = Self::generate_new_certificate_inner(
7277
subject_alterative_dns_names.clone(),
7378
&certificate_tx,
7479
)
@@ -77,26 +82,51 @@ impl CertificateResolver {
7782
Ok(Self {
7883
subject_alterative_dns_names,
7984
current_certified_key: ArcSwap::new(certified_key),
85+
current_not_after: ArcSwap::new(Arc::new(not_after)),
8086
certificate_tx,
8187
})
8288
}
8389

8490
pub async fn rotate_certificate(&self) -> Result<()> {
85-
let certified_key = self.generate_new_certificate().await?;
91+
let (certified_key, not_after) = self.generate_new_certificate().await?;
8692

8793
// TODO: Sign the new cert somehow with the old cert. See https://github.com/stackabletech/decisions/issues/56
8894
self.current_certified_key.store(certified_key);
95+
self.current_not_after.store(Arc::new(not_after));
8996

9097
Ok(())
9198
}
9299

93-
async fn generate_new_certificate(&self) -> Result<Arc<CertifiedKey>> {
100+
/// Returns `true` if the current certificate is expired or will expire
101+
/// within the given `buffer` duration according to wall-clock time.
102+
///
103+
/// This catches cases where the monotonic timer (used by `tokio::time`)
104+
/// has drifted from wall-clock time, e.g. due to system hibernation.
105+
pub fn needs_rotation(&self, buffer: std::time::Duration) -> bool {
106+
let not_after = **self.current_not_after.load();
107+
// If subtraction underflows (buffer > time since epoch), fall back to
108+
// UNIX_EPOCH so that the comparison always triggers rotation.
109+
let deadline = not_after
110+
.checked_sub(buffer)
111+
.unwrap_or(SystemTime::UNIX_EPOCH);
112+
113+
tracing::debug!(
114+
x509.subject_alterative_names = ?self.subject_alterative_dns_names,
115+
x509.not_after = %humantime::format_rfc3339(not_after),
116+
deadline = %humantime::format_rfc3339(deadline),
117+
"checking if certificate needs rotation"
118+
);
119+
120+
SystemTime::now() >= deadline
121+
}
122+
123+
async fn generate_new_certificate(&self) -> Result<(Arc<CertifiedKey>, SystemTime)> {
94124
let subject_alterative_dns_names = self.subject_alterative_dns_names.clone();
95125
Self::generate_new_certificate_inner(subject_alterative_dns_names, &self.certificate_tx)
96126
.await
97127
}
98128

99-
/// Creates a new certificate and returns the certified key.
129+
/// Creates a new certificate and returns the certified key as well as `notAfter` timestamp.
100130
///
101131
/// The certificate is send to the passed `cert_tx`.
102132
///
@@ -106,7 +136,7 @@ impl CertificateResolver {
106136
async fn generate_new_certificate_inner(
107137
subject_alterative_dns_names: Arc<Vec<String>>,
108138
certificate_tx: &mpsc::Sender<Certificate>,
109-
) -> Result<Arc<CertifiedKey>> {
139+
) -> Result<(Arc<CertifiedKey>, SystemTime)> {
110140
// The certificate generations can take a while, so we use `spawn_blocking`
111141
let (cert, certified_key) = tokio::task::spawn_blocking(move || {
112142
let tls_provider =
@@ -144,12 +174,14 @@ impl CertificateResolver {
144174
.await
145175
.context(TokioSpawnBlockingSnafu)??;
146176

177+
let not_after = cert.tbs_certificate.validity.not_after.to_system_time();
178+
147179
certificate_tx
148180
.send(cert)
149181
.await
150182
.map_err(|_err| CertificateResolverError::SendCertificateToChannel)?;
151183

152-
Ok(certified_key)
184+
Ok((certified_key, not_after))
153185
}
154186
}
155187

crates/stackable-webhook/src/tls/mod.rs

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,30 @@ use crate::{
3838

3939
mod cert_resolver;
4040

41-
pub const WEBHOOK_CA_LIFETIME: Duration = Duration::from_hours_unchecked(24);
42-
pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration = Duration::from_hours_unchecked(24);
43-
pub const WEBHOOK_CERTIFICATE_ROTATION_INTERVAL: Duration = Duration::from_hours_unchecked(20);
41+
/// All certificates (CAs and leaf certificates) are valid for this amount of time in hours. If this
42+
/// is ever reduced, ensure it stays well above [`CERTIFICATE_ROTATION_CHECK_INTERVAL`]
43+
/// (currently 5 minutes), otherwise the certificate could expire between checks.
44+
const CERTIFICATE_LIFETIME_HOURS: u64 = 24;
45+
46+
/// The CA lifetime as a human-readable [`Duration`].
47+
pub const WEBHOOK_CA_LIFETIME: Duration =
48+
Duration::from_hours_unchecked(CERTIFICATE_LIFETIME_HOURS);
49+
50+
/// The leaf certificate lifetime as a human-readable [`Duration`].
51+
pub const WEBHOOK_CERTIFICATE_LIFETIME: Duration =
52+
Duration::from_hours_unchecked(CERTIFICATE_LIFETIME_HOURS);
53+
54+
/// Rotate the certificate when less than 1/6 of its lifetime remains (4 hours for the current 24h
55+
/// lifetime). Derived from [`CERTIFICATE_LIFETIME_HOURS`] so it scales if the lifetime changes.
56+
const CERTIFICATE_EXPIRY_BUFFER_HOURS: u64 = CERTIFICATE_LIFETIME_HOURS * 60 / 6;
57+
58+
const CERTIFICATE_EXPIRY_BUFFER: Duration =
59+
Duration::from_minutes_unchecked(CERTIFICATE_EXPIRY_BUFFER_HOURS);
60+
61+
/// How often to check whether the certificate needs rotation. This is intentionally independent of
62+
/// the certificate lifetime - it controls how quickly we detect wall-clock drift (from hibernation,
63+
/// VM migration, etc.), not how long the certificate lives.
64+
const CERTIFICATE_ROTATION_CHECK_INTERVAL: Duration = Duration::from_minutes_unchecked(5);
4465

4566
pub type Result<T, E = TlsServerError> = std::result::Result<T, E>;
4667

@@ -153,8 +174,9 @@ impl TlsServer {
153174
router,
154175
} = self;
155176

156-
let start = tokio::time::Instant::now() + *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL;
157-
let mut interval = tokio::time::interval_at(start, *WEBHOOK_CERTIFICATE_ROTATION_INTERVAL);
177+
let start = tokio::time::Instant::now() + *CERTIFICATE_ROTATION_CHECK_INTERVAL;
178+
let mut rotation_check_interval =
179+
tokio::time::interval_at(start, *CERTIFICATE_ROTATION_CHECK_INTERVAL);
158180

159181
let tls_acceptor = TlsAcceptor::from(Arc::new(config));
160182
let tcp_listener = TcpListener::bind(socket_addr)
@@ -183,11 +205,10 @@ impl TlsServer {
183205
loop {
184206
let tls_acceptor = tls_acceptor.clone();
185207

186-
// Wait for either a new TCP connection or the certificate rotation interval tick
187208
tokio::select! {
188209
// We opt for a biased execution of arms to make sure we always check if a
189-
// shutdown signal was received or the certificate needs rotation based on the
190-
// interval. This ensures, we always use a valid certificate for the TLS connection.
210+
// shutdown signal was received or the certificate needs rotation before
211+
// accepting new connections.
191212
biased;
192213

193214
// Once a shutdown signal is received (this future becomes `Poll::Ready`), break out
@@ -198,13 +219,16 @@ impl TlsServer {
198219
break;
199220
}
200221

201-
// This is cancellation-safe. If this branch is cancelled, the tick is NOT consumed.
202-
// As such, we will not miss rotating the certificate.
203-
_ = interval.tick() => {
204-
cert_resolver
205-
.rotate_certificate()
206-
.await
207-
.context(RotateCertificateSnafu)?
222+
// Check wall-clock time to decide if the certificate needs rotation.
223+
// This is cancellation-safe: if cancelled, the tick is NOT consumed.
224+
_ = rotation_check_interval.tick() => {
225+
if cert_resolver.needs_rotation(*CERTIFICATE_EXPIRY_BUFFER) {
226+
tracing::info!("certificate approaching expiry, rotating");
227+
cert_resolver
228+
.rotate_certificate()
229+
.await
230+
.context(RotateCertificateSnafu)?;
231+
}
208232
}
209233

210234
// This is cancellation-safe. If cancelled, no new connections are accepted.

0 commit comments

Comments
 (0)