Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions engine/artifacts/errors/guard.tunnel_message_timeout.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions engine/artifacts/errors/guard.tunnel_request_aborted.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions engine/artifacts/errors/guard.tunnel_response_closed.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 40 additions & 0 deletions engine/packages/guard-core/src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,46 @@
#[error("guard", "service_unavailable", "Service unavailable.")]
pub struct ServiceUnavailable;

/// Error returned when the actor stops while the gateway is still waiting for
/// the response to an in-flight request.
///
/// Guard maps `guard.actor_stopped_while_waiting` to HTTP 503 and lists it
/// among the retryable guard error codes.
#[derive(RivetError, Serialize, Deserialize)]
#[error(
"guard",
"actor_stopped_while_waiting",
"Actor stopped while waiting for a response."
)]
pub struct ActorStoppedWhileWaiting;

Check warning on line 109 in engine/packages/guard-core/src/errors.rs

View workflow job for this annotation

GitHub Actions / Rustfmt

Diff in /home/runner/work/rivet/rivet/engine/packages/guard-core/src/errors.rs

#[derive(RivetError, Serialize, Deserialize)]
#[error(
"guard",
"tunnel_request_aborted",
"Actor tunnel aborted the request."
)]
pub struct TunnelRequestAborted;

#[derive(RivetError, Serialize, Deserialize)]
#[error(

Check warning on line 120 in engine/packages/guard-core/src/errors.rs

View workflow job for this annotation

GitHub Actions / Rustfmt

Diff in /home/runner/work/rivet/rivet/engine/packages/guard-core/src/errors.rs
"guard",
"tunnel_message_timeout",
"Actor tunnel message timed out."
)]
pub struct TunnelMessageTimeout;

/// Error returned when the tunnel message stream ends before any response to
/// the in-flight request has been received.
///
/// Guard maps `guard.tunnel_response_closed` to HTTP 503 and lists it among
/// the retryable guard error codes.
#[derive(RivetError, Serialize, Deserialize)]
#[error(
"guard",
"tunnel_response_closed",
"Actor tunnel closed before sending a response."
)]
pub struct TunnelResponseClosed;

/// Error returned when the gateway's response-start timeout elapses before the
/// actor begins sending a response.
///
/// Guard maps `guard.gateway_response_start_timeout` to HTTP 504 and lists it
/// among the retryable guard error codes.
#[derive(RivetError, Serialize, Deserialize)]
#[error(
"guard",
"gateway_response_start_timeout",
"Timed out waiting for actor response start."
)]
pub struct GatewayResponseStartTimeout;

#[derive(RivetError, Serialize, Deserialize)]
#[error(
"guard",
Expand Down
31 changes: 26 additions & 5 deletions engine/packages/guard-core/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ pub(crate) fn err_into_response(err: anyhow::Error) -> Result<Response<ResponseB
("guard", "retry_attempts_exceeded") => StatusCode::BAD_GATEWAY,
("actor", "not_found") => StatusCode::NOT_FOUND,
("guard", "service_unavailable") => StatusCode::SERVICE_UNAVAILABLE,
("guard", "actor_stopped_while_waiting") => StatusCode::SERVICE_UNAVAILABLE,
("guard", "tunnel_request_aborted") => StatusCode::SERVICE_UNAVAILABLE,
("guard", "tunnel_message_timeout") => StatusCode::GATEWAY_TIMEOUT,
("guard", "tunnel_response_closed") => StatusCode::SERVICE_UNAVAILABLE,
("guard", "gateway_response_start_timeout") => StatusCode::GATEWAY_TIMEOUT,
("guard", "actor_ready_timeout") => StatusCode::SERVICE_UNAVAILABLE,
("guard", "no_route") => StatusCode::NOT_FOUND,
("guard", "invalid_request_body") => StatusCode::PAYLOAD_TOO_LARGE,
Expand Down Expand Up @@ -218,19 +223,35 @@ pub(crate) fn should_retry_request(res: &Result<Response<ResponseBody>>) -> bool
Ok(resp) => should_retry_request_inner(resp.status(), resp.headers()),
Err(err) => {
if let Some(rivet_err) = err.chain().find_map(|x| x.downcast_ref::<RivetError>()) {
rivet_err.group() == "guard" && rivet_err.code() == "service_unavailable"
rivet_err.group() == "guard" && is_retryable_guard_http_error(rivet_err.code())
} else {
false
}
}
}
}

// Determine if a response should trigger a retry. Guard-specific actor startup
// failures, including guard.actor_ready_timeout, are signaled as 503 with
// x-rivet-error and should be retried against a freshly resolved target.
/// Reports whether `code` is a `guard` error code that marks a transient
/// failure worth retrying.
///
/// The comparison is exact and case-sensitive; any code not in the table
/// below, including the empty string, is treated as non-retryable.
fn is_retryable_guard_http_error(code: &str) -> bool {
	// Codes within the `guard` error group that are considered transient.
	const RETRYABLE_GUARD_CODES: [&str; 7] = [
		"service_unavailable",
		"actor_ready_timeout",
		"actor_stopped_while_waiting",
		"tunnel_request_aborted",
		"tunnel_message_timeout",
		"tunnel_response_closed",
		"gateway_response_start_timeout",
	];

	RETRYABLE_GUARD_CODES.contains(&code)
}

/// Determines whether an HTTP response should trigger a retry.
///
/// A response is retried only when both conditions hold:
/// - the status is 503 (Service Unavailable) or 504 (Gateway Timeout), and
/// - the `x-rivet-error` header is valid text of the form `group.code`, where
///   `group` is `guard` and `code` is one of the retryable guard error codes.
///
/// A missing header, a header that is not valid visible ASCII, or a value
/// without a `.` separator all result in `false`.
pub(crate) fn should_retry_request_inner(status: StatusCode, headers: &hyper::HeaderMap) -> bool {
	(status == StatusCode::SERVICE_UNAVAILABLE || status == StatusCode::GATEWAY_TIMEOUT)
		&& headers
			.get(X_RIVET_ERROR)
			.and_then(|value| value.to_str().ok())
			// Split on the first `.` only: the group precedes it, the code follows.
			.and_then(|value| value.split_once('.'))
			.is_some_and(|(group, code)| group == "guard" && is_retryable_guard_http_error(code))
}

// Determine if a websocket error is retryable (e.g., transient UPS/tunnel issues)
Expand Down
17 changes: 9 additions & 8 deletions engine/packages/pegboard-gateway/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ use rivet_error::*;
use rivet_guard_core::{
ResponseBody, WebSocketHandle,
custom_serve::{CustomServeTrait, HibernationResult},
errors::{ServiceUnavailable, WebSocketServiceUnavailable},
errors::{
ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout,
TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable,
},
request_context::RequestContext,
utils::is_ws_hibernate,
websocket_handle::WebSocketReceiver,
Expand Down Expand Up @@ -168,7 +171,7 @@ impl PegboardGateway {
}
protocol::mk2::ToServerTunnelMessageKind::ToServerResponseAbort => {
tracing::warn!("request aborted");
return Err(ServiceUnavailable.build());
return Err(TunnelRequestAborted.build());
}
_ => {
tracing::warn!("received non-response message from pubsub");
Expand All @@ -179,21 +182,19 @@ impl PegboardGateway {
request_id=%protocol::util::id_to_string(&request_id),
"received no message response during request init",
);
break;
return Err(TunnelResponseClosed.build());
}
}
_ = stopped_sub.next() => {
tracing::debug!("actor stopped while waiting for request response");
return Err(ServiceUnavailable.build());
return Err(ActorStoppedWhileWaiting.build());
}
_ = drop_rx.changed() => {
tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout");
return Err(ServiceUnavailable.build());
return Err(TunnelMessageTimeout.build());
}
}
}

Err(ServiceUnavailable.build())
};
let response_start_timeout = Duration::from_millis(
self.ctx
Expand All @@ -206,7 +207,7 @@ impl PegboardGateway {
.map_err(|_| {
tracing::warn!("timed out waiting for response start from runner");

ServiceUnavailable.build()
GatewayResponseStartTimeout.build()
})??;
tracing::debug!("response handler task ended");

Expand Down
17 changes: 9 additions & 8 deletions engine/packages/pegboard-gateway2/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ use rivet_error::*;
use rivet_guard_core::{
ResponseBody, WebSocketHandle,
custom_serve::{CustomServeTrait, HibernationResult},
errors::{ServiceUnavailable, WebSocketServiceUnavailable},
errors::{
ActorStoppedWhileWaiting, GatewayResponseStartTimeout, TunnelMessageTimeout,
TunnelRequestAborted, TunnelResponseClosed, WebSocketServiceUnavailable,
},
request_context::RequestContext,
utils::is_ws_hibernate,
};
Expand Down Expand Up @@ -171,7 +174,7 @@ impl PegboardGateway2 {
}
protocol::ToRivetTunnelMessageKind::ToRivetResponseAbort => {
tracing::warn!("request aborted");
return Err(ServiceUnavailable.build());
return Err(TunnelRequestAborted.build());
}
_ => {
tracing::warn!("received non-response message from pubsub");
Expand All @@ -182,21 +185,19 @@ impl PegboardGateway2 {
request_id=%protocol::util::id_to_string(&request_id),
"received no message response during request init",
);
break;
return Err(TunnelResponseClosed.build());
}
}
_ = stopped_sub.next() => {
tracing::debug!("actor stopped while waiting for request response");
return Err(ServiceUnavailable.build());
return Err(ActorStoppedWhileWaiting.build());
}
_ = drop_rx.changed() => {
tracing::warn!(reason=?drop_rx.borrow(), "tunnel message timeout");
return Err(ServiceUnavailable.build());
return Err(TunnelMessageTimeout.build());
}
}
}

Err(ServiceUnavailable.build())
}
.instrument(tracing::info_span!("wait_for_tunnel_response"));
let response_start_timeout = Duration::from_millis(
Expand All @@ -210,7 +211,7 @@ impl PegboardGateway2 {
.map_err(|_| {
tracing::warn!("timed out waiting for response start from envoy");

ServiceUnavailable.build()
GatewayResponseStartTimeout.build()
})??;
tracing::debug!("response handler task ended");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,21 @@ function classifyActorError(
);
}

if (
error.group === "guard" &&
isRetryableGuardGatewayHttpError(error.code)
) {
return buildLifecycleBoundaryInfo(
"request_retry",
"actor_error",
error.message,
{
group: error.group,
code: error.code,
},
);
}

// TODO(RVT-6193): Remove this legacy match after structured restart errors
// are authoritative everywhere.
if (
Expand Down Expand Up @@ -144,6 +159,17 @@ function classifyActorError(
return undefined;
}

/**
 * Reports whether a `guard` error code denotes a transient gateway failure
 * that is safe to retry.
 *
 * Matching is exact; any code not listed below returns `false`.
 */
function isRetryableGuardGatewayHttpError(code: string): boolean {
	switch (code) {
		case "service_unavailable":
		case "actor_stopped_while_waiting":
		case "tunnel_request_aborted":
		case "tunnel_message_timeout":
		case "tunnel_response_closed":
		case "gateway_response_start_timeout":
			return true;
		default:
			return false;
	}
}

function classifyTransportError(
error: Error,
): LifecycleBoundaryInfo | undefined {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { describe, expect, test } from "vitest";
import { ActorError } from "../src/client/errors";
import { isRetryableLifecycleRequestError } from "../src/client/lifecycle-errors";

// Verifies that each transient guard gateway error code is classified as a
// retryable lifecycle request error by the client.
describe("lifecycle error retry classification", () => {
// One parameterized case per guard error code; `%s` in the title is replaced
// with the code under test.
test.each([
"service_unavailable",
"actor_stopped_while_waiting",
"tunnel_request_aborted",
"tunnel_message_timeout",
"tunnel_response_closed",
"gateway_response_start_timeout",
])("classifies guard.%s as retryable", (code) => {
expect(
isRetryableLifecycleRequestError(
// Only the group and code vary per case; the message is a fixed placeholder.
new ActorError("guard", code, "transient gateway error"),
),
).toBe(true);
});
});
9 changes: 8 additions & 1 deletion website/src/content/docs/clients/javascript.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,14 @@ const ws = await handle.webSocket("probe", undefined, {
});
```

Requests still return a transient `actor.stopping` lifecycle error (`{"group":"actor","code":"stopping","message":"Actor is stopping."}`) if the actor has fully stopped, i.e. the sleep grace period has ended but it has not yet restarted. Retry once the actor is available again.
Requests can still return transient lifecycle or gateway errors. Retry once the actor is available again.

- `actor.stopping`: the actor has fully stopped, i.e. the sleep grace period has ended but it has not yet restarted.
- `guard.actor_stopped_while_waiting`: the request reached the actor tunnel, but the actor stopped before the gateway received a response.
- `guard.tunnel_request_aborted`: the actor tunnel aborted the request before a response started.
- `guard.tunnel_message_timeout`: the gateway dropped the in-flight tunnel request after its tunnel message timeout.
- `guard.tunnel_response_closed`: the actor tunnel closed before sending a response.
- `guard.gateway_response_start_timeout`: the gateway timed out waiting for the actor response to start.

## API Reference

Expand Down
Loading