Skip to content
11 changes: 11 additions & 0 deletions quickwit/quickwit-common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,17 @@ pub fn is_true(value: &bool) -> bool {
*value
}

/// `true` when the current process is running inside an AWS Lambda runtime.
///
/// Detected via the presence of the `AWS_LAMBDA_FUNCTION_NAME` environment
/// variable, which the Lambda runtime sets automatically. The result is cached
/// on first access since environment variables are read once at process start.
pub fn is_running_in_lambda() -> bool {
static IS_LAMBDA: std::sync::LazyLock<bool> =
std::sync::LazyLock::new(|| std::env::var_os("AWS_LAMBDA_FUNCTION_NAME").is_some());
*IS_LAMBDA
}

pub fn chunk_range(range: Range<usize>, chunk_size: usize) -> impl Iterator<Item = Range<usize>> {
range.clone().step_by(chunk_size).map(move |block_start| {
let block_end = (block_start + chunk_size).min(range.end);
Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-jaeger/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2723,6 +2723,7 @@ mod tests {
scroll_id: None,
failed_splits: Vec::new(),
num_successful_splits: 1,
resource_stats: None,
})
});

Expand Down
10 changes: 7 additions & 3 deletions quickwit/quickwit-lambda-client/src/invoker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,18 @@ impl LambdaLeafSearchInvoker for AwsLambdaInvoker {
let start = std::time::Instant::now();
let result = self.invoke_leaf_search_with_retry(request).await;
let elapsed = start.elapsed().as_secs_f64();
let status = if result.is_ok() { "success" } else { "error" };
let outcome = match &result {
Ok(_) => "success",
Err(SearchError::Timeout(_)) => "timeout",
Err(_) => "other",
};
LAMBDA_METRICS
.leaf_search_requests_total
.with_label_values([status])
.with_label_values([outcome])
.inc();
LAMBDA_METRICS
.leaf_search_duration_seconds
.with_label_values([status])
.with_label_values([outcome])
.observe(elapsed);
result
}
Expand Down
4 changes: 2 additions & 2 deletions quickwit/quickwit-lambda-client/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ impl Default for LambdaMetrics {
"Total number of Lambda leaf search invocations.",
"lambda",
&[],
["status"],
["outcome"],
),
leaf_search_duration_seconds: new_histogram_vec(
"leaf_search_duration_seconds",
"Duration of Lambda leaf search invocations in seconds.",
"lambda",
&[],
["status"],
["outcome"],
duration_buckets(),
),
leaf_search_request_payload_size_bytes: new_histogram(
Expand Down
16 changes: 16 additions & 0 deletions quickwit/quickwit-proto/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,16 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.unwrap();

// Search service.
//
// Unlike the other services above, search goes through `tonic_prost_build` directly
// (not through `quickwit_codegen::Codegen`, which emits `cargo:rerun-if-changed` for
// every proto it compiles). `prost_build` 0.14 has a TODO acknowledging it does not
// emit those directives itself, and `tonic_prost_build` 0.14.5 exposes an
// `emit_rerun_if_changed` setter but never forwards it to the underlying `Config`.
// Without the explicit hint below, edits to `search.proto` do not retrigger build.rs
// (because the other `Codegen` calls have already narrowed cargo's watch list).
println!("cargo:rerun-if-changed=protos/quickwit/search.proto");

let mut prost_config = prost_build::Config::default();
prost_config
.file_descriptor_set_path("src/codegen/quickwit/search_descriptor.bin")
Expand All @@ -199,6 +209,12 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.type_attribute("SortByValue", "#[derive(Ord, PartialOrd)]")
.type_attribute("SearchRequest", "#[derive(Hash, Eq)]")
.type_attribute("PartialHit", "#[derive(Hash, Eq)]")
// The `Response` variant carries a `LeafSearchResponse` which now
// embeds `LeafResourceStats`.
.type_attribute(
"LambdaSingleSplitResult.outcome",
"#[allow(clippy::large_enum_variant)]",
)
.out_dir("src/codegen/quickwit")
.compile_with_config(
prost_config,
Expand Down
106 changes: 99 additions & 7 deletions quickwit/quickwit-proto/protos/quickwit/search.proto
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,9 @@ message SearchResponse {

// Total number of successful splits searched.
uint64 num_successful_splits = 8;

// Resource statistics for the root search.
RootResourceStats resource_stats = 10;
}

message SearchPlanResponse {
Expand Down Expand Up @@ -355,12 +358,98 @@ message LeafSearchRequest {
repeated string index_uris = 9;
}

message ResourceStats {
uint64 short_lived_cache_num_bytes = 1;
uint64 split_num_docs = 2;
uint64 warmup_microsecs = 3;
uint64 cpu_thread_pool_wait_microsecs = 4;
uint64 cpu_microsecs = 5;
// Per-split resource statistics.
//
// All fields are extensive (sum across splits is meaningful) except where noted.
message SplitResourceStats {
// Number of documents in the split.
uint64 split_num_docs = 1;
// Bytes resident in the warmup short-lived cache after warmup
// (measure of input data the search needed to process).
uint64 input_memory_bytes = 2;
// Bytes downloaded from storage during the request, as measured
// by a request-scoped storage wrapper.
uint64 download_num_bytes = 3;
// Number of storage GET requests issued during the request.
uint64 download_num_requests = 4;
// Number of documents matched by the query in this split (we always count).
uint64 matched_num_docs = 5;

// Time the split spent waiting to acquire its search permit.
uint64 wait_for_search_permit_microsecs = 6;
// Time spent in the warmup phase (downloading and indexing data into caches).
uint64 warmup_microsecs = 7;
// Time the split spent waiting for a slot on the CPU pool after warmup.
uint64 wait_for_cpu_pool_microsecs = 8;
// CPU time spent in the search itself (predicate matching + collection +
// per-segment harvest), as measured around the tantivy search call.
// Excludes any cross-split finalize work performed outside the single-split
// search.
uint64 cpu_search_microsecs = 9;
}

// Resource statistics for a single leaf-search call (over one or more splits).
// If the configuration allows it, leaf nodes can offload part of their computation to
// lambdas.
//
// `split_resources_worst` / `split_resources_sum` aggregations on
// `LeafResourceStats` are only computed for locally-executed splits.
//
// All numeric fields are extensive (summed when merging across leaves).
// `split_resources_worst` is the exception — it is selected by ranking, not
// summed. `lambda_bottleneck` is 0 or 1 at the source and becomes a count of
// leaves where lambda was the bottleneck once aggregated.
message LeafResourceStats {
// Number of splits whose results came from the partial result cache.
uint64 partial_result_cache_num_splits = 1;
// Sum of `split.num_docs` across cache-hit splits.
uint64 partial_result_cache_num_docs = 2;

// Number of splits executed locally (excluding cache hits and lambda).
uint64 localexec_num_splits = 8;
// Sum of `split.num_docs` across locally-executed splits.
uint64 localexec_num_docs = 9;
// The worst single-split contribution, ranked by
// `warmup + wait_for_cpu_pool + cpu_predicate + cpu_collection + cpu_harvest`
// (intentionally excludes `wait_for_search_permit`).
SplitResourceStats split_resources_worst = 10;
// Field-wise sum of `SplitResourceStats` across all locally-executed splits.
// If you want to compute averages, divide by localexec_num_splits.
SplitResourceStats split_resources_sum = 11;
// Wall-clock duration of the leaf search (set in `multi_index_leaf_search`).
uint64 wall_time_microsecs = 12;

// Number of splits dispatched to lambda (offloaded execution).
uint64 lambda_num_splits = 3;
// Sum of `split.num_docs` across lambda-dispatched splits.
uint64 lambda_num_docs = 4;
// Number of lambda-dispatched splits that succeeded.
uint64 lambda_success_num_splits = 5;
// Sum of `split.num_docs` across successful lambda-dispatched splits.
uint64 lambda_success_num_docs = 6;
// At the source (a single leaf call) this is 1 if the offloaded path
// finished after the local path (lambda was the bottleneck), 0 otherwise.
// Forced to 0 when `lambda_num_splits == 0`. At aggregate levels (merged
// across leaves) it is the count of leaves where lambda was the
// bottleneck. Voluntarily a uint64 (not a bool) for summability.
uint64 lambda_bottleneck = 7;
}

// Resource statistics for a root search.
message RootResourceStats {
// The leaf with the largest `wall_time_microsecs`.
LeafResourceStats leaf_resources_worst = 1;
// Field-wise sum of all leaf stats. `wall_time_microsecs` is summed even though
// it is not strictly extensive — the sum still gives a useful view of total leaf time.
// If you want to compute averages, divide by leaf_num_calls: failed leaf attempts
// usually do not come with proper counters.
LeafResourceStats leaf_resources_sum = 2;
// Number of leaf calls excluding retries.
uint64 leaf_num_calls = 3;
// Number of leaf calls, including retries.
uint64 leaf_num_calls_including_retries = 4;
// Number of failed splits across all leaves.
uint64 num_failed_splits = 5;
}

// LeafRequestRef references data in LeafSearchRequest to deduplicate data.
Expand Down Expand Up @@ -496,7 +585,10 @@ message LeafSearchResponse {
// postcard serialized intermediate aggregation_result.
optional bytes intermediate_aggregation_result = 6;

ResourceStats resource_stats = 8;
// [deprecated] ResourceStats resource_stats = 8;
reserved 8;

LeafResourceStats resource_stats = 9;
}

// The result of searching a single split in a Lambda invocation.
Expand Down
124 changes: 117 additions & 7 deletions quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading