Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#
# Copyright Kroxylicious Authors.
#
# Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
#

{{- if .Values.kafka.metrics.enabled }}
# Rule set for the JMX Prometheus exporter on the Kafka brokers. It is only
# rendered when kafka.metrics.enabled is true, and is referenced by name
# (kafka-jmx-metrics-config / kafka-metrics-config.yml) from the Kafka
# resource's metricsConfig.
#
# The exporter matches each pattern against strings of the form
#   domain<prop1=value1, prop2=value2><keys>attribute
# so every pattern must list the MBean's properties (type, name, ...) in order
# and end with the attribute that is being read.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kafka-jmx-metrics-config
  labels:
    app: kafka
data:
  kafka-metrics-config.yml: |
    lowercaseOutputName: true
    rules:
      # Request handler pool idle % - near 0 means Kafka I/O thread pool is saturated.
      # The MBean is type=KafkaRequestHandlerPool,name=RequestHandlerAvgIdlePercent
      # (a meter), so the name property must be part of the pattern and the value
      # is read from the OneMinuteRate attribute.
      # NOTE(review): the "avag" spelling in the exported name is kept unchanged so
      # that anything already consuming this name keeps working.
      - pattern: "kafka.server<type=KafkaRequestHandlerPool, name=RequestHandlerAvgIdlePercent><>OneMinuteRate"
        name: kafka_server_requesthandlerpool_requesthandleravagidle
        type: GAUGE

      # Network processor idle % - near 0 means network threads saturated.
      # The MBean is type=SocketServer,name=NetworkProcessorAvgIdlePercent (a gauge),
      # whose single attribute is Value.
      - pattern: "kafka.network<type=SocketServer, name=NetworkProcessorAvgIdlePercent><>Value"
        name: kafka_network_socketserver_networkprocessoravagidle
        type: GAUGE

      # Broker-wide byte rates - aggregate inbound/outbound throughput.
      # "><>" directly after the name keeps these from matching the per-topic beans.
      - pattern: "kafka.server<type=BrokerTopicMetrics, name=BytesInPerSec><>OneMinuteRate"
        name: kafka_server_brokertopicmetrics_bytesinpersec_rate
        type: GAUGE
      - pattern: "kafka.server<type=BrokerTopicMetrics, name=BytesOutPerSec><>OneMinuteRate"
        name: kafka_server_brokertopicmetrics_bytesoutpersec_rate
        type: GAUGE

      # Messages in per second (broker total)
      - pattern: "kafka.server<type=BrokerTopicMetrics, name=MessagesInPerSec><>OneMinuteRate"
        name: kafka_server_brokertopicmetrics_messagesinpersec_rate
        type: GAUGE

      # Per-topic byte rates - identify which topic is hitting the ceiling.
      # Same metric names as the broker-wide series, distinguished by the topic label.
      - pattern: "kafka.server<type=BrokerTopicMetrics, name=BytesInPerSec, topic=(.+)><>OneMinuteRate"
        name: kafka_server_brokertopicmetrics_bytesinpersec_rate
        labels:
          topic: "$1"
        type: GAUGE
      - pattern: "kafka.server<type=BrokerTopicMetrics, name=BytesOutPerSec, topic=(.+)><>OneMinuteRate"
        name: kafka_server_brokertopicmetrics_bytesoutpersec_rate
        labels:
          topic: "$1"
        type: GAUGE

      # ISR shrinks - indicates replica lag / replication bottleneck
      - pattern: "kafka.server<type=ReplicaManager, name=IsrShrinksPerSec><>OneMinuteRate"
        name: kafka_server_replicamanager_isrshrinks_rate
        type: GAUGE

      # Produce request latency - how long Kafka spends processing produce requests.
      # Every attribute of the bean is exported; the attribute name becomes the
      # quantile label.
      - pattern: "kafka.network<type=RequestMetrics, name=TotalTimeMs, request=Produce><>(\\w+)"
        name: kafka_network_requestmetrics_totaltimems_produce
        labels:
          quantile: "$1"
        type: GAUGE
{{- end }}
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ metadata:
spec:
kafka:
version: {{ .Values.kafka.version }}
{{- if .Values.kafka.metrics.enabled }}
metricsConfig:
type: jmxPrometheusExporter
valueFrom:
configMapKeyRef:
name: kafka-jmx-metrics-config
key: kafka-metrics-config.yml
{{- end }}
listeners:
- name: plain
port: 9092
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ kafka:
storage:
size: "100Gi"
storageClass: "" # Use default storage class
metrics:
enabled: false # Enable JMX Prometheus exporter on Kafka brokers (port 9404)

# Kroxylicious proxy (enabled for proxy scenarios)
# Requires the Kroxylicious operator to be installed in the cluster.
Expand Down
105 changes: 105 additions & 0 deletions kroxylicious-openmessaging-benchmarks/scripts/poll-kafka-metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env bash
#
# Copyright Kroxylicious Authors.
#
# Licensed under the Apache Software License version 2.0, available at http://www.apache.org/licenses/LICENSE-2.0
#

set -euo pipefail

# Polls the Kafka JMX Prometheus exporter during a benchmark run.
# Intended to be started as a background process by run-benchmark.sh.
#
# The exporter is only available when kafka.metrics.enabled=true in the Helm chart.
# If the endpoint does not respond within the initial timeout, this script exits
# cleanly (exit 0) so run-benchmark.sh is not disrupted when metrics are disabled.
#
# Each poll appends a snapshot header followed by the raw Prometheus text format output.
# The header line format is:
#   # SNAPSHOT datetime=<ISO8601 UTC>
# The Unix epoch time of the poll is written right after the header as a
# benchmark_sample_timestamp_seconds gauge sample.
#
# Usage: poll-kafka-metrics.sh <broker-pod> <namespace> <output-dir> [interval-seconds]

#######################################
# Prints the command-line help and aborts.
# Arguments: none
# Outputs:   help text on stderr (nothing on stdout)
# Returns:   never returns; exits the script with status 1
#######################################
usage() {
  {
    # Only the first line needs the script name; the rest is literal text.
    printf 'Usage: %s <broker-pod> <namespace> <output-dir> [interval-seconds]\n' \
      "$(basename "$0")"
    cat <<'HELP'

Polls the Kafka JMX Prometheus exporter endpoint (/metrics on port 9404) via
kubectl port-forward and appends timestamped Prometheus snapshots to
<output-dir>/kafka-metrics.txt.

Exits cleanly (exit 0) if the endpoint does not respond within 15 seconds —
this happens when kafka.metrics.enabled=false in the Helm chart.

Arguments:
broker-pod Kubernetes pod name for a Kafka broker
namespace Kubernetes namespace containing the pod
output-dir Directory to write kafka-metrics.txt into
interval-seconds Polling interval in seconds (default: 30)
HELP
  } >&2
  exit 1
}

# --- Argument handling ---
# The first three positional arguments are mandatory; the polling interval is
# optional and defaults to 30 seconds.
if [[ $# -lt 3 ]]; then
  usage
fi

BROKER_POD="$1"
NAMESPACE="$2"
OUTPUT_DIR="$3"
INTERVAL="${4:-30}"

# Validate the interval up front. A non-numeric value would only fail later at
# the first `sleep` (killing the poller after one snapshot under `set -e`), and
# a zero interval would turn the polling loop into a busy loop.
if [[ ! "${INTERVAL}" =~ ^[0-9]+([.][0-9]+)?$ || "${INTERVAL}" =~ ^[0.]+$ ]]; then
  echo "Invalid interval-seconds '${INTERVAL}': expected a positive number of seconds" >&2
  usage
fi

# All snapshots are appended to this single file inside the output directory.
METRICS_FILE="${OUTPUT_DIR}/kafka-metrics.txt"
# Local end of the kubectl port-forward to the exporter port (9404) on the pod.
LOCAL_PORT=19404

#######################################
# Stops the background kubectl port-forward, if one was started.
# Globals:   PF_PID (read) - PID of the port-forward; may be unset or empty
# Returns:   always 0; a failing kill (process already gone) is ignored
#######################################
cleanup() {
  local forward_pid="${PF_PID:-}"

  # Nothing to do when the port-forward was never launched.
  if [[ -z "${forward_pid}" ]]; then
    return 0
  fi

  kill "${forward_pid}" 2>/dev/null || true
}

# Run the cleanup on every exit path of the script.
trap cleanup EXIT

mkdir -p "${OUTPUT_DIR}"

# Forward the exporter port of the broker pod to localhost in the background.
# kubectl's own output is discarded; whether the forward works is established
# by the readiness probe below.
echo "Starting port-forward to ${BROKER_POD}:9404 on localhost:${LOCAL_PORT}..."
kubectl port-forward "pod/${BROKER_POD}" "${LOCAL_PORT}:9404" \
  -n "${NAMESPACE}" &>/dev/null &
PF_PID=$!

# Wait for endpoint to respond. Exit cleanly if it doesn't — JMX exporter is not deployed.
echo "Waiting for Kafka JMX metrics endpoint to be ready..."
PF_DEADLINE=$((SECONDS + 15))
while true; do
  # Bound every probe by the time left until the deadline (at least 1s).
  # Without a limit, a connection that is accepted but never answered would
  # block curl indefinitely and the deadline check below would never run.
  PROBE_TIMEOUT=$((PF_DEADLINE - SECONDS))
  if ((PROBE_TIMEOUT < 1)); then
    PROBE_TIMEOUT=1
  fi

  if curl -sf --max-time "${PROBE_TIMEOUT}" \
    "http://localhost:${LOCAL_PORT}/metrics" >/dev/null 2>&1; then
    break
  fi

  if [[ $SECONDS -ge $PF_DEADLINE ]]; then
    echo "Kafka JMX metrics endpoint not available on ${BROKER_POD}:9404 — skipping Kafka metrics collection" \
      "(enable with kafka.metrics.enabled=true in cluster-overrides.yaml)" >&2
    exit 0
  fi

  # A port-forward process that has already exited will never become ready.
  if ! kill -0 "${PF_PID}" 2>/dev/null; then
    echo "Kafka metrics port-forward exited — JMX exporter likely not deployed" >&2
    exit 0
  fi

  sleep 1
done
echo "Kafka JMX metrics endpoint ready."

# Start a fresh metrics file with a short header describing this polling run.
{
  echo "# kafka-metrics polling started"
  echo "# broker=${BROKER_POD} namespace=${NAMESPACE} interval=${INTERVAL}s"
  echo "# started=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
} > "${METRICS_FILE}"

# Upper bound, in seconds, for a single scrape. Without it one hung request
# would block the loop and silently stop all further sampling.
FETCH_TIMEOUT=10

# Poll until the script is terminated from outside. Each iteration appends one
# snapshot: a blank line, the SNAPSHOT header, a synthetic timestamp gauge and
# then either the exporter output or a warning comment.
while true; do
  NOW=$(date +%s)
  {
    echo ""
    echo "# SNAPSHOT datetime=$(date -u +%Y-%m-%dT%H:%M:%SZ)"
    echo "# HELP benchmark_sample_timestamp_seconds Unix timestamp of this metrics snapshot"
    echo "# TYPE benchmark_sample_timestamp_seconds gauge"
    echo "benchmark_sample_timestamp_seconds ${NOW}"
    # Capture the response first so that a failed or timed-out transfer never
    # leaves a truncated snapshot in the file; only complete responses are written.
    if SNAPSHOT=$(curl -sf --max-time "${FETCH_TIMEOUT}" \
      "http://localhost:${LOCAL_PORT}/metrics"); then
      printf '%s\n' "${SNAPSHOT}"
    else
      echo "# WARNING: kafka metrics fetch failed at ${NOW}"
    fi
  } >> "${METRICS_FILE}"
  sleep "${INTERVAL}"
done
29 changes: 29 additions & 0 deletions kroxylicious-openmessaging-benchmarks/scripts/run-benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -176,13 +176,15 @@ if ! kubectl auth can-i get pods -n "${NAMESPACE}" &>/dev/null; then
fi

METRICS_PID=""
KAFKA_METRICS_PID=""
LOGS_PID=""

teardown() {
echo ""
echo "--- Tearing down benchmark infrastructure ---"
stop_logs_tailer
stop_metrics_poller
stop_kafka_metrics_poller
if helm status "${HELM_RELEASE}" -n "${NAMESPACE}" &>/dev/null; then
helm uninstall "${HELM_RELEASE}" -n "${NAMESPACE}" --wait --timeout 120s
fi
Expand Down Expand Up @@ -291,6 +293,31 @@ stop_metrics_poller() {
fi
}

#######################################
# Launches poll-kafka-metrics.sh in the background for the first Kafka broker
# pod returned by the label selector. Does nothing when no pod is found.
# Globals:   NAMESPACE, OUTPUT_DIR, METRICS_INTERVAL, SCRIPT_DIR (read)
#            KAFKA_METRICS_PID (written) - PID of the background poller
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
start_kafka_metrics_poller() {
  local -r broker_selector="strimzi.io/cluster=kafka,strimzi.io/pool-name=kafka-pool"
  local -a lookup_cmd=(
    kubectl get pod -n "${NAMESPACE}"
    -l "${broker_selector}"
    -o "jsonpath={.items[0].metadata.name}"
  )

  # A failed lookup (e.g. no matching pods) is not an error: it simply leaves
  # the pod name empty and the poller is skipped.
  local broker_pod=""
  broker_pod=$("${lookup_cmd[@]}" 2>/dev/null) || :

  [[ -n "${broker_pod}" ]] || return 0

  echo "Starting Kafka JMX metrics polling (every ${METRICS_INTERVAL}s) for pod ${broker_pod}..."
  mkdir -p "${OUTPUT_DIR}"

  local -r poller_script="${SCRIPT_DIR}/poll-kafka-metrics.sh"
  "${poller_script}" "${broker_pod}" "${NAMESPACE}" "${OUTPUT_DIR}" "${METRICS_INTERVAL}" &
  KAFKA_METRICS_PID=$!

  echo "Kafka metrics poller running (PID ${KAFKA_METRICS_PID})"
}

#######################################
# Terminates the background Kafka metrics poller, if one is running, and
# waits for it to finish.
# Globals:   KAFKA_METRICS_PID (read, then reset to empty)
# Outputs:   a progress message on stdout when a poller is stopped
# Returns:   0; failures of kill/wait (process already gone) are ignored
#######################################
stop_kafka_metrics_poller() {
  local poller_pid="${KAFKA_METRICS_PID}"

  # No recorded PID means the poller was never started or was already stopped.
  [[ -n "${poller_pid}" ]] || return 0

  echo "Stopping Kafka metrics poller (PID ${poller_pid})..."
  kill "${poller_pid}" 2>/dev/null || true
  wait "${poller_pid}" 2>/dev/null || true

  # Clear the PID so that a second call is a no-op.
  KAFKA_METRICS_PID=""
}

# Creates the results PVC if it does not already exist.
# The PVC is not managed by Helm — it persists across probes and Helm installs.
ensure_results_pvc() {
Expand Down Expand Up @@ -590,6 +617,7 @@ fi
create_benchmark_job

start_metrics_poller
start_kafka_metrics_poller

echo ""
echo "--- Running benchmark (${SCENARIO} / ${WORKLOAD}) ---"
Expand Down Expand Up @@ -726,6 +754,7 @@ if [[ -n "${PROXY_POD}" ]]; then
fi

stop_metrics_poller
stop_kafka_metrics_poller

# --- Collect results ---

Expand Down
Loading