36 changes: 36 additions & 0 deletions README.md
@@ -77,6 +77,7 @@ Metrics will be available at `http://localhost:2112/metrics`
- `--balance.scrape-interval`: Balance check scrape interval in seconds (default: 30)
- `--verifier.workers`: Number of concurrent workers for block verification (default: 50)
- `--verbose`: Enable verbose logging (default: false)
- `--debug`: Enable debug logging for submission details (default: false; can also be set via the `EVMETRICS_DEBUG=true` environment variable)

### Example with Custom Endpoints

@@ -150,6 +151,26 @@ When metrics are enabled, the following metrics are exposed:
- **Labels**: `chain_id`, `type`
- **Description**: Latest DA height for header and data submissions

### `ev_metrics_submission_attempts_total`
- **Type**: Counter
- **Labels**: `chain_id`, `type`
- **Description**: Total number of DA submission attempts (both successful and failed)

### `ev_metrics_submission_failures_total`
- **Type**: Counter
- **Labels**: `chain_id`, `type`
- **Description**: Total number of failed DA submission attempts

### `ev_metrics_last_submission_attempt_time`
- **Type**: Gauge
- **Labels**: `chain_id`, `type`
- **Description**: Unix timestamp of the last DA submission attempt

### `ev_metrics_last_successful_submission_time`
- **Type**: Gauge
- **Labels**: `chain_id`, `type`
- **Description**: Unix timestamp of the last successful DA submission
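
These metrics can be sanity-checked straight from the metrics endpoint. A minimal sketch, assuming the exporter is running and exposing metrics at the default address mentioned earlier in this README:

```bash
# Submission attempt/failure counters (plus other submission-related series)
curl -s http://localhost:2112/metrics | grep '^ev_metrics_submission_'

# Last-attempt and last-success timestamp gauges
curl -s http://localhost:2112/metrics | grep '^ev_metrics_last_'
```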

### Block Time Metrics

### `ev_metrics_block_time_seconds`
@@ -253,3 +274,18 @@ When `--balance.addresses` and `--balance.consensus-rpc-urls` are provided:
- **Type**: Counter
- **Labels**: `chain_id`, `endpoint`, `error_type`
- **Description**: Total number of consensus RPC endpoint errors by type

## Debug Logging

Debug logging provides detailed visibility into the DA submission process. Enable it with the `--debug` flag or the `EVMETRICS_DEBUG=true` environment variable.

**Use cases**: Troubleshoot submission failures, verify submission flow, diagnose Celestia RPC issues.

**Logs include**: Successful/failed submissions, DA height updates, submission timing.

**Example**:
```bash
# Enable debug logging
export EVMETRICS_DEBUG=true
./ev-metrics monitor --header-namespace testnet_header --data-namespace testnet_data
```
2 changes: 2 additions & 0 deletions pkg/exporters/verifier/verifier.go
@@ -143,10 +143,12 @@ func (e *exporter) processBlocks(ctx context.Context, m *metrics.Metrics, worker

func (e *exporter) onVerified(m *metrics.Metrics, namespace string, blockHeight, daHeight uint64, verified bool, submissionDuration time.Duration) {
if verified {
m.RecordSubmissionAttempt(e.chainID, namespace, true)
m.RecordSubmissionDaHeight(e.chainID, namespace, daHeight)
m.RemoveVerifiedBlock(e.chainID, namespace, blockHeight)
m.RecordSubmissionDuration(e.chainID, namespace, submissionDuration)
} else {
m.RecordSubmissionAttempt(e.chainID, namespace, false)
m.RecordMissingBlock(e.chainID, namespace, blockHeight)
}
}
94 changes: 94 additions & 0 deletions pkg/metrics/metrics.go
@@ -2,6 +2,8 @@ package metrics

import (
"fmt"
"log"
"os"
"sort"
"strings"
"sync"
@@ -29,6 +31,14 @@ type Metrics struct {
SubmissionDuration *prometheus.SummaryVec
// SubmissionDaHeight tracks the DA height at which blocks were submitted.
SubmissionDaHeight *prometheus.GaugeVec
// SubmissionAttemptsTotal tracks the total number of submission attempts.
SubmissionAttemptsTotal *prometheus.CounterVec
// SubmissionFailuresTotal tracks the total number of failed submission attempts.
SubmissionFailuresTotal *prometheus.CounterVec
// LastSubmissionAttemptTime tracks the timestamp of the last submission attempt.
LastSubmissionAttemptTime *prometheus.GaugeVec
// LastSuccessfulSubmissionTime tracks the timestamp of the last successful submission.
LastSuccessfulSubmissionTime *prometheus.GaugeVec
// BlockTime tracks the time between consecutive blocks with histogram buckets for accurate SLO calculations.
BlockTime *prometheus.HistogramVec
// BlockTimeSummary tracks block time with percentiles over a rolling window.
@@ -72,6 +82,11 @@ type Metrics struct {
ranges map[string][]*blockRange // key: blobType -> sorted slice of ranges
}

// isDebugEnabled returns true if debug logging is enabled via environment variable
func isDebugEnabled() bool {
return strings.ToLower(os.Getenv("EVMETRICS_DEBUG")) == "true"
}

type blockRange struct {
start uint64
end uint64
@@ -289,6 +304,38 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
},
[]string{"chain_id", "endpoint", "error_type"},
),
SubmissionAttemptsTotal: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Name: "submission_attempts_total",
Help: "total number of DA submission attempts",
},
[]string{"chain_id", "type"},
),
SubmissionFailuresTotal: factory.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Name: "submission_failures_total",
Help: "total number of failed DA submission attempts",
},
[]string{"chain_id", "type"},
),
LastSubmissionAttemptTime: factory.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "last_submission_attempt_time",
Help: "timestamp of the last DA submission attempt",
},
[]string{"chain_id", "type"},
),
LastSuccessfulSubmissionTime: factory.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: namespace,
Name: "last_successful_submission_time",
Help: "timestamp of the last successful DA submission",
},
[]string{"chain_id", "type"},
),
ranges: make(map[string][]*blockRange),
lastBlockArrivalTime: make(map[string]time.Time),
lastSubmissionDurations: make(map[string]time.Duration),
@@ -297,6 +344,34 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
return m
}

// RecordSubmissionAttempt records a submission attempt and updates related metrics
func (m *Metrics) RecordSubmissionAttempt(chainID, submissionType string, success bool) {
m.mu.Lock()
defer m.mu.Unlock()

// Always record the attempt and the time it happened
m.SubmissionAttemptsTotal.WithLabelValues(chainID, submissionType).Inc()
now := time.Now()
m.LastSubmissionAttemptTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))

if success {
m.LastSuccessfulSubmissionTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
if isDebugEnabled() {
log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
}
} else {
m.SubmissionFailuresTotal.WithLabelValues(chainID, submissionType).Inc()
if isDebugEnabled() {
log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
}
}
}

// RecordSubmissionDaHeight records the DA height only if it's higher than previously recorded
func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHeight uint64) {
m.mu.Lock()
@@ -306,6 +381,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
if daHeight > m.latestHeaderDaHeight {
m.latestHeaderDaHeight = daHeight
m.SubmissionDaHeight.WithLabelValues(chainID, "header").Set(float64(daHeight))
// Debug log when submission DA height is recorded
if isDebugEnabled() {
log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
}
} else {
// Debug log when DA height is not higher than previous
if isDebugEnabled() {
log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
}
}
return
}
@@ -314,6 +398,15 @@ func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHei
if daHeight > m.latestDataDaHeight {
m.latestDataDaHeight = daHeight
m.SubmissionDaHeight.WithLabelValues(chainID, "data").Set(float64(daHeight))
// Debug log when submission DA height is recorded
if isDebugEnabled() {
log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
}
} else {
// Debug log when DA height is not higher than previous
if isDebugEnabled() {
log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
}
}
}
}
@@ -530,6 +623,7 @@ func (m *Metrics) RecordSubmissionDuration(chainID, submissionType string, durat

key := fmt.Sprintf("%s:%s", chainID, submissionType)
m.lastSubmissionDurations[key] = duration

}

// RefreshSubmissionDuration re-observes the last known submission duration to keep the metric alive.
134 changes: 132 additions & 2 deletions pkg/metrics/metrics_test.go
@@ -563,6 +563,130 @@ func TestMetrics_RefreshSubmissionDuration_Empty(t *testing.T) {
})
}

func TestMetrics_RecordSubmissionAttempt(t *testing.T) {
reg := prometheus.NewRegistry()
m := NewWithRegistry("test", reg)

// Test successful submission
beforeAttempt := time.Now()
m.RecordSubmissionAttempt("testchain", "header", true)
afterAttempt := time.Now()

// Verify counters
attempts := getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
"chain_id": "testchain",
"type": "header",
})
require.Equal(t, float64(1), attempts, "should have 1 attempt")

// For successful submission, failures should be 0 (metric may not be exported if 0)
// We'll check if the metric exists and has value 0, or doesn't exist (both are valid)
failuresMetricFound := false
failures := float64(0)
metrics, err := reg.Gather()
require.NoError(t, err)
for _, mf := range metrics {
if mf.GetName() == "test_submission_failures_total" {
for _, m := range mf.GetMetric() {
// check if labels match
match := true
for _, label := range m.GetLabel() {
if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "header"}[label.GetName()]; ok {
if label.GetValue() != expectedVal {
match = false
break
}
}
}
if match && len(m.GetLabel()) == 2 {
failuresMetricFound = true
if m.GetCounter() != nil {
failures = m.GetCounter().GetValue()
}
break
}
}
}
}
if failuresMetricFound {
require.Equal(t, float64(0), failures, "should have 0 failures")
} // else: metric not exported because value is 0, which is expected behavior

// Verify timestamps are within expected range
lastAttemptTime := getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
"chain_id": "testchain",
"type": "header",
})
require.GreaterOrEqual(t, lastAttemptTime, float64(beforeAttempt.Unix()))
require.LessOrEqual(t, lastAttemptTime, float64(afterAttempt.Unix()))

lastSuccessTime := getMetricValue(t, reg, "test_last_successful_submission_time", map[string]string{
"chain_id": "testchain",
"type": "header",
})
require.GreaterOrEqual(t, lastSuccessTime, float64(beforeAttempt.Unix()))
require.LessOrEqual(t, lastSuccessTime, float64(afterAttempt.Unix()))

// Test failed submission
beforeFailure := time.Now()
m.RecordSubmissionAttempt("testchain", "data", false)
afterFailure := time.Now()

// Verify counters
attempts = getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
"chain_id": "testchain",
"type": "data",
})
require.Equal(t, float64(1), attempts, "should have 1 attempt for data")

failures = getMetricValue(t, reg, "test_submission_failures_total", map[string]string{
"chain_id": "testchain",
"type": "data",
})
require.Equal(t, float64(1), failures, "should have 1 failure for data")

// Verify timestamps - should have attempt time but not success time for failed submission
lastAttemptTime = getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
"chain_id": "testchain",
"type": "data",
})
require.GreaterOrEqual(t, lastAttemptTime, float64(beforeFailure.Unix()))
require.LessOrEqual(t, lastAttemptTime, float64(afterFailure.Unix()))

// Last successful submission time should still be 0 for data type (never succeeded)
// Gauge metrics with 0 values may not be exported, so we need to check if it exists
var lastSuccessTimeData float64
var successMetricFoundData bool
metrics, err = reg.Gather()
require.NoError(t, err)
for _, mf := range metrics {
if mf.GetName() == "test_last_successful_submission_time" {
for _, m := range mf.GetMetric() {
// check if labels match
match := true
for _, label := range m.GetLabel() {
if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "data"}[label.GetName()]; ok {
if label.GetValue() != expectedVal {
match = false
break
}
}
}
if match && len(m.GetLabel()) == 2 {
successMetricFoundData = true
if m.GetGauge() != nil {
lastSuccessTimeData = m.GetGauge().GetValue()
}
break
}
}
}
}
if successMetricFoundData {
require.Equal(t, float64(0), lastSuccessTimeData, "should have no successful submission time for data")
} // else: metric not exported because value is 0, which is expected behavior
}
Comment on lines +566 to +688
medium

This test function is quite long and contains significant code duplication, especially for verifying metrics that might not be present (e.g., when their value is 0). This makes the test harder to read and maintain.

I suggest refactoring it to improve clarity and reusability:

  1. Use Sub-tests: Split the logic for successful and failed submissions into separate sub-tests using t.Run(). This will improve test organization and make test failures easier to diagnose.

  2. Create a Helper for Optional Metrics: The logic to check for metrics that might be zero (and thus not exported by Prometheus) is repeated. You could extract this into a helper function, for example getMetricValueOptional, that returns (float64, bool) to indicate if the metric was found. This would greatly simplify the test body.

Here's a sketch of how the refactored test could look:

func TestMetrics_RecordSubmissionAttempt(t *testing.T) {
	reg := prometheus.NewRegistry()
	m := NewWithRegistry("test", reg)

	t.Run("successful submission", func(t *testing.T) {
		beforeAttempt := time.Now()
		m.RecordSubmissionAttempt("testchain", "header", true)
		afterAttempt := time.Now()

		attempts := getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{"chain_id": "testchain", "type": "header"})
		require.Equal(t, float64(1), attempts)

		// Using a new helper for optional metrics
		failures, found := getMetricValueOptional(t, reg, "test_submission_failures_total", map[string]string{"chain_id": "testchain", "type": "header"})
		if found {
			require.Equal(t, float64(0), failures)
		}

		// ... timestamp checks
	})

	t.Run("failed submission", func(t *testing.T) {
		// ... similar logic for failed case
	})
}
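
A minimal sketch of the `getMetricValueOptional` helper suggested above (hypothetical; it assumes the same imports `metrics_test.go` already uses and mirrors the existing `getMetricValue`, returning a found flag instead of failing when the metric is absent):

```go
// getMetricValueOptional looks up a gauge or counter by name and labels and
// reports whether it was found, so tests can treat "absent" and "zero" alike.
func getMetricValueOptional(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) (float64, bool) {
	t.Helper()
	metrics, err := reg.Gather()
	require.NoError(t, err)
	for _, mf := range metrics {
		if mf.GetName() != metricName {
			continue
		}
		for _, m := range mf.GetMetric() {
			// require every exported label to match the expected value
			match := true
			for _, label := range m.GetLabel() {
				if expected, ok := labels[label.GetName()]; ok && label.GetValue() != expected {
					match = false
					break
				}
			}
			if !match || len(m.GetLabel()) != len(labels) {
				continue
			}
			// try gauge first, then counter
			if m.GetGauge() != nil {
				return m.GetGauge().GetValue(), true
			}
			if m.GetCounter() != nil {
				return m.GetCounter().GetValue(), true
			}
		}
	}
	return 0, false
}
```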


// helper types for table tests
type blockToRecord struct {
chain string
@@ -593,7 +717,7 @@ func calculateExpectedTotal(ranges []expectedRange, blobType string) uint64 {
return total
}

// getMetricValue retrieves the current value of a gauge metric
// getMetricValue retrieves the current value of a metric (gauge or counter)
func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) float64 {
t.Helper()
metrics, err := reg.Gather()
@@ -613,7 +737,13 @@ func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, l
}
}
if match && len(m.GetLabel()) == len(labels) {
return m.GetGauge().GetValue()
// Try gauge first, then counter
if m.GetGauge() != nil {
return m.GetGauge().GetValue()
}
if m.GetCounter() != nil {
return m.GetCounter().GetValue()
}
}
}
}