From 5ecb0d0c3af13de79469a1a5e8bf5be6748ad2fe Mon Sep 17 00:00:00 2001
From: auricom <27022259+auricom@users.noreply.github.com>
Date: Mon, 22 Dec 2025 16:44:11 +0100
Subject: [PATCH 1/2] wip

---
 pkg/exporters/verifier/verifier.go |   2 +
 pkg/metrics/metrics.go             |  76 ++++++++++++++++
 pkg/metrics/metrics_test.go        | 134 ++++++++++++++++++++++++++++-
 3 files changed, 210 insertions(+), 2 deletions(-)

diff --git a/pkg/exporters/verifier/verifier.go b/pkg/exporters/verifier/verifier.go
index 4195eb8..833e628 100644
--- a/pkg/exporters/verifier/verifier.go
+++ b/pkg/exporters/verifier/verifier.go
@@ -143,10 +143,12 @@ func (e *exporter) processBlocks(ctx context.Context, m *metrics.Metrics, worker
 
 func (e *exporter) onVerified(m *metrics.Metrics, namespace string, blockHeight, daHeight uint64, verified bool, submissionDuration time.Duration) {
 	if verified {
+		m.RecordSubmissionAttempt(e.chainID, namespace, true)
 		m.RecordSubmissionDaHeight(e.chainID, namespace, daHeight)
 		m.RemoveVerifiedBlock(e.chainID, namespace, blockHeight)
 		m.RecordSubmissionDuration(e.chainID, namespace, submissionDuration)
 	} else {
+		m.RecordSubmissionAttempt(e.chainID, namespace, false)
 		m.RecordMissingBlock(e.chainID, namespace, blockHeight)
 	}
 }
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 4de801e..2021c2b 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -2,6 +2,7 @@ package metrics
 
 import (
 	"fmt"
+	"log"
 	"sort"
 	"strings"
 	"sync"
@@ -29,6 +30,14 @@ type Metrics struct {
 	SubmissionDuration *prometheus.SummaryVec
 	// SubmissionDaHeight tracks the DA height at which blocks were submitted.
 	SubmissionDaHeight *prometheus.GaugeVec
+	// SubmissionAttemptsTotal tracks the total number of submission attempts.
+	SubmissionAttemptsTotal *prometheus.CounterVec
+	// SubmissionFailuresTotal tracks the total number of failed submission attempts.
+	SubmissionFailuresTotal *prometheus.CounterVec
+	// LastSubmissionAttemptTime tracks the timestamp of the last submission attempt.
+	LastSubmissionAttemptTime *prometheus.GaugeVec
+	// LastSuccessfulSubmissionTime tracks the timestamp of the last successful submission.
+	LastSuccessfulSubmissionTime *prometheus.GaugeVec
 	// BlockTime tracks the time between consecutive blocks with histogram buckets for accurate SLO calculations.
 	BlockTime *prometheus.HistogramVec
 	// BlockTimeSummary tracks block time with percentiles over a rolling window.
@@ -289,6 +298,38 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
 			},
 			[]string{"chain_id", "endpoint", "error_type"},
 		),
+		SubmissionAttemptsTotal: factory.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "submission_attempts_total",
+				Help:      "total number of DA submission attempts",
+			},
+			[]string{"chain_id", "type"},
+		),
+		SubmissionFailuresTotal: factory.NewCounterVec(
+			prometheus.CounterOpts{
+				Namespace: namespace,
+				Name:      "submission_failures_total",
+				Help:      "total number of failed DA submission attempts",
+			},
+			[]string{"chain_id", "type"},
+		),
+		LastSubmissionAttemptTime: factory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "last_submission_attempt_time",
+				Help:      "timestamp of the last DA submission attempt",
+			},
+			[]string{"chain_id", "type"},
+		),
+		LastSuccessfulSubmissionTime: factory.NewGaugeVec(
+			prometheus.GaugeOpts{
+				Namespace: namespace,
+				Name:      "last_successful_submission_time",
+				Help:      "timestamp of the last successful DA submission",
+			},
+			[]string{"chain_id", "type"},
+		),
 		ranges:                  make(map[string][]*blockRange),
 		lastBlockArrivalTime:    make(map[string]time.Time),
 		lastSubmissionDurations: make(map[string]time.Duration),
@@ -297,6 +338,30 @@ func NewWithRegistry(namespace string, registerer prometheus.Registerer) *Metric
 	return m
 }
 
+// RecordSubmissionAttempt records a submission attempt and updates related metrics
+func (m *Metrics) RecordSubmissionAttempt(chainID, submissionType string, success bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	// Always record the attempt
+	m.SubmissionAttemptsTotal.WithLabelValues(chainID, submissionType).Inc()
+
+	if !success {
+		m.SubmissionFailuresTotal.WithLabelValues(chainID, submissionType).Inc()
+	}
+
+	// Record timestamp of this attempt
+	now := time.Now()
+	m.LastSubmissionAttemptTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
+
+	if success {
+		m.LastSuccessfulSubmissionTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
+		log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+	} else {
+		log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+	}
+}
+
 // RecordSubmissionDaHeight records the DA height only if it's higher than previously recorded
 func (m *Metrics) RecordSubmissionDaHeight(chainID, submissionType string, daHeight uint64) {
 	m.mu.Lock()
@@ -306,6 +371,11 @@ func (m *Metrics) RecordSubmissionDaHei
 		if daHeight > m.latestHeaderDaHeight {
 			m.latestHeaderDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "header").Set(float64(daHeight))
+			// Debug log when submission DA height is recorded
+			log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
+		} else {
+			// Debug log when DA height is not higher than previous
+			log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
 		}
 		return
 	}
@@ -314,6 +384,11 @@ func (m *Metrics) RecordSubmissionDaHei
 		if daHeight > m.latestDataDaHeight {
 			m.latestDataDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "data").Set(float64(daHeight))
+			// Debug log when submission DA height is recorded
+			log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
+		} else {
+			// Debug log when DA height is not higher than previous
+			log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
 		}
 	}
 }
@@ -530,6 +605,7 @@ func (m *Metrics) RecordSubmissionDuration(chainID, submissionType string, durat
 
 	key := fmt.Sprintf("%s:%s", chainID, submissionType)
 	m.lastSubmissionDurations[key] = duration
+
 }
 
 // RefreshSubmissionDuration re-observes the last known submission duration to keep the metric alive.
diff --git a/pkg/metrics/metrics_test.go b/pkg/metrics/metrics_test.go
index 2b2e061..1c05513 100644
--- a/pkg/metrics/metrics_test.go
+++ b/pkg/metrics/metrics_test.go
@@ -563,6 +563,130 @@ func TestMetrics_RefreshSubmissionDuration_Empty(t *testing.T) {
 	})
 }
 
+func TestMetrics_RecordSubmissionAttempt(t *testing.T) {
+	reg := prometheus.NewRegistry()
+	m := NewWithRegistry("test", reg)
+
+	// Test successful submission
+	beforeAttempt := time.Now()
+	m.RecordSubmissionAttempt("testchain", "header", true)
+	afterAttempt := time.Now()
+
+	// Verify counters
+	attempts := getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.Equal(t, float64(1), attempts, "should have 1 attempt")
+
+	// For a successful submission the failures series is never written, so it may
+	// not exist yet; if it does exist, its value must be 0.
+	failuresMetricFound := false
+	failures := float64(0)
+	metrics, err := reg.Gather()
+	require.NoError(t, err)
+	for _, mf := range metrics {
+		if mf.GetName() == "test_submission_failures_total" {
+			for _, m := range mf.GetMetric() {
+				// check if labels match
+				match := true
+				for _, label := range m.GetLabel() {
+					if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "header"}[label.GetName()]; ok {
+						if label.GetValue() != expectedVal {
+							match = false
+							break
+						}
+					}
+				}
+				if match && len(m.GetLabel()) == 2 {
+					failuresMetricFound = true
+					if m.GetCounter() != nil {
+						failures = m.GetCounter().GetValue()
+					}
+					break
+				}
+			}
+		}
+	}
+	if failuresMetricFound {
+		require.Equal(t, float64(0), failures, "should have 0 failures")
+	} // else: the series was never created, which is expected
+
+	// Verify timestamps are within expected range
+	lastAttemptTime := getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.GreaterOrEqual(t, lastAttemptTime, float64(beforeAttempt.Unix()))
+	require.LessOrEqual(t, lastAttemptTime, float64(afterAttempt.Unix()))
+
+	lastSuccessTime := getMetricValue(t, reg, "test_last_successful_submission_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "header",
+	})
+	require.GreaterOrEqual(t, lastSuccessTime, float64(beforeAttempt.Unix()))
+	require.LessOrEqual(t, lastSuccessTime, float64(afterAttempt.Unix()))
+
+	// Test failed submission
+	beforeFailure := time.Now()
+	m.RecordSubmissionAttempt("testchain", "data", false)
+	afterFailure := time.Now()
+
+	// Verify counters
+	attempts = getMetricValue(t, reg, "test_submission_attempts_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.Equal(t, float64(1), attempts, "should have 1 attempt for data")
+
+	failures = getMetricValue(t, reg, "test_submission_failures_total", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.Equal(t, float64(1), failures, "should have 1 failure for data")
+
+	// Verify timestamps: a failed submission updates the attempt time but not the success time
+	lastAttemptTime = getMetricValue(t, reg, "test_last_submission_attempt_time", map[string]string{
+		"chain_id": "testchain",
+		"type":     "data",
+	})
+	require.GreaterOrEqual(t, lastAttemptTime, float64(beforeFailure.Unix()))
+	require.LessOrEqual(t, lastAttemptTime, float64(afterFailure.Unix()))
+
+	// The data type never succeeded, so its success-time series was never written;
+	// it may not exist at all, and if it does exist its value must be 0.
+	var lastSuccessTimeData float64
+	var successMetricFoundData bool
+	metrics, err = reg.Gather()
+	require.NoError(t, err)
+	for _, mf := range metrics {
+		if mf.GetName() == "test_last_successful_submission_time" {
+			for _, m := range mf.GetMetric() {
+				// check if labels match
+				match := true
+				for _, label := range m.GetLabel() {
+					if expectedVal, ok := map[string]string{"chain_id": "testchain", "type": "data"}[label.GetName()]; ok {
+						if label.GetValue() != expectedVal {
+							match = false
+							break
+						}
+					}
+				}
+				if match && len(m.GetLabel()) == 2 {
+					successMetricFoundData = true
+					if m.GetGauge() != nil {
+						lastSuccessTimeData = m.GetGauge().GetValue()
+					}
+					break
+				}
+			}
+		}
+	}
+	if successMetricFoundData {
+		require.Equal(t, float64(0), lastSuccessTimeData, "should have no successful submission time for data")
+	} // else: the series was never created, which is expected
+}
+
 // helper types for table tests
 type blockToRecord struct {
 	chain string
@@ -593,7 +717,7 @@ func calculateExpectedTotal(ranges []expectedRange, blobType string) uint64 {
 	return total
 }
 
-// getMetricValue retrieves the current value of a gauge metric
+// getMetricValue retrieves the current value of a metric (gauge or counter)
 func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, labels map[string]string) float64 {
 	t.Helper()
 	metrics, err := reg.Gather()
@@ -613,7 +737,13 @@ func getMetricValue(t *testing.T, reg *prometheus.Registry, metricName string, l
 			}
 		}
 		if match && len(m.GetLabel()) == len(labels) {
-			return m.GetGauge().GetValue()
+			// Try gauge first, then counter
+			if m.GetGauge() != nil {
+				return m.GetGauge().GetValue()
+			}
+			if m.GetCounter() != nil {
+				return m.GetCounter().GetValue()
+			}
 		}
 	}
 }

From 341f245990ee513031a94f020591a5e04deea451 Mon Sep 17 00:00:00 2001
From: auricom <27022259+auricom@users.noreply.github.com>
Date: Mon, 22 Dec 2025 18:43:37 +0100
Subject: [PATCH 2/2] readme

---
 README.md              | 36 ++++++++++++++++++++++++++++++++++++
 pkg/metrics/metrics.go | 30 ++++++++++++++++++++++++------
 2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index f43b2c6..3bbe6f0 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Metrics will be available at `http://localhost:2112/metrics`
 - `--balance.scrape-interval`: Balance check scrape interval in seconds (default: 30)
 - `--verifier.workers`: Number of concurrent workers for block verification (default: 50)
 - `--verbose`: Enable verbose logging (default: false)
+- `EVMETRICS_DEBUG`: Environment variable; set to `true` to enable debug logging for submission details (default: disabled)
 
 ### Example with Custom Endpoints
 
@@ -150,6 +151,26 @@ When metrics are enabled, the following metrics are exposed:
 - **Labels**: `chain_id`, `type`
 - **Description**: Latest DA height for header and data submissions
 
+### `ev_metrics_submission_attempts_total`
+- **Type**: Counter
+- **Labels**: `chain_id`, `type`
+- **Description**: Total number of DA submission attempts, both successful and failed
+
+### `ev_metrics_submission_failures_total`
+- **Type**: Counter
+- **Labels**: `chain_id`, `type`
+- **Description**: Total number of failed DA submission attempts
+
+### `ev_metrics_last_submission_attempt_time`
+- **Type**: Gauge
+- **Labels**: `chain_id`, `type`
+- **Description**: Unix timestamp of the last DA submission attempt
+
+### `ev_metrics_last_successful_submission_time`
+- **Type**: Gauge
+- **Labels**: `chain_id`, `type`
+- **Description**: Unix timestamp of the last successful DA submission
+
 ### Block Time Metrics
 
 ### `ev_metrics_block_time_seconds`
@@ -253,3 +274,18 @@ When `--balance.addresses` and `--balance.consensus-rpc-urls` are provided:
 - **Type**: Counter
 - **Labels**: `chain_id`, `endpoint`, `error_type`
 - **Description**: Total number of consensus RPC endpoint errors by type
+
+## Debug Logging
+
+Debug logging provides detailed visibility into the DA submission process. Enable it by setting the `EVMETRICS_DEBUG=true` environment variable.
+
+**Use cases**: Troubleshooting submission failures, verifying the submission flow, diagnosing Celestia RPC issues.
+
+**Logs include**: Successful and failed submissions, DA height updates, submission timing.
+
+**Example**:
+```bash
+# Enable debug logging
+export EVMETRICS_DEBUG=true
+./ev-metrics monitor --header-namespace testnet_header --data-namespace testnet_data
+```
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
index 2021c2b..8f2773f 100644
--- a/pkg/metrics/metrics.go
+++ b/pkg/metrics/metrics.go
@@ -3,6 +3,7 @@ package metrics
 import (
 	"fmt"
 	"log"
+	"os"
 	"sort"
 	"strings"
 	"sync"
@@ -81,6 +82,11 @@ type Metrics struct {
 	ranges map[string][]*blockRange // key: blobType -> sorted slice of ranges
 }
 
+// isDebugEnabled returns true if debug logging is enabled via environment variable
+func isDebugEnabled() bool {
+	return strings.ToLower(os.Getenv("EVMETRICS_DEBUG")) == "true"
+}
+
 type blockRange struct {
 	start uint64
 	end   uint64
@@ -356,9 +362,13 @@ func (m *Metrics) RecordSubmissionAttempt(chainID, submissionType string, succes
 
 	if success {
 		m.LastSuccessfulSubmissionTime.WithLabelValues(chainID, submissionType).Set(float64(now.Unix()))
-		log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		if isDebugEnabled() {
+			log.Printf("DEBUG: Successful submission - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		}
 	} else {
-		log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		if isDebugEnabled() {
+			log.Printf("DEBUG: Failed submission attempt - chain: %s, type: %s, timestamp: %d", chainID, submissionType, now.Unix())
+		}
 	}
 }
 
@@ -372,10 +382,14 @@ func (m *Metrics) RecordSubmissionDaHei
 			m.latestHeaderDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "header").Set(float64(daHeight))
 			// Debug log when submission DA height is recorded
-			log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Recorded header submission DA height - chain: %s, height: %d", chainID, daHeight)
+			}
 		} else {
 			// Debug log when DA height is not higher than previous
-			log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Header DA height %d not higher than previous %d for chain %s", daHeight, m.latestHeaderDaHeight, chainID)
+			}
 		}
 		return
 	}
@@ -385,10 +399,14 @@ func (m *Metrics) RecordSubmissionDaHei
 			m.latestDataDaHeight = daHeight
 			m.SubmissionDaHeight.WithLabelValues(chainID, "data").Set(float64(daHeight))
 			// Debug log when submission DA height is recorded
-			log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Recorded data submission DA height - chain: %s, height: %d", chainID, daHeight)
+			}
 		} else {
 			// Debug log when DA height is not higher than previous
-			log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
+			if isDebugEnabled() {
+				log.Printf("DEBUG: Data DA height %d not higher than previous %d for chain %s", daHeight, m.latestDataDaHeight, chainID)
+			}
 		}
 	}
 }
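
Notes on the series added above, for anyone who wants to poke at them locally. The sketch below registers the metrics against a private registry (as the tests do), records one successful and one failed header attempt, and reads the counters and the success timestamp back through `client_golang`'s `testutil`. The import path for this repo's `metrics` package and the `testchain`/`header` label values are placeholders, not part of the patch:

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"

	// Hypothetical import path for this repository's metrics package.
	"github.com/example/ev-metrics/pkg/metrics"
)

func main() {
	// Private registry, mirroring the test setup.
	reg := prometheus.NewRegistry()
	m := metrics.NewWithRegistry("ev_metrics", reg)

	// One successful and one failed header submission.
	m.RecordSubmissionAttempt("testchain", "header", true)
	m.RecordSubmissionAttempt("testchain", "header", false)

	// Attempts counts both outcomes; failures counts only the failed one.
	attempts := testutil.ToFloat64(m.SubmissionAttemptsTotal.WithLabelValues("testchain", "header"))
	failures := testutil.ToFloat64(m.SubmissionFailuresTotal.WithLabelValues("testchain", "header"))
	fmt.Println(attempts, failures) // 2 1

	// Staleness check a dashboard might derive: seconds since the last success.
	lastSuccess := testutil.ToFloat64(m.LastSuccessfulSubmissionTime.WithLabelValues("testchain", "header"))
	fmt.Println(float64(time.Now().Unix()) - lastSuccess)
}
```

`testutil.ToFloat64` can be pointed at the `WithLabelValues` children directly because each child is itself a single-metric `Collector`.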
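The updated `getMetricValue` helper works because a gathered `dto.Metric` carries both a gauge and a counter field, at most one of which is non-nil for a given series. A minimal standalone sketch of the same extraction, assuming only `client_golang` and `client_model` are available (`value` and `demo_total` are illustrative names, not from the patch):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

// value extracts the sample value from a gathered metric, trying the gauge
// field first and falling back to the counter field, as the helper does.
func value(m *dto.Metric) (float64, bool) {
	if m.GetGauge() != nil {
		return m.GetGauge().GetValue(), true
	}
	if m.GetCounter() != nil {
		return m.GetCounter().GetValue(), true
	}
	return 0, false
}

func main() {
	reg := prometheus.NewRegistry()
	c := prometheus.NewCounter(prometheus.CounterOpts{Name: "demo_total", Help: "demo"})
	reg.MustRegister(c)
	c.Add(3)

	families, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		for _, m := range mf.GetMetric() {
			if v, ok := value(m); ok {
				fmt.Println(mf.GetName(), v) // demo_total 3
			}
		}
	}
}
```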
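One behavioral detail worth calling out from `isDebugEnabled`: only the literal string `true` (case-insensitive) enables the debug output, so `EVMETRICS_DEBUG=1` or `EVMETRICS_DEBUG=yes` stay silent. A small sketch mirroring the patch's check:

```go
package main

import (
	"fmt"
	"os"
	"strings"
)

// Mirrors the patch's isDebugEnabled: only the literal string "true"
// (case-insensitive) enables debug logging.
func isDebugEnabled() bool {
	return strings.ToLower(os.Getenv("EVMETRICS_DEBUG")) == "true"
}

func main() {
	for _, v := range []string{"true", "TRUE", "1", "yes", ""} {
		_ = os.Setenv("EVMETRICS_DEBUG", v)
		fmt.Printf("EVMETRICS_DEBUG=%q -> %v\n", v, isDebugEnabled())
	}
	// Output:
	// EVMETRICS_DEBUG="true" -> true
	// EVMETRICS_DEBUG="TRUE" -> true
	// EVMETRICS_DEBUG="1" -> false
	// EVMETRICS_DEBUG="yes" -> false
	// EVMETRICS_DEBUG="" -> false
}
```

Since the environment is consulted on every call rather than cached at startup, the setting can be flipped between runs without rebuilding, at the cost of an `os.Getenv` per log site.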