
Commit cc9e0db

remove FailOnError for workloads, fail node_hint based on slo-report
1 parent 3bc3652 commit cc9e0db

12 files changed: +109 additions, -71 deletions
.github/resources/slo-report-thresholds.yaml (new file, referenced by slo-report.yml below)

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# Default SLO thresholds configuration
+# Users can override/extend by providing custom thresholds via:
+# - thresholds_yaml input (inline YAML)
+# - thresholds_yaml_path input (file path, default: .slo/thresholds.yaml)
+#
+# Example custom config:
+# metrics:
+#   - name: read_latency_ms        # Override specific metric
+#     warning_change_percent: 10.0
+#   - pattern: critical_*          # Add new pattern
+#     critical_change_percent: 25.0
+
+# Global threshold for considering change as "neutral" (stable)
+neutral_change_percent: 5.0
+
+# Default thresholds applied to all metrics unless overridden
+default:
+  # Regression/improvement percentage thresholds
+  warning_change_percent: 20.0   # > 20% change triggers warning
+  critical_change_percent: 50.0  # > 50% change triggers failure
+
+# Metric-specific thresholds
+# Patterns support wildcards: *_availability, read_*, etc.
+metrics:
+  # Availability metrics (should stay high)
+  - pattern: "*_availability"
+    direction: higher_is_better
+    warning_min: 99.0            # < 99% triggers warning
+    critical_min: 95.0           # < 95% triggers failure
+    warning_change_percent: 1.0  # > 1% drop is significant
+
+  # Latency metrics (should stay low)
+  - pattern: "*_latency_*"
+    direction: lower_is_better
+    warning_change_percent: 30.0   # > 30% increase triggers warning
+    critical_change_percent: 100.0 # > 100% (2x) triggers failure
+
+  - pattern: "*_duration_*"
+    direction: lower_is_better
+    warning_change_percent: 30.0
+    critical_change_percent: 100.0
+
+  # Throughput metrics (should stay stable or increase)
+  - pattern: "*_throughput"
+    direction: higher_is_better
+    warning_change_percent: 25.0  # > 25% drop triggers warning
+    critical_change_percent: 50.0 # > 50% drop triggers failure
+
+  - pattern: "*_qps"
+    direction: higher_is_better
+    warning_change_percent: 25.0
+    critical_change_percent: 50.0
+
+  - pattern: "*_rps"
+    direction: higher_is_better
+    warning_change_percent: 25.0
+    critical_change_percent: 50.0
+
+  # Error/failure metrics (should stay at zero)
+  - pattern: "*_error*"
+    direction: lower_is_better
+    warning_max: 0.1   # Any errors trigger warning
+    critical_max: 1.0  # > 1% error rate triggers failure
+
+  - pattern: "*_failure*"
+    direction: lower_is_better
+    warning_max: 0.1
+    critical_max: 1.0
+  - pattern: "*_node_hints.misses*"
+    direction: lower_is_better
+    critical_max: 0  # more than zero and node_hints workload fails
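
For orientation, the sketch below shows one way a reporting tool could interpret this file: metric names are matched against the glob-style patterns and a relative change is graded as neutral, warning, or critical. The slo-report action's actual implementation is not part of this commit, so the Threshold struct and classifyChange function are illustrative assumptions only.

```go
package main

import (
	"fmt"
	"path"
)

// Threshold mirrors one entry of the metrics list above (illustrative only).
type Threshold struct {
	Pattern               string
	WarningChangePercent  float64
	CriticalChangePercent float64
}

// classifyChange grades a metric's relative change against the first pattern
// that matches its name. changePercent and neutral are percentages.
func classifyChange(name string, changePercent, neutral float64, thresholds []Threshold) string {
	if changePercent <= neutral {
		return "neutral" // within neutral_change_percent, e.g. 5.0
	}
	for _, t := range thresholds {
		matched, err := path.Match(t.Pattern, name)
		if err != nil || !matched {
			continue
		}
		switch {
		case t.CriticalChangePercent > 0 && changePercent > t.CriticalChangePercent:
			return "critical"
		case t.WarningChangePercent > 0 && changePercent > t.WarningChangePercent:
			return "warning"
		default:
			return "ok"
		}
	}
	return "ok"
}

func main() {
	thresholds := []Threshold{
		{Pattern: "*_latency_*", WarningChangePercent: 30.0, CriticalChangePercent: 100.0},
	}
	// A 40% latency regression exceeds the 30% warning threshold but not the 100% critical one.
	fmt.Println(classifyChange("read_latency_ms", 40.0, 5.0, thresholds)) // warning
}
```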

.github/workflows/slo-report.yml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ jobs:
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_run_id: ${{ github.event.workflow_run.id }}
+          thresholds_yaml_path: "${{ github.workspace }}/current/.github/resources/slo-report-thresholds.yaml"
   remove-slo-label:
     needs: test-ydb-slo-action
     if: always() && github.event.workflow_run.event == 'pull_request'

.github/workflows/slo.yml

Lines changed: 0 additions & 4 deletions
@@ -275,10 +275,6 @@ jobs:
           docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
           echo ""
 
-          if [ "$CURRENT_EXIT" -ne 0 ]; then
-            echo "ERROR: current workload exited with code $CURRENT_EXIT"
-            exit 1
-          fi
           echo "SUCCESS: Workloads completed successfully"
 
       - if: always()

tests/slo/database/sql/query/main.go

Lines changed: 0 additions & 1 deletion
@@ -127,7 +127,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}

tests/slo/database/sql/table/main.go

Lines changed: 0 additions & 1 deletion
@@ -127,7 +127,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}

tests/slo/internal/metrics/metrics.go

Lines changed: 20 additions & 50 deletions
@@ -4,7 +4,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"sync/atomic"
 	"time"
 
 	"github.com/ydb-platform/ydb-go-sdk/v3"
@@ -25,10 +24,11 @@ const (
 
 type (
 	Metrics struct {
-		mp     *sdkmetric.MeterProvider
-		meter  otelmetric.Meter
-		ctx    context.Context //nolint:containedctx
-		cancel context.CancelFunc
+		mp             *sdkmetric.MeterProvider
+		meter          otelmetric.Meter
+		ctx            context.Context //nolint:containedctx
+		cancel         context.CancelFunc
+		ExportInterval time.Duration
 
 		// Labels for metrics
 		ref string
@@ -46,11 +46,7 @@ type (
 		retriesSuccessTotal otelmetric.Int64Counter
 		retriesFailureTotal otelmetric.Int64Counter
 		pendingOperations   otelmetric.Int64UpDownCounter
-
-		// Local counters for fail-on-error logic
-		errorsCount   atomic.Int64
-		timeoutsCount atomic.Int64
-		opsCount      atomic.Int64
+		nodeHintMissesPresent otelmetric.Int64Counter
 	}
 )
 
@@ -99,7 +95,7 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
 	if exportInterval == 0 {
 		exportInterval = 250 * time.Millisecond // Default 250ms
 	}
-
+	m.ExportInterval = exportInterval
 	m.mp = sdkmetric.NewMeterProvider(
 		sdkmetric.WithResource(res),
 		sdkmetric.WithReader(
@@ -206,6 +202,13 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
 	if err != nil {
 		return nil, fmt.Errorf("failed to create pendingOperations counter: %w", err)
 	}
+	m.nodeHintMissesPresent, err = m.meter.Int64Counter(
+		"workload.node_hints.misses",
+		otelmetric.WithDescription("Exclusively for node_hints SLO workload: Signals GRPC requests to wrong node"),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create nodeHintMissesPresent counter: %w", err)
+	}
 
 	return m, nil
 }
@@ -220,11 +223,6 @@ func (m *Metrics) Push() error {
 }
 
 func (m *Metrics) Reset() error {
-	// Reset local counters for fail-on-error logic
-	m.errorsCount.Store(0)
-	m.timeoutsCount.Store(0)
-	m.opsCount.Store(0)
-
 	// Note: OTel counters/gauges are cumulative and cannot be reset
 	// This is just for local state
 	return m.Push()
@@ -264,6 +262,12 @@ func (m *Metrics) Start(name SpanName) Span {
 	return j
 }
 
+func (m *Metrics) ReportNodeHintMisses() {
+	if m.meter != nil {
+		m.nodeHintMissesPresent.Add(m.ctx, 1)
+	}
+}
+
 func (j Span) Finish(err error, attempts int) {
 	if j.m.meter == nil {
 		return
@@ -283,18 +287,13 @@ func (j Span) Finish(err error, attempts int) {
 	j.m.operationsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
 	j.m.retryAttemptsTotal.Add(j.m.ctx, int64(attempts), otelmetric.WithAttributes(attrs...))
 
-	// Local counters for fail-on-error
-	j.m.opsCount.Add(1)
-
 	if err != nil {
 		if errors.Is(err, context.DeadlineExceeded) {
 			j.m.timeoutsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
-			j.m.timeoutsCount.Add(1)
 		}
 		j.m.errorsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(
 			j.m.commonAttrs(attribute.String("error_type", err.Error()))...,
 		))
-		j.m.errorsCount.Add(1)
 
 		j.m.retriesFailureTotal.Add(j.m.ctx, int64(attempts), otelmetric.WithAttributes(attrs...))
 		j.m.operationsFailureTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
@@ -315,32 +314,3 @@ func (j Span) Finish(err error, attempts int) {
 		))
 	}
 }
-
-func (m *Metrics) OperationsTotal() float64 {
-	return float64(m.opsCount.Load())
-}
-
-func (m *Metrics) ErrorsTotal() float64 {
-	return float64(m.errorsCount.Load())
-}
-
-func (m *Metrics) TimeoutsTotal() float64 {
-	return float64(m.timeoutsCount.Load())
-}
-
-func (m *Metrics) FailOnError() {
-	if m.ErrorsTotal()*20 > m.OperationsTotal() { // 95%
-		log.Panicf(
-			"unretriable (or not successfully retried) errors: %.0f errors out of %.0f operations",
-			m.ErrorsTotal(),
-			m.OperationsTotal(),
-		)
-	}
-	if m.TimeoutsTotal()*20 > m.OperationsTotal() {
-		log.Panicf(
-			"user timeouts: %.0f timeouts out of %.0f operations",
-			m.TimeoutsTotal(),
-			m.OperationsTotal(),
-		)
-	}
-}
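
The new workload.node_hints.misses counter is exported by the same periodic OTel reader as the other workload metrics, which is why the export interval is now stored on Metrics: a workload that reports a miss right before exiting has to wait at least one interval (or flush) for the reading to leave the process. Below is a minimal, self-contained sketch of those export mechanics; it uses a stdout exporter and hard-codes the interval, since the harness's actual exporter wiring is outside this hunk.

```go
package main

import (
	"context"
	"log"
	"time"

	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

func main() {
	ctx := context.Background()

	// A stdout exporter stands in for whatever exporter the SLO harness uses.
	exporter, err := stdoutmetric.New()
	if err != nil {
		log.Fatal(err)
	}

	exportInterval := 250 * time.Millisecond
	mp := sdkmetric.NewMeterProvider(
		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter,
			sdkmetric.WithInterval(exportInterval))),
	)
	defer func() { _ = mp.Shutdown(ctx) }() // Shutdown also flushes pending readings

	meter := mp.Meter("slo")
	misses, err := meter.Int64Counter(
		"workload.node_hints.misses",
		metric.WithDescription("requests that were served by the wrong node"),
	)
	if err != nil {
		log.Fatal(err)
	}

	// Report a single miss, then give the periodic reader one interval to
	// export it before the process exits.
	misses.Add(ctx, 1)
	time.Sleep(exportInterval)
}
```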

tests/slo/internal/workers/workers.go

Lines changed: 9 additions & 2 deletions
@@ -2,6 +2,7 @@ package workers
 
 import (
 	"context"
+	"time"
 
 	"slo/internal/config"
 	"slo/internal/generator"
@@ -57,8 +58,14 @@ func NewWithBatch(cfg *config.Config, s BatchReadWriter, ref, label, jobName str
 	}, nil
 }
 
-func (w *Workers) FailOnError() {
-	w.m.FailOnError()
+func (w *Workers) ReportNodeHintMisses() {
+	if w.m != nil {
+		w.m.ReportNodeHintMisses()
+	}
+}
+
+func (w *Workers) ExportInterval() time.Duration {
+	return w.m.ExportInterval
 }
 
 func (w *Workers) Close() error {

tests/slo/native/node_hints/dynnode_traffic.go

Lines changed: 3 additions & 2 deletions
@@ -133,14 +133,15 @@ func (e *Estimator) ClusterRWCounter(ctx context.Context) float64 {
 	return e.ClusterGrpcAPICounter(ctx, "ReadRows") + e.ClusterGrpcAPICounter(ctx, "BulkUpsert")
 }
 
-func (e *Estimator) OnlyThisNode(ctx context.Context, nodeID uint32) {
+func (e *Estimator) OnlyThisNode(ctx context.Context, nodeID uint32) error {
 	clusterNow := e.ClusterRWCounter(ctx)
 	nodeNow := e.NodeRWCounter(ctx, nodeID)
 	if clusterNow-e.ClusterCounter > nodeNow-e.NodeRequests[nodeID] {
-		log.Panicf("requests were served by other nodes: cluster %f -> %f, node %d %f -> %f",
+		return fmt.Errorf("requests were served by other nodes: cluster %f -> %f, node %d %f -> %f",
 			e.ClusterCounter, clusterNow,
 			nodeID,
 			e.NodeRequests[nodeID], nodeNow,
 		)
 	}
+	return nil
 }

tests/slo/native/node_hints/main.go

Lines changed: 5 additions & 8 deletions
@@ -144,18 +144,15 @@ func main() {
 			go w.Write(ctx, &wg, writeRL, gen)
 		}
 		log.Println("started " + strconv.Itoa(cfg.WriteRPS) + " write workers")
-
-		metricsRL := rate.NewLimiter(rate.Every(time.Duration(cfg.ReportPeriod)*time.Millisecond), 1)
-		wg.Add(1)
-		go w.Metrics(ctx, &wg, metricsRL)
-
 		wg.Wait()
-		w.FailOnError()
 		// check all load is sent to a single node
 		ectx, ecancel := context.WithTimeout(context.Background(), 10*time.Second)
 		defer ecancel()
-		estimator.OnlyThisNode(ectx, nodeID)
-
+		err = estimator.OnlyThisNode(ectx, nodeID)
+		if err != nil {
+			w.ReportNodeHintMisses()
+			time.Sleep(w.ExportInterval())
+		}
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}
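
Net effect for the node_hints workload: a routing miss no longer panics the process. The workload increments workload.node_hints.misses, sleeps one export interval so the reading is pushed, and exits normally; the slo-report job then fails the run because the thresholds file sets critical_max: 0 for *_node_hints.misses*. A hypothetical sketch of that absolute-cap check (the real evaluation lives in the slo-report action, not in this commit):

```go
package main

import "fmt"

// classifyAbsolute mirrors the warning_max / critical_max caps from the
// thresholds file; the actual slo-report logic is not shown in this commit.
func classifyAbsolute(value, warningMax, criticalMax float64) string {
	switch {
	case value > criticalMax:
		return "critical"
	case value > warningMax:
		return "warning"
	default:
		return "ok"
	}
}

func main() {
	// critical_max is 0 for *_node_hints.misses*, so a single miss fails the run.
	fmt.Println(classifyAbsolute(1, 0, 0)) // critical
	// *_error* metrics warn above 0.1 and fail above 1.0.
	fmt.Println(classifyAbsolute(0.5, 0.1, 1.0)) // warning
}
```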

tests/slo/native/query/main.go

Lines changed: 0 additions & 1 deletion
@@ -143,7 +143,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}
