
Commit cc9e0db

remove FailOnError for workloads, fail node_hint based on slo-report
1 parent 3bc3652 commit cc9e0db

12 files changed: +109 additions, -71 deletions
.github/resources/slo-report-thresholds.yaml (new file, referenced by slo-report.yml below)

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+# Default SLO thresholds configuration
+# Users can override/extend by providing custom thresholds via:
+# - thresholds_yaml input (inline YAML)
+# - thresholds_yaml_path input (file path, default: .slo/thresholds.yaml)
+#
+# Example custom config:
+# metrics:
+#   - name: read_latency_ms        # Override specific metric
+#     warning_change_percent: 10.0
+#   - pattern: critical_*          # Add new pattern
+#     critical_change_percent: 25.0
+
+# Global threshold for considering change as "neutral" (stable)
+neutral_change_percent: 5.0
+
+# Default thresholds applied to all metrics unless overridden
+default:
+  # Regression/improvement percentage thresholds
+  warning_change_percent: 20.0   # > 20% change triggers warning
+  critical_change_percent: 50.0  # > 50% change triggers failure
+
+# Metric-specific thresholds
+# Patterns support wildcards: *_availability, read_*, etc.
+metrics:
+  # Availability metrics (should stay high)
+  - pattern: "*_availability"
+    direction: higher_is_better
+    warning_min: 99.0            # < 99% triggers warning
+    critical_min: 95.0           # < 95% triggers failure
+    warning_change_percent: 1.0  # > 1% drop is significant
+
+  # Latency metrics (should stay low)
+  - pattern: "*_latency_*"
+    direction: lower_is_better
+    warning_change_percent: 30.0   # > 30% increase triggers warning
+    critical_change_percent: 100.0 # > 100% (2x) triggers failure
+
+  - pattern: "*_duration_*"
+    direction: lower_is_better
+    warning_change_percent: 30.0
+    critical_change_percent: 100.0
+
+  # Throughput metrics (should stay stable or increase)
+  - pattern: "*_throughput"
+    direction: higher_is_better
+    warning_change_percent: 25.0  # > 25% drop triggers warning
+    critical_change_percent: 50.0 # > 50% drop triggers failure
+
+  - pattern: "*_qps"
+    direction: higher_is_better
+    warning_change_percent: 25.0
+    critical_change_percent: 50.0
+
+  - pattern: "*_rps"
+    direction: higher_is_better
+    warning_change_percent: 25.0
+    critical_change_percent: 50.0
+
+  # Error/failure metrics (should stay at zero)
+  - pattern: "*_error*"
+    direction: lower_is_better
+    warning_max: 0.1   # Any errors trigger warning
+    critical_max: 1.0  # > 1% error rate triggers failure
+
+  - pattern: "*_failure*"
+    direction: lower_is_better
+    warning_max: 0.1
+    critical_max: 1.0
+  - pattern: "*_node_hints.misses*"
+    direction: lower_is_better
+    critical_max: 0  # more than zero and node_hints workload fails
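
For orientation, the sketch below shows one way a reporting tool could interpret this file: metric names are matched against the glob-style patterns and a relative change is graded as neutral, warning, or critical. The slo-report action's actual implementation is not part of this commit, so the Threshold struct and classifyChange function are illustrative assumptions only.

```go
package main

import (
	"fmt"
	"path"
)

// Threshold mirrors one entry of the metrics list above (illustrative only).
type Threshold struct {
	Pattern               string
	WarningChangePercent  float64
	CriticalChangePercent float64
}

// classifyChange grades a metric's relative change against the first pattern
// that matches its name. changePercent and neutral are percentages.
func classifyChange(name string, changePercent, neutral float64, thresholds []Threshold) string {
	if changePercent <= neutral {
		return "neutral" // within neutral_change_percent, e.g. 5.0
	}
	for _, t := range thresholds {
		matched, err := path.Match(t.Pattern, name)
		if err != nil || !matched {
			continue
		}
		switch {
		case t.CriticalChangePercent > 0 && changePercent > t.CriticalChangePercent:
			return "critical"
		case t.WarningChangePercent > 0 && changePercent > t.WarningChangePercent:
			return "warning"
		default:
			return "ok"
		}
	}
	return "ok"
}

func main() {
	thresholds := []Threshold{
		{Pattern: "*_latency_*", WarningChangePercent: 30.0, CriticalChangePercent: 100.0},
	}
	// A 40% latency regression exceeds the 30% warning threshold but not the 100% critical one.
	fmt.Println(classifyChange("read_latency_ms", 40.0, 5.0, thresholds)) // warning
}
```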

.github/workflows/slo-report.yml

Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ jobs:
         with:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           github_run_id: ${{ github.event.workflow_run.id }}
+          thresholds_yaml_path: "${{ github.workspace }}/current/.github/resources/slo-report-thresholds.yaml"
   remove-slo-label:
     needs: test-ydb-slo-action
     if: always() && github.event.workflow_run.event == 'pull_request'

.github/workflows/slo.yml

Lines changed: 0 additions & 4 deletions
@@ -275,10 +275,6 @@ jobs:
           docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
           echo ""
 
-          if [ "$CURRENT_EXIT" -ne 0 ]; then
-            echo "ERROR: current workload exited with code $CURRENT_EXIT"
-            exit 1
-          fi
           echo "SUCCESS: Workloads completed successfully"
 
       - if: always()

tests/slo/database/sql/query/main.go

Lines changed: 0 additions & 1 deletion
@@ -127,7 +127,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}

tests/slo/database/sql/table/main.go

Lines changed: 0 additions & 1 deletion
@@ -127,7 +127,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}

tests/slo/internal/metrics/metrics.go

Lines changed: 20 additions & 50 deletions
@@ -4,7 +4,6 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	"sync/atomic"
 	"time"
 
 	"github.com/ydb-platform/ydb-go-sdk/v3"
@@ -25,10 +24,11 @@ const (
 
 type (
 	Metrics struct {
-		mp     *sdkmetric.MeterProvider
-		meter  otelmetric.Meter
-		ctx    context.Context //nolint:containedctx
-		cancel context.CancelFunc
+		mp             *sdkmetric.MeterProvider
+		meter          otelmetric.Meter
+		ctx            context.Context //nolint:containedctx
+		cancel         context.CancelFunc
+		ExportInterval time.Duration
 
 		// Labels for metrics
 		ref string
@@ -46,11 +46,7 @@ type (
 		retriesSuccessTotal otelmetric.Int64Counter
 		retriesFailureTotal otelmetric.Int64Counter
 		pendingOperations   otelmetric.Int64UpDownCounter
-
-		// Local counters for fail-on-error logic
-		errorsCount   atomic.Int64
-		timeoutsCount atomic.Int64
-		opsCount      atomic.Int64
+		nodeHintMissesPresent otelmetric.Int64Counter
 	}
 )
 
@@ -99,7 +95,7 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
 	if exportInterval == 0 {
 		exportInterval = 250 * time.Millisecond // Default 250ms
 	}
-
+	m.ExportInterval = exportInterval
 	m.mp = sdkmetric.NewMeterProvider(
 		sdkmetric.WithResource(res),
 		sdkmetric.WithReader(
@@ -206,6 +202,13 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
 	if err != nil {
 		return nil, fmt.Errorf("failed to create pendingOperations counter: %w", err)
 	}
+	m.nodeHintMissesPresent, err = m.meter.Int64Counter(
+		"workload.node_hints.misses",
+		otelmetric.WithDescription("Exclusively for node_hints SLO workload: Signals GRPC requests to wrong node"),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create nodeHintMissesPresent counter: %w", err)
+	}
 
 	return m, nil
 }
@@ -220,11 +223,6 @@ func (m *Metrics) Push() error {
 }
 
 func (m *Metrics) Reset() error {
-	// Reset local counters for fail-on-error logic
-	m.errorsCount.Store(0)
-	m.timeoutsCount.Store(0)
-	m.opsCount.Store(0)
-
 	// Note: OTel counters/gauges are cumulative and cannot be reset
 	// This is just for local state
 	return m.Push()
@@ -264,6 +262,12 @@ func (m *Metrics) Start(name SpanName) Span {
 	return j
 }
 
+func (m *Metrics) ReportNodeHintMisses() {
+	if m.meter != nil {
+		m.nodeHintMissesPresent.Add(m.ctx, 1)
+	}
+}
+
 func (j Span) Finish(err error, attempts int) {
 	if j.m.meter == nil {
 		return
@@ -283,18 +287,13 @@ func (j Span) Finish(err error, attempts int) {
 	j.m.operationsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
 	j.m.retryAttemptsTotal.Add(j.m.ctx, int64(attempts), otelmetric.WithAttributes(attrs...))
 
-	// Local counters for fail-on-error
-	j.m.opsCount.Add(1)
-
 	if err != nil {
 		if errors.Is(err, context.DeadlineExceeded) {
 			j.m.timeoutsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
-			j.m.timeoutsCount.Add(1)
 		}
 		j.m.errorsTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(
 			j.m.commonAttrs(attribute.String("error_type", err.Error()))...,
 		))
-		j.m.errorsCount.Add(1)
 
 		j.m.retriesFailureTotal.Add(j.m.ctx, int64(attempts), otelmetric.WithAttributes(attrs...))
 		j.m.operationsFailureTotal.Add(j.m.ctx, 1, otelmetric.WithAttributes(attrs...))
@@ -315,32 +314,3 @@ func (j Span) Finish(err error, attempts int) {
 		))
 	}
 }
-
-func (m *Metrics) OperationsTotal() float64 {
-	return float64(m.opsCount.Load())
-}
-
-func (m *Metrics) ErrorsTotal() float64 {
-	return float64(m.errorsCount.Load())
-}
-
-func (m *Metrics) TimeoutsTotal() float64 {
-	return float64(m.timeoutsCount.Load())
-}
-
-func (m *Metrics) FailOnError() {
-	if m.ErrorsTotal()*20 > m.OperationsTotal() { // 95%
-		log.Panicf(
-			"unretriable (or not successfully retried) errors: %.0f errors out of %.0f operations",
-			m.ErrorsTotal(),
-			m.OperationsTotal(),
-		)
-	}
-	if m.TimeoutsTotal()*20 > m.OperationsTotal() {
-		log.Panicf(
-			"user timeouts: %.0f timeouts out of %.0f operations",
-			m.TimeoutsTotal(),
-			m.OperationsTotal(),
-		)
-	}
-}
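
The new workload.node_hints.misses counter is exported by the same periodic OTel reader as the other workload metrics, which is why the export interval is now stored on Metrics: a workload that reports a miss right before exiting has to wait at least one interval (or flush) for the reading to leave the process. Below is a minimal, self-contained sketch of those export mechanics; it uses a stdout exporter and hard-codes the interval, since the harness's actual exporter wiring is outside this hunk.

```go
package main

import (
	"context"
	"log"
	"time"

	"go.opentelemetry.io/otel/exporters/stdout/stdoutmetric"
	"go.opentelemetry.io/otel/metric"
	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
)

func main() {
	ctx := context.Background()

	// A stdout exporter stands in for whatever exporter the SLO harness uses.
	exporter, err := stdoutmetric.New()
	if err != nil {
		log.Fatal(err)
	}

	exportInterval := 250 * time.Millisecond
	mp := sdkmetric.NewMeterProvider(
		sdkmetric.WithReader(sdkmetric.NewPeriodicReader(exporter,
			sdkmetric.WithInterval(exportInterval))),
	)
	defer func() { _ = mp.Shutdown(ctx) }() // Shutdown also flushes pending readings

	meter := mp.Meter("slo")
	misses, err := meter.Int64Counter(
		"workload.node_hints.misses",
		metric.WithDescription("requests that were served by the wrong node"),
	)
	if err != nil {
		log.Fatal(err)
	}

	// Report a single miss, then give the periodic reader one interval to
	// export it before the process exits.
	misses.Add(ctx, 1)
	time.Sleep(exportInterval)
}
```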

tests/slo/internal/workers/workers.go

Lines changed: 9 additions & 2 deletions
@@ -2,6 +2,7 @@ package workers
 
 import (
 	"context"
+	"time"
 
 	"slo/internal/config"
 	"slo/internal/generator"
@@ -57,8 +58,14 @@ func NewWithBatch(cfg *config.Config, s BatchReadWriter, ref, label, jobName str
 	}, nil
 }
 
-func (w *Workers) FailOnError() {
-	w.m.FailOnError()
+func (w *Workers) ReportNodeHintMisses() {
+	if w.m != nil {
+		w.m.ReportNodeHintMisses()
+	}
+}
+
+func (w *Workers) ExportInterval() time.Duration {
+	return w.m.ExportInterval
 }
 
 func (w *Workers) Close() error {

tests/slo/native/node_hints/dynnode_traffic.go

Lines changed: 3 additions & 2 deletions
@@ -133,14 +133,15 @@ func (e *Estimator) ClusterRWCounter(ctx context.Context) float64 {
 	return e.ClusterGrpcAPICounter(ctx, "ReadRows") + e.ClusterGrpcAPICounter(ctx, "BulkUpsert")
 }
 
-func (e *Estimator) OnlyThisNode(ctx context.Context, nodeID uint32) {
+func (e *Estimator) OnlyThisNode(ctx context.Context, nodeID uint32) error {
 	clusterNow := e.ClusterRWCounter(ctx)
 	nodeNow := e.NodeRWCounter(ctx, nodeID)
 	if clusterNow-e.ClusterCounter > nodeNow-e.NodeRequests[nodeID] {
-		log.Panicf("requests were served by other nodes: cluster %f -> %f, node %d %f -> %f",
+		return fmt.Errorf("requests were served by other nodes: cluster %f -> %f, node %d %f -> %f",
 			e.ClusterCounter, clusterNow,
 			nodeID,
 			e.NodeRequests[nodeID], nodeNow,
 		)
 	}
+	return nil
 }

tests/slo/native/node_hints/main.go

Lines changed: 5 additions & 8 deletions
@@ -144,18 +144,15 @@ func main() {
 			go w.Write(ctx, &wg, writeRL, gen)
 		}
 		log.Println("started " + strconv.Itoa(cfg.WriteRPS) + " write workers")
-
-		metricsRL := rate.NewLimiter(rate.Every(time.Duration(cfg.ReportPeriod)*time.Millisecond), 1)
-		wg.Add(1)
-		go w.Metrics(ctx, &wg, metricsRL)
-
 		wg.Wait()
-		w.FailOnError()
 		// check all load is sent to a single node
 		ectx, ecancel := context.WithTimeout(context.Background(), 10*time.Second)
 		defer ecancel()
-		estimator.OnlyThisNode(ectx, nodeID)
-
+		err = estimator.OnlyThisNode(ectx, nodeID)
+		if err != nil {
+			w.ReportNodeHintMisses()
+			time.Sleep(w.ExportInterval())
+		}
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}
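
Net effect for the node_hints workload: a routing miss no longer panics the process. The workload increments workload.node_hints.misses, sleeps one export interval so the reading is pushed, and exits normally; the slo-report job then fails the run because the thresholds file sets critical_max: 0 for *_node_hints.misses*. A hypothetical sketch of that absolute-cap check (the real evaluation lives in the slo-report action, not in this commit):

```go
package main

import "fmt"

// classifyAbsolute mirrors the warning_max / critical_max caps from the
// thresholds file; the actual slo-report logic is not shown in this commit.
func classifyAbsolute(value, warningMax, criticalMax float64) string {
	switch {
	case value > criticalMax:
		return "critical"
	case value > warningMax:
		return "warning"
	default:
		return "ok"
	}
}

func main() {
	// critical_max is 0 for *_node_hints.misses*, so a single miss fails the run.
	fmt.Println(classifyAbsolute(1, 0, 0)) // critical
	// *_error* metrics warn above 0.1 and fail above 1.0.
	fmt.Println(classifyAbsolute(0.5, 0.1, 1.0)) // warning
}
```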

tests/slo/native/query/main.go

Lines changed: 0 additions & 1 deletion
@@ -143,7 +143,6 @@ func main() {
 		go w.Metrics(ctx, &wg, metricsRL)
 
 		wg.Wait()
-		w.FailOnError()
 	default:
 		panic(fmt.Errorf("unknown mode: %v", cfg.Mode))
 	}
