44 "context"
55 "errors"
66 "fmt"
7- "sync/atomic"
87 "time"
98
109 "github.com/ydb-platform/ydb-go-sdk/v3"
@@ -25,10 +24,11 @@ const (
2524
2625type (
2726 Metrics struct {
28- mp * sdkmetric.MeterProvider
29- meter otelmetric.Meter
30- ctx context.Context //nolint:containedctx
31- cancel context.CancelFunc
27+ mp * sdkmetric.MeterProvider
28+ meter otelmetric.Meter
29+ ctx context.Context //nolint:containedctx
30+ cancel context.CancelFunc
31+ ExportInterval time.Duration
3232
3333 // Labels for metrics
3434 ref string
@@ -46,11 +46,7 @@ type (
4646 retriesSuccessTotal otelmetric.Int64Counter
4747 retriesFailureTotal otelmetric.Int64Counter
4848 pendingOperations otelmetric.Int64UpDownCounter
49-
50- // Local counters for fail-on-error logic
51- errorsCount atomic.Int64
52- timeoutsCount atomic.Int64
53- opsCount atomic.Int64
49+ nodeHintMissesPresent otelmetric.Int64Counter
5450 }
5551)
5652
@@ -99,7 +95,7 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
9995 if exportInterval == 0 {
10096 exportInterval = 250 * time .Millisecond // Default 250ms
10197 }
102-
98+ m . ExportInterval = exportInterval
10399 m .mp = sdkmetric .NewMeterProvider (
104100 sdkmetric .WithResource (res ),
105101 sdkmetric .WithReader (
@@ -206,6 +202,13 @@ func New(endpoint, ref, label, jobName string, reportPeriodMs int) (*Metrics, er
206202 if err != nil {
207203 return nil , fmt .Errorf ("failed to create pendingOperations counter: %w" , err )
208204 }
205+ m .nodeHintMissesPresent , err = m .meter .Int64Counter (
206+ "workload.node_hints.misses" ,
207+ otelmetric .WithDescription ("Exclusively for node_hints SLO workload: Signals GRPC requests to wrong node" ),
208+ )
209+ if err != nil {
210+ return nil , fmt .Errorf ("failed to create nodeHintMissesPresent counter: %w" , err )
211+ }
209212
210213 return m , nil
211214}
@@ -220,11 +223,6 @@ func (m *Metrics) Push() error {
220223}
221224
222225func (m * Metrics ) Reset () error {
223- // Reset local counters for fail-on-error logic
224- m .errorsCount .Store (0 )
225- m .timeoutsCount .Store (0 )
226- m .opsCount .Store (0 )
227-
228226 // Note: OTel counters/gauges are cumulative and cannot be reset
229227 // This is just for local state
230228 return m .Push ()
@@ -264,6 +262,12 @@ func (m *Metrics) Start(name SpanName) Span {
264262 return j
265263}
266264
265+ func (m * Metrics ) ReportNodeHintMisses () {
266+ if m .meter != nil {
267+ m .nodeHintMissesPresent .Add (m .ctx , 1 )
268+ }
269+ }
270+
267271func (j Span ) Finish (err error , attempts int ) {
268272 if j .m .meter == nil {
269273 return
@@ -283,18 +287,13 @@ func (j Span) Finish(err error, attempts int) {
283287 j .m .operationsTotal .Add (j .m .ctx , 1 , otelmetric .WithAttributes (attrs ... ))
284288 j .m .retryAttemptsTotal .Add (j .m .ctx , int64 (attempts ), otelmetric .WithAttributes (attrs ... ))
285289
286- // Local counters for fail-on-error
287- j .m .opsCount .Add (1 )
288-
289290 if err != nil {
290291 if errors .Is (err , context .DeadlineExceeded ) {
291292 j .m .timeoutsTotal .Add (j .m .ctx , 1 , otelmetric .WithAttributes (attrs ... ))
292- j .m .timeoutsCount .Add (1 )
293293 }
294294 j .m .errorsTotal .Add (j .m .ctx , 1 , otelmetric .WithAttributes (
295295 j .m .commonAttrs (attribute .String ("error_type" , err .Error ()))... ,
296296 ))
297- j .m .errorsCount .Add (1 )
298297
299298 j .m .retriesFailureTotal .Add (j .m .ctx , int64 (attempts ), otelmetric .WithAttributes (attrs ... ))
300299 j .m .operationsFailureTotal .Add (j .m .ctx , 1 , otelmetric .WithAttributes (attrs ... ))
@@ -315,32 +314,3 @@ func (j Span) Finish(err error, attempts int) {
315314 ))
316315 }
317316}
318-
319- func (m * Metrics ) OperationsTotal () float64 {
320- return float64 (m .opsCount .Load ())
321- }
322-
323- func (m * Metrics ) ErrorsTotal () float64 {
324- return float64 (m .errorsCount .Load ())
325- }
326-
327- func (m * Metrics ) TimeoutsTotal () float64 {
328- return float64 (m .timeoutsCount .Load ())
329- }
330-
331- func (m * Metrics ) FailOnError () {
332- if m .ErrorsTotal ()* 20 > m .OperationsTotal () { // 95%
333- log .Panicf (
334- "unretriable (or not successfully retried) errors: %.0f errors out of %.0f operations" ,
335- m .ErrorsTotal (),
336- m .OperationsTotal (),
337- )
338- }
339- if m .TimeoutsTotal ()* 20 > m .OperationsTotal () {
340- log .Panicf (
341- "user timeouts: %.0f timeouts out of %.0f operations" ,
342- m .TimeoutsTotal (),
343- m .OperationsTotal (),
344- )
345- }
346- }
0 commit comments