Skip to content

Commit f54475a

Browse files
committed
feat(metrics): add scheduler attempt counter and outcome helper
Signed-off-by: CYJiang <googs1025@gmail.com>
1 parent 25cfb90 commit f54475a

File tree

4 files changed

+77
-1
lines changed

4 files changed

+77
-1
lines changed

pkg/epp/metrics/metrics.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,16 @@ var (
299299
[]string{},
300300
)
301301

302+
// SchedulerAttemptsTotal counts the total number of scheduling attempts, labeled by status.
303+
SchedulerAttemptsTotal = prometheus.NewCounterVec(
304+
prometheus.CounterOpts{
305+
Subsystem: InferenceExtension,
306+
Name: "scheduler_attempts_total",
307+
Help: metricsutil.HelpMsgWithStability("Total number of scheduling attempts.", compbasemetrics.ALPHA),
308+
},
309+
[]string{"status"}, // "success", "failure"
310+
)
311+
302312
PluginProcessingLatencies = prometheus.NewHistogramVec(
303313
prometheus.HistogramOpts{
304314
Subsystem: InferenceExtension,
@@ -409,6 +419,7 @@ func Register(customCollectors ...prometheus.Collector) {
409419
metrics.Registry.MustRegister(inferencePoolAvgQueueSize)
410420
metrics.Registry.MustRegister(inferencePoolReadyPods)
411421
metrics.Registry.MustRegister(SchedulerE2ELatency)
422+
metrics.Registry.MustRegister(SchedulerAttemptsTotal)
412423
metrics.Registry.MustRegister(PluginProcessingLatencies)
413424
metrics.Registry.MustRegister(InferenceExtensionInfo)
414425
metrics.Registry.MustRegister(PrefixCacheSize)
@@ -453,6 +464,7 @@ func Reset() {
453464
inferencePoolAvgQueueSize.Reset()
454465
inferencePoolReadyPods.Reset()
455466
SchedulerE2ELatency.Reset()
467+
SchedulerAttemptsTotal.Reset()
456468
PluginProcessingLatencies.Reset()
457469
InferenceExtensionInfo.Reset()
458470
PrefixCacheSize.Reset()
@@ -462,7 +474,7 @@ func Reset() {
462474
flowControlQueueSize.Reset()
463475
}
464476

465-
// RecordRequstCounter records the number of requests.
477+
// RecordRequestCounter increments the request counter by one for the given
// modelName and targetModelName label values.
466478
func RecordRequestCounter(modelName, targetModelName string) {
467479
requestCounter.WithLabelValues(modelName, targetModelName).Inc()
468480
}
@@ -684,6 +696,20 @@ func RecordSchedulerE2ELatency(duration time.Duration) {
684696
SchedulerE2ELatency.WithLabelValues().Observe(duration.Seconds())
685697
}
686698

699+
// RecordSchedulerAttempt records a scheduling attempt with status.
700+
func RecordSchedulerAttempt(err error) {
701+
if err != nil {
702+
SchedulerAttemptsTotal.WithLabelValues(SchedulerStatusFailure).Inc()
703+
} else {
704+
SchedulerAttemptsTotal.WithLabelValues(SchedulerStatusSuccess).Inc()
705+
}
706+
}
707+
708+
// Values used for the "status" label of SchedulerAttemptsTotal by RecordSchedulerAttempt.
const (
709+
// SchedulerStatusSuccess is recorded when the attempt's error is nil.
SchedulerStatusSuccess = "success"
710+
// SchedulerStatusFailure is recorded when the attempt's error is non-nil.
SchedulerStatusFailure = "failure"
711+
)
712+
687713
// RecordPluginProcessingLatency records the processing latency for a plugin.
688714
func RecordPluginProcessingLatency(extensionPoint, pluginType, pluginName string, duration time.Duration) {
689715
PluginProcessingLatencies.WithLabelValues(extensionPoint, pluginType, pluginName).Observe(duration.Seconds())

pkg/epp/metrics/metrics_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package metrics
1818

1919
import (
2020
"context"
21+
"errors"
2122
"os"
2223
"testing"
2324
"time"
@@ -684,6 +685,50 @@ func TestSchedulerE2ELatency(t *testing.T) {
684685
}
685686
}
686687

688+
func TestSchedulerAttemptsTotal(t *testing.T) {
689+
690+
scenarios := []struct {
691+
name string
692+
successCount int
693+
failureCount int
694+
}{
695+
{
696+
name: "mixed success and failure attempts",
697+
successCount: 10,
698+
failureCount: 5,
699+
},
700+
}
701+
702+
for _, scenario := range scenarios {
703+
t.Run(scenario.name, func(t *testing.T) {
704+
Reset()
705+
for i := 0; i < scenario.successCount; i++ {
706+
RecordSchedulerAttempt(nil)
707+
}
708+
for i := 0; i < scenario.failureCount; i++ {
709+
RecordSchedulerAttempt(errors.New("simulated scheduling failure"))
710+
}
711+
712+
wantMetrics, err := os.Open("testdata/scheduler_attempts_total_metrics")
713+
defer func() {
714+
if err = wantMetrics.Close(); err != nil {
715+
t.Error(err)
716+
}
717+
}()
718+
if err != nil {
719+
t.Fatal(err)
720+
}
721+
if err := testutil.GatherAndCompare(
722+
metrics.Registry,
723+
wantMetrics,
724+
"inference_extension_scheduler_attempts_total",
725+
); err != nil {
726+
t.Errorf("metric comparison failed: %v", err)
727+
}
728+
})
729+
}
730+
}
731+
687732
func TestPrefixCacheMetrics(t *testing.T) {
688733
Reset()
689734
const (
pkg/epp/metrics/testdata/scheduler_attempts_total_metrics

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# HELP inference_extension_scheduler_attempts_total [ALPHA] Total number of scheduling attempts.
2+
# TYPE inference_extension_scheduler_attempts_total counter
3+
inference_extension_scheduler_attempts_total{status="failure"} 5
4+
inference_extension_scheduler_attempts_total{status="success"} 10

pkg/epp/scheduling/scheduler.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ func (s *Scheduler) Schedule(ctx context.Context, request *types.LLMRequest, can
8787
before := time.Now()
8888
result, err := s.profileHandler.ProcessResults(ctx, cycleState, request, profileRunResults)
8989
metrics.RecordPluginProcessingLatency(framework.ProcessProfilesResultsExtensionPoint, s.profileHandler.TypedName().Type, s.profileHandler.TypedName().Name, time.Since(before))
90+
metrics.RecordSchedulerAttempt(err)
9091
loggerVerbose.Info("Completed running profile handler ProcessResults successfully", "plugin", s.profileHandler.TypedName())
9192

9293
return result, err

0 commit comments

Comments
 (0)