diff --git a/docs/checks/commands/check_cpu_utilization.md b/docs/checks/commands/check_cpu_utilization.md index 1a5b7051..45e2d059 100644 --- a/docs/checks/commands/check_cpu_utilization.md +++ b/docs/checks/commands/check_cpu_utilization.md @@ -20,8 +20,8 @@ Checks the cpu utilization metrics. ### Default Check - check_cpu_utilization - OK - user: 29% - system: 11% - iowait: 3% - steal: 0% - guest: 0% |'user'=28.83%;;;0;... + check_cpu_utilization +OK - user: 2% - system: 1% - iowait: 0% - steal: 0% - guest: 0 - idle: 96% |'total'=3.4%;90;95;0; 'user'=2.11%;;;0;... ### Example using NRPE and Naemon @@ -41,15 +41,15 @@ Naemon Config ## Argument Defaults -| Argument | Default Value | -| ------------- | --------------------------------------------------------------------------------------------------- | -| warning | total > 90 | -| critical | total > 95 | -| empty-state | 0 (OK) | -| empty-syntax | | -| top-syntax | \${status} - \${list} | -| ok-syntax | | -| detail-syntax | user: \${user}% - system: \${system}% - iowait: \${iowait}% - steal: \${steal}% - guest: \${guest}% | +| Argument | Default Value | +| ------------- | ----------------------------------------------------------------------------------------------------- | +| warning | total > 90 | +| critical | total > 95 | +| empty-state | 0 (OK) | +| empty-syntax | | +| top-syntax | \${status} - \${list} | +| ok-syntax | | +| detail-syntax | user: \${user}% - system: \${system}% - iowait: \${iowait}% - steal: \${steal}% - guest: \${guest} - idle: %{idle}% | ## Check Specific Arguments @@ -71,3 +71,4 @@ these can be used in filters and thresholds (along with the default attributes): | iowait | IOWait cpu utilization in percent | | steal | Steal cpu utilization in percent | | guest | Guest cpu utilization in percent | +| idle | Idle cpu utilization in percent | diff --git a/pkg/counter/counter.go b/pkg/counter/counter.go index 3c515112..4eeaf5af 100644 --- a/pkg/counter/counter.go +++ b/pkg/counter/counter.go @@ -1,6 +1,7 @@ package counter import ( + "fmt" "math" "sync" "time" @@ -9,10 +10,14 @@ import ( // Counter is the container for a single timeseries of performance values // it used a fixed size storage backend type Counter struct { - lock sync.RWMutex // lock for concurrent access - data []Value // array of values - current int64 // position of last inserted value - size int64 // number of values for this series + lock sync.RWMutex // lock for concurrent access + data []Value // array of values. size determined by the retention and interval + current int64 // position of last inserted value + oldest int64 // position of the earliest inserted value + size int64 // number of values for this series + timesSet int64 // number of times a value was set in this counter + retention time.Duration // the time span this counter can hold, interval * size + interval time.Duration // the interval time that new values are designed to be added } // Value is a single entry of a Counter @@ -24,30 +29,43 @@ type Value struct { // NewCounter creates a new Counter with given retention time and interval func NewCounter(retentionTime, interval time.Duration) *Counter { // round retention and interval to milliseconds - retentionMilli := retentionTime.Milliseconds() - intervalMilli := interval.Milliseconds() + retentionMili := retentionTime.Milliseconds() + intervalMili := interval.Milliseconds() - // round retention time to a multiple of interval - retention := int64(math.Ceil(float64(retentionMilli)/float64(intervalMilli))) * intervalMilli - size := retention / intervalMilli + // round retentionMili to a multiple of interval + retentionMiliRounded := int64(math.Ceil(float64(retentionMili)/float64(intervalMili))) * intervalMili + size := retentionMiliRounded / intervalMili return &Counter{ - lock: sync.RWMutex{}, - data: make([]Value, size), - size: size, - current: -1, + lock: sync.RWMutex{}, + data: make([]Value, size), + size: size, + current: -1, + oldest: -1, + retention: time.Duration(retentionMiliRounded) * time.Millisecond, + interval: interval, + timesSet: 0, } } // Set adds a new value with current timestamp func (c *Counter) Set(val any) { c.lock.Lock() + // setting a value for the first time + if c.oldest == -1 { + c.oldest = 0 + } c.current++ if c.current == c.size { c.current = 0 } c.data[c.current].UnixMilli = time.Now().UTC().UnixMilli() c.data[c.current].Value = val + c.timesSet++ + // if we already filled the array, and started overwriting, the oldest index just got overwritten + if c.timesSet > c.size { + c.oldest = (c.current + 1) % c.size + } c.lock.Unlock() } @@ -66,7 +84,8 @@ func (c *Counter) AvgForDuration(duration time.Duration) float64 { if idx == -1 { return 0 } - for seen := int64(0); seen <= c.size; seen++ { + + for range c.size { if c.data[idx].UnixMilli > useAfter { if val, ok := c.data[idx].Value.(float64); ok { sum += val @@ -145,35 +164,73 @@ func (c *Counter) getLast() *Value { return &c.data[c.current] } -// GetAt returns first value closest to given date -func (c *Counter) GetAt(useAfter time.Time) *Value { +// GetFirst returns first (earliest) value +func (c *Counter) GetFirst() *Value { + c.lock.RLock() + defer c.lock.RUnlock() + + return c.getFirst() +} + +func (c *Counter) getFirst() *Value { + // the latest added item had index c.current + if c.oldest == -1 { + return nil + } + + return &c.data[c.oldest] +} + +// GetAt returns first value with >= timestamp than lowerBound +func (c *Counter) GetAt(lowerBound time.Time) *Value { c.lock.RLock() defer c.lock.RUnlock() - return c.getAt(useAfter) + return c.getAt(lowerBound) } -func (c *Counter) getAt(useAfter time.Time) *Value { - useAfterUnix := useAfter.UTC().UnixMilli() +// Gets the first counter that has a >= timestamp than lowerBound +func (c *Counter) getAt(lowerBound time.Time) *Value { + useAfterUnix := lowerBound.UTC().UnixMilli() + + // the counter is not yet populated idx := c.current if idx == -1 { return nil } - var last *Value - for seen := int64(0); seen <= c.size; seen++ { - val := &c.data[idx] - if val.UnixMilli < useAfterUnix { - return last + var previouslyComparedValue *Value + for range c.size { + currentValue := &c.data[idx] + if currentValue.UnixMilli < useAfterUnix { + return previouslyComparedValue } - last = val + + previouslyComparedValue = currentValue idx-- if idx < 0 { idx = c.size - 1 } } - return last + return previouslyComparedValue +} + +// checks if the counter can fit the targetRetention. optionally extend the interval by count in the check +func (c *Counter) CheckRetention(targetRetention time.Duration, intervalExtensionCount int64) error { + extendedRetentionRange := c.retention + time.Duration(intervalExtensionCount)*c.interval + + if extendedRetentionRange < targetRetention { + if intervalExtensionCount == 0 { + return fmt.Errorf("counter retention range is %f seconds, less than the target retention range of %f seconds", + extendedRetentionRange.Seconds(), targetRetention.Seconds()) + } + + return fmt.Errorf("counter retention range is %f seconds, even when extended by %d intervals to be %f seconds, it is less than target retention range of %f seconds", + c.interval.Seconds(), intervalExtensionCount, extendedRetentionRange.Seconds(), targetRetention.Seconds()) + } + + return nil } // Float64 returns value as float64 diff --git a/pkg/counter/counter_test.go b/pkg/counter/counter_test.go index 068a17a7..f53ccbee 100644 --- a/pkg/counter/counter_test.go +++ b/pkg/counter/counter_test.go @@ -5,6 +5,7 @@ import ( "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestCounter(t *testing.T) { @@ -55,3 +56,96 @@ func TestCounter(t *testing.T) { set.Delete("test", "key") assert.Emptyf(t, set.counter, "set is empty now") } + +func TestCounter2(t *testing.T) { + set := NewCounterSet() + + retention := time.Millisecond * 4500 + interval := time.Second + set.Create("test", "key", retention, interval) + + // empty counter + counter := set.Get("test", "key") + latest := counter.getLast() + oldest := counter.getFirst() + assert.Nil(t, latest, "calling latest on empty counter should return nil") + assert.Nil(t, oldest, "calling oldest on empty counter should return nil") + + // check the retention for 4 seconds + retentionCheck1 := counter.CheckRetention(time.Second*4, 0) + require.NoError(t, retentionCheck1, "the counter should be able to hold 4 seconds") + + // check the retention for 5 seconds + retentionCheck2 := counter.CheckRetention(time.Second*5, 0) + require.NoError(t, retentionCheck2, "the counter should be able to hold 5 seconds") + + // check the retention for 6 seconds + retentionCheck3 := counter.CheckRetention(time.Second*6, 0) + require.Error(t, retentionCheck3, "the counter should not be able to hold 6 seconds") + + // check the retention for 10 seconds + retentionCheck4 := counter.CheckRetention(time.Second*10, 0) + require.Error(t, retentionCheck4, "the counter should not be able to hold 10 seconds") + + // check the retention for 1 minute with 10 extensions + retentionCheck5 := counter.CheckRetention(time.Minute, 10) + require.Error(t, retentionCheck5, "the counter should not be able to hold 1 minute with 10 interval extensions") + + // check the retention for 1 minute with 100 extensions + retentionCheck6 := counter.CheckRetention(time.Minute, 100) + require.NoError(t, retentionCheck6, "the counter should be able to hold 1 minute with 10 interval extensions") + + // 1 _ _ _ _ + counter.Set(float64(1)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(1), latest.Value, 0.001, "latest element should be 1") + assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1") + + // 1 2 _ _ _ + counter.Set(float64(2)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(2), latest.Value, 0.001, "latest element should be 2") + assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1") + + // 1 2 3 _ _ + counter.Set(float64(3)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(3), latest.Value, 0.001, "latest element should be 3") + assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1") + + // 1 2 3 4 _ + counter.Set(float64(4)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(4), latest.Value, 0.001, "latest element should be 4") + assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1") + + // 1 2 3 4 5 + counter.Set(float64(5)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(5), latest.Value, 0.001, "latest element should be 5") + assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1") + + // check the average now + avg := counter.AvgForDuration(time.Minute) + assert.InEpsilon(t, 3, avg, 0.001, "average of 1,2,3,4,5 is 3") + + // started overwriting from the first index, the c.oldest should update + // 6 2 3 4 5 + counter.Set(float64(6)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(6), latest.Value, 0.001, "latest element should be 6") + assert.InEpsilon(t, float64(2), oldest.Value, 0.001, "oldest element should be 2") + + // 6 7 3 4 5 + counter.Set(float64(7)) + latest = counter.getLast() + oldest = counter.getFirst() + assert.InEpsilon(t, float64(7), latest.Value, 0.001, "latest element should be 7") + assert.InEpsilon(t, float64(3), oldest.Value, 0.001, "oldest element should be 3") +} diff --git a/pkg/snclient/check_cpu_utilization.go b/pkg/snclient/check_cpu_utilization.go index db277442..f1c20ec1 100644 --- a/pkg/snclient/check_cpu_utilization.go +++ b/pkg/snclient/check_cpu_utilization.go @@ -21,6 +21,7 @@ type CPUUtilizationResult struct { iowait float64 steal float64 guest float64 + idle float64 } type CheckCPUUtilization struct { @@ -49,7 +50,7 @@ func (l *CheckCPUUtilization) Build() *CheckData { defaultWarning: "total > 90", defaultCritical: "total > 95", topSyntax: "${status} - ${list}", - detailSyntax: "user: ${user}% - system: ${system}% - iowait: ${iowait}% - steal: ${steal}% - guest: ${guest}%", + detailSyntax: "user: ${user}% - system: ${system}% - iowait: ${iowait}% - steal: ${steal}% - guest: ${guest} - idle: %{idle}%", attributes: []CheckAttribute{ {name: "total", description: "Sum of user,system,iowait,steal and guest in percent", unit: UPercent}, {name: "user", description: "User cpu utilization in percent", unit: UPercent}, @@ -57,10 +58,11 @@ func (l *CheckCPUUtilization) Build() *CheckData { {name: "iowait", description: "IOWait cpu utilization in percent", unit: UPercent}, {name: "steal", description: "Steal cpu utilization in percent", unit: UPercent}, {name: "guest", description: "Guest cpu utilization in percent", unit: UPercent}, + {name: "idle", description: "Idle cpu utilization in percent", unit: UPercent}, }, exampleDefault: ` - check_cpu_utilization - OK - user: 29% - system: 11% - iowait: 3% - steal: 0% - guest: 0% |'user'=28.83%;;;0;... + check_cpu_utilization +OK - user: 2% - system: 1% - iowait: 0% - steal: 0% - guest: 0 - idle: 96% |'total'=3.4%;90;95;0; 'user'=2.11%;;;0;... `, exampleArgs: `'warn=total > 90%' 'crit=total > 95%'`, } @@ -86,6 +88,7 @@ func (l *CheckCPUUtilization) Check(_ context.Context, snc *Agent, check *CheckD return check.Finalize() } +//nolint:funlen // The function is simple enough, the length comes from many fields to add. func (l *CheckCPUUtilization) addCPUUtilizationMetrics(check *CheckData, scanLookBack uint64) { entry := map[string]string{ "total": "0", @@ -94,6 +97,7 @@ func (l *CheckCPUUtilization) addCPUUtilizationMetrics(check *CheckData, scanLoo "iowait": "0", "steal": "0", "guest": "0", + "idle": "0", } check.listData = append(check.listData, entry) @@ -108,6 +112,7 @@ func (l *CheckCPUUtilization) addCPUUtilizationMetrics(check *CheckData, scanLoo entry["iowait"] = fmt.Sprintf("%.f", cpuMetrics.iowait) entry["steal"] = fmt.Sprintf("%.f", cpuMetrics.steal) entry["guest"] = fmt.Sprintf("%.f", cpuMetrics.guest) + entry["idle"] = fmt.Sprintf("%.f", cpuMetrics.idle) check.result.Metrics = append(check.result.Metrics, &CheckMetric{ @@ -158,45 +163,99 @@ func (l *CheckCPUUtilization) addCPUUtilizationMetrics(check *CheckData, scanLoo Critical: check.critThreshold, Min: &Zero, }, + &CheckMetric{ + Name: "idle", + Value: utils.ToPrecision(cpuMetrics.idle, 2), + Unit: "%", + Warning: check.warnThreshold, + Critical: check.critThreshold, + Min: &Zero, + }, ) } +//nolint:funlen // moving these statements to new helper functions would be illogical func (l *CheckCPUUtilization) getMetrics(scanLookBack uint64) (res *CPUUtilizationResult, ok bool) { res = &CPUUtilizationResult{} - counter1 := l.snc.Counter.Get("cpuinfo", "info") - counter2 := l.snc.Counter.Get("cpuinfo", "info") - if counter1 == nil || counter2 == nil { + counter := l.snc.Counter.Get("cpuinfo", "info") + if counter == nil { return nil, false } scanLookBack64, err := convert.Int64E(scanLookBack) if err != nil { - log.Warnf("failed to convert scan look back: %s", err.Error()) + log.Warnf("failed to convert scan look back to int64: %s", err.Error()) + + return nil, false + } + + if err = counter.CheckRetention(time.Second*time.Duration(scanLookBack64), 0); err != nil { + log.Tracef("cpuinfo counter can not hold the query range: %s", err.Error()) + } + + if err = counter.CheckRetention(time.Second*time.Duration(scanLookBack64), 100); err != nil { + log.Warnf("cpuinfo counter can not hold the query range even when extended: %s", err.Error()) return nil, false } - cpuinfo1 := counter1.GetLast() - cpuinfo2 := counter2.GetAt(time.Now().Add(-time.Duration(scanLookBack64) * time.Second)) - if cpuinfo1 == nil || cpuinfo2 == nil { + cpuinfoLatest := counter.GetLast() + cpuinfoOldest := counter.GetFirst() + if cpuinfoLatest == nil { + log.Warnf("latest cpuinfo value seems to be null. counter might not be populated yet.") + + return nil, false + } + if cpuinfoOldest == nil { + log.Warnf("oldest cpuinfo value seems to be null. counter might not be populated yet.") + return nil, false } - if cpuinfo1.UnixMilli < cpuinfo2.UnixMilli { + cpuinfoCounterDuration := cpuinfoLatest.UnixMilli - cpuinfoOldest.UnixMilli + if cpuinfoCounterDuration < scanLookBack64*1000 { + log.Tracef("cpuinfo counter has %d ms between its latest and oldest value, cannot properly provide %d s range of query", cpuinfoCounterDuration, scanLookBack) + + // Optionally we can wait on this thread while other threads fill the counter up. + } + + cpuinfoLookBackAgo := counter.GetAt(time.Now().Add(-time.Duration(scanLookBack64) * time.Second)) + if cpuinfoLookBackAgo == nil { + log.Warnf("cpuinfo value search with lower bound of now-%d seconds returned null", scanLookBack) + return nil, false } - duration := float64(cpuinfo1.UnixMilli - cpuinfo2.UnixMilli) - if duration <= 0 { + + duration := float64(cpuinfoLatest.UnixMilli - cpuinfoLookBackAgo.UnixMilli) + acceptableDurationMultipler := 0.5 + minimumAcceptableDuration := float64(scanLookBack) * 1000 * acceptableDurationMultipler + + switch { + case duration <= 0: + // This case might happen if there is only one counter value so far + log.Tracef("counter query from now-%d seconds <-> latest returned a range of %f ms. This is not positive, there might not be enough values recorded yet.", scanLookBack, duration) + return nil, false + case duration < minimumAcceptableDuration: + log.Tracef("counter query from now-%d seconds <-> latest returned a range of %f ms. This is not bellow the acceptable range, the data may be unrepresentative. "+ + "acceptableDurationMultipler * scanLookBack seconds : %f * %f = %f ", + scanLookBack, duration, acceptableDurationMultipler, float64(scanLookBack), minimumAcceptableDuration) + + // Optionally we can return an empty result here + case duration <= float64(scanLookBack)*1000: + log.Tracef("counter query from now-%d seconds <-> latest returned a range of %f ms. This is in the acceptable range, the data should be representative. "+ + "acceptableDurationMultipler * scanLookBack seconds : %f * %f = %f ", + scanLookBack, duration, acceptableDurationMultipler, float64(scanLookBack), minimumAcceptableDuration) + default: + log.Tracef("counter query from now-%d seconds <-> latest returned a range of %f ms. This is higher than the query range and something must have gone wrong.", scanLookBack, duration) } - duration /= 1e3 // cpu times are measured in seconds - info1, ok := cpuinfo1.Value.(*cpuinfo.TimesStat) + info1, ok := cpuinfoLatest.Value.(*cpuinfo.TimesStat) if !ok { return nil, false } - info2, ok := cpuinfo2.Value.(*cpuinfo.TimesStat) + info2, ok := cpuinfoLookBackAgo.Value.(*cpuinfo.TimesStat) if !ok { return nil, false } @@ -208,11 +267,13 @@ func (l *CheckCPUUtilization) getMetrics(scanLookBack uint64) (res *CPUUtilizati return nil, false } - res.user = (((info1.User - info2.User) / duration) * 100) / float64(numCPU) - res.system = (((info1.System - info2.System) / duration) * 100) / float64(numCPU) - res.iowait = (((info1.Iowait - info2.Iowait) / duration) * 100) / float64(numCPU) - res.steal = (((info1.Steal - info2.Steal) / duration) * 100) / float64(numCPU) - res.guest = (((info1.Guest - info2.Guest) / duration) * 100) / float64(numCPU) + durationInS := duration / 1e3 // cpu times are measured in seconds + res.user = (((info1.User - info2.User) / durationInS * 100) / float64(numCPU)) + res.system = (((info1.System - info2.System) / durationInS * 100) / float64(numCPU)) + res.iowait = (((info1.Iowait - info2.Iowait) / durationInS * 100) / float64(numCPU)) + res.steal = (((info1.Steal - info2.Steal) / durationInS * 100) / float64(numCPU)) + res.guest = (((info1.Guest - info2.Guest) / durationInS * 100) / float64(numCPU)) + res.idle = (((info1.Idle - info2.Idle) / durationInS * 100) / float64(numCPU)) res.total = (res.user + res.system + res.iowait) return res, true