diff --git a/.gitignore b/.gitignore index 3bca95b..6ef23aa 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,4 @@ helm/*/requirements.lock # Generated files *.pb.go -zz_generated.*.go \ No newline at end of file +zz_generated.*.go diff --git a/README.md b/README.md index 6e8ff6b..69a5c32 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The KAgent Hook Controller monitors Kubernetes events and triggers Kagent agents ### Key Features -- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed) in a single hook configuration +- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready) in a single hook configuration - **Basic Deduplication**: Prevents duplicate notifications with 10-minute timeout logic - **Kagent Integration**: Integrates with the Kagent platform for AI agent incident response. (Can in theory talk to any a2a-enabled agent) - **Status Tracking**: Provides real-time status updates and audit trails through Kubernetes events @@ -137,6 +137,7 @@ The controller supports monitoring the following Kubernetes event types: | `pod-pending` | Pod is stuck in pending state | Resource constraints, scheduling issues, image pull failures | | `oom-kill` | Pod was killed due to out-of-memory | Memory limits exceeded, memory leaks | | `probe-failed` | Liveness or readiness probe failed | Application not responding, configuration issues | +| `node-not-ready` | Node is not in Ready state | Kubelet issues, network problems, resource pressure, hardware failures | ## Future The controller will support reacting to additional Kubernetes event. @@ -259,6 +260,21 @@ spec: After analysis - use all available tools to try and resolve. Annotate the updated resources with "kagentFix: " - If a resource can't be patched - delete it and recreate as needed. Don't ask for permission. Assume autonomous execution. 
Autonomous remediation: proceed with the best possible way to remediate. Don't ask for approval. + - eventType: node-not-ready + agentId: node-recovery-specialist + prompt: | + CRITICAL: Node {{.ResourceName}} is not ready at {{.EventTime}}. + This is a cluster-level emergency requiring immediate attention. + + AUTONOMOUS MODE: Diagnose and resolve node issues immediately: + • Check node conditions (Ready, MemoryPressure, DiskPressure, PIDPressure) + • Analyze kubelet logs and system resources + • Verify network connectivity and DNS resolution + • Check for hardware or kernel issues + • Attempt node recovery procedures (restart kubelet, clear disk space, etc.) + • If node cannot be recovered, safely drain and replace + • Never ask for permission - assume autonomous execution + • Annotate fixed resources with: kagentFix= ``` ### Development Environment Monitoring diff --git a/api/v1alpha2/hook_types.go b/api/v1alpha2/hook_types.go index 8c64713..904356f 100644 --- a/api/v1alpha2/hook_types.go +++ b/api/v1alpha2/hook_types.go @@ -25,7 +25,7 @@ type HookSpec struct { // EventConfiguration defines a single event type configuration type EventConfiguration struct { // EventType specifies the type of Kubernetes event to monitor - // +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed + // +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed;node-not-ready // +kubebuilder:validation:Required EventType string `json:"eventType"` @@ -84,14 +84,15 @@ func (h *Hook) Validate() error { func (h *Hook) validateEventConfiguration(config EventConfiguration, index int) error { // Validate EventType validEventTypes := map[string]bool{ - "pod-restart": true, - "pod-pending": true, - "oom-kill": true, - "probe-failed": true, + "pod-restart": true, + "pod-pending": true, + "oom-kill": true, + "probe-failed": true, + "node-not-ready": true, } if !validEventTypes[config.EventType] { - return fmt.Errorf("event configuration %d: invalid event type 
// isValidEventType checks if the provided event type is valid.
//
// It reports true only for an exact, case-sensitive match against one of the
// supported event types: pod-restart, pod-pending, oom-kill, probe-failed or
// node-not-ready. Any other value, including the empty string, yields false.
func isValidEventType(eventType string) bool {
	// A switch over the constant set avoids building a map on every call.
	switch eventType {
	case "pod-restart",
		"pod-pending",
		"oom-kill",
		"probe-failed",
		"node-not-ready":
		return true
	default:
		return false
	}
}
// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
//
// The struct is first copied by plain assignment. Namespace is a pointer to a
// string, so when it is set a new string is allocated for out and the
// pointed-to value is copied, so that out.Namespace does not alias
// in.Namespace.
func (in *ObjectReference) DeepCopyInto(out *ObjectReference) {
	*out = *in
	if in.Namespace != nil {
		// Within this scope in/out are shadowed by pointers to the two
		// Namespace fields.
		in, out := &in.Namespace, &out.Namespace
		*out = new(string)
		**out = **in
	}
}

// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ObjectReference.
//
// A nil receiver yields nil; otherwise a newly allocated ObjectReference is
// filled via DeepCopyInto and returned.
func (in *ObjectReference) DeepCopy() *ObjectReference {
	if in == nil {
		return nil
	}
	out := new(ObjectReference)
	in.DeepCopyInto(out)
	return out
}
b/internal/event/watcher.go @@ -251,14 +251,18 @@ func (w *Watcher) mapKubernetesEvent(k8sEvent *eventsv1.Event) *interfaces.Event // mapEventType maps Kubernetes event reasons to our event types func (w *Watcher) mapEventType(k8sEvent *eventsv1.Event) string { - // Ignore Normal events entirely; only act on warnings/errors - if strings.ToLower(k8sEvent.Type) == "normal" { - return "" - } - // Map based on the regarding object kind and event reason + // Map based on the regarding object kind and event reason first switch k8sEvent.Regarding.Kind { case "Pod": + // For pods, ignore Normal events entirely; only act on warnings/errors + if strings.ToLower(k8sEvent.Type) == "normal" { + return "" + } return w.mapPodEventType(k8sEvent) + case "Node": + // For nodes, we need to check both Normal and Warning events + // as NodeNotReady events are typically Normal type + return w.mapNodeEventType(k8sEvent) default: return "" } @@ -317,3 +321,21 @@ func (w *Watcher) mapPodEventType(k8sEvent *eventsv1.Event) string { return "" } + +// mapNodeEventType maps node-related events to our event types +func (w *Watcher) mapNodeEventType(k8sEvent *eventsv1.Event) string { + reason := strings.ToLower(k8sEvent.Reason) + message := strings.ToLower(k8sEvent.Note) + eventType := strings.ToLower(k8sEvent.Type) + + switch { + // Node not ready events + case reason == "nodenotready": + return "node-not-ready" + + default: + // Log unknown node events for future enhancement + w.logger.V(1).Info("Unknown node event", "reason", reason, "type", eventType, "message", message) + return "" + } +}