Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@ helm/*/requirements.lock

# Generated files
*.pb.go
zz_generated.*.go
zz_generated.*.go
18 changes: 17 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ The KAgent Hook Controller monitors Kubernetes events and triggers Kagent agents

### Key Features

- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed) in a single hook configuration
- **Multi-Event Monitoring**: Monitor multiple Kubernetes event types (pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready) in a single hook configuration
- **Basic Deduplication**: Prevents duplicate notifications with 10-minute timeout logic
- **Kagent Integration**: Integrates with the Kagent platform for AI agent incident response (in theory, it can talk to any A2A-enabled agent)
- **Status Tracking**: Provides real-time status updates and audit trails through Kubernetes events
Expand Down Expand Up @@ -137,6 +137,7 @@ The controller supports monitoring the following Kubernetes event types:
| `pod-pending` | Pod is stuck in pending state | Resource constraints, scheduling issues, image pull failures |
| `oom-kill` | Pod was killed due to out-of-memory | Memory limits exceeded, memory leaks |
| `probe-failed` | Liveness or readiness probe failed | Application not responding, configuration issues |
| `node-not-ready` | Node is not in Ready state | Kubelet issues, network problems, resource pressure, hardware failures |

## Future
The controller will support reacting to additional Kubernetes events.
Expand Down Expand Up @@ -259,6 +260,21 @@ spec:
After analysis - use all available tools to try and resolve. Annotate the updated resources with "kagentFix: <dateTime>"
- If a resource can't be patched - delete it and recreate as needed. Don't ask for permission. Assume autonomous execution.
Autonomous remediation: proceed with the best possible way to remediate. Don't ask for approval.
- eventType: node-not-ready
agentId: node-recovery-specialist
prompt: |
CRITICAL: Node {{.ResourceName}} is not ready at {{.EventTime}}.
This is a cluster-level emergency requiring immediate attention.

AUTONOMOUS MODE: Diagnose and resolve node issues immediately:
• Check node conditions (Ready, MemoryPressure, DiskPressure, PIDPressure)
• Analyze kubelet logs and system resources
• Verify network connectivity and DNS resolution
• Check for hardware or kernel issues
• Attempt node recovery procedures (restart kubelet, clear disk space, etc.)
• If node cannot be recovered, safely drain and replace
• Never ask for permission - assume autonomous execution
• Annotate fixed resources with: kagentFix=<timestamp>
```

### Development Environment Monitoring
Expand Down
24 changes: 13 additions & 11 deletions api/v1alpha2/hook_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ type HookSpec struct {
// EventConfiguration defines a single event type configuration
type EventConfiguration struct {
// EventType specifies the type of Kubernetes event to monitor
// +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed
// +kubebuilder:validation:Enum=pod-restart;pod-pending;oom-kill;probe-failed;node-not-ready
// +kubebuilder:validation:Required
EventType string `json:"eventType"`

Expand Down Expand Up @@ -84,14 +84,15 @@ func (h *Hook) Validate() error {
func (h *Hook) validateEventConfiguration(config EventConfiguration, index int) error {
// Validate EventType
validEventTypes := map[string]bool{
"pod-restart": true,
"pod-pending": true,
"oom-kill": true,
"probe-failed": true,
"pod-restart": true,
"pod-pending": true,
"oom-kill": true,
"probe-failed": true,
"node-not-ready": true,
}

if !validEventTypes[config.EventType] {
return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", index, config.EventType)
return fmt.Errorf("event configuration %d: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", index, config.EventType)
}

// Validate AgentRef
Expand Down Expand Up @@ -390,7 +391,7 @@ func validateHook(hook *Hook) (admission.Warnings, error) {

// Validate event type
if !isValidEventType(config.EventType) {
allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed", i, config.EventType))
allErrs = append(allErrs, fmt.Sprintf("spec.eventConfigurations[%d].eventType: invalid event type '%s', must be one of: pod-restart, pod-pending, oom-kill, probe-failed, node-not-ready", i, config.EventType))
}

// Validate agentId is not empty
Expand Down Expand Up @@ -419,10 +420,11 @@ func validateHook(hook *Hook) (admission.Warnings, error) {
// isValidEventType reports whether eventType is one of the event types a hook
// may be configured with: pod-restart, pod-pending, oom-kill, probe-failed or
// node-not-ready. The match is exact and case-sensitive.
//
// A switch is used instead of a map literal so that each accepted value is
// listed exactly once; repeating a constant key in a map literal does not
// compile.
func isValidEventType(eventType string) bool {
	switch eventType {
	case "pod-restart", "pod-pending", "oom-kill", "probe-failed", "node-not-ready":
		return true
	default:
		return false
	}
}
43 changes: 43 additions & 0 deletions api/v1alpha2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions config/crd/bases/kagent.dev_hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ spec:
- pod-pending
- oom-kill
- probe-failed
- node-not-ready
type: string
prompt:
description: Prompt specifies the prompt template to send to
Expand Down
1 change: 1 addition & 0 deletions helm/khook-crds/crds/kagent.dev_hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ spec:
- pod-pending
- oom-kill
- probe-failed
- node-not-ready
type: string
prompt:
description: Prompt specifies the prompt template to send to
Expand Down
32 changes: 27 additions & 5 deletions internal/event/watcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,14 +251,18 @@ func (w *Watcher) mapKubernetesEvent(k8sEvent *eventsv1.Event) *interfaces.Event

// mapEventType maps Kubernetes event reasons to our event types
func (w *Watcher) mapEventType(k8sEvent *eventsv1.Event) string {
// Ignore Normal events entirely; only act on warnings/errors
if strings.ToLower(k8sEvent.Type) == "normal" {
return ""
}
// Map based on the regarding object kind and event reason
// Map based on the regarding object kind and event reason first
switch k8sEvent.Regarding.Kind {
case "Pod":
// For pods, ignore Normal events entirely; only act on warnings/errors
if strings.ToLower(k8sEvent.Type) == "normal" {
return ""
}
return w.mapPodEventType(k8sEvent)
case "Node":
// For nodes, we need to check both Normal and Warning events
// as NodeNotReady events are typically Normal type
return w.mapNodeEventType(k8sEvent)
default:
return ""
}
Expand Down Expand Up @@ -317,3 +321,21 @@ func (w *Watcher) mapPodEventType(k8sEvent *eventsv1.Event) string {

return ""
}

// mapNodeEventType translates an event regarding a Node into one of our event
// types. The event reason is lower-cased before comparison, so the match is
// case-insensitive. It returns "node-not-ready" when the reason is
// "NodeNotReady" and an empty string for every other reason.
func (w *Watcher) mapNodeEventType(k8sEvent *eventsv1.Event) string {
	reason := strings.ToLower(k8sEvent.Reason)

	// Node not ready events
	if reason == "nodenotready" {
		return "node-not-ready"
	}

	// Anything else is unmapped: record it at verbosity 1 so that unknown
	// node events can be reviewed for future enhancement.
	w.logger.V(1).Info(
		"Unknown node event",
		"reason", reason,
		"type", strings.ToLower(k8sEvent.Type),
		"message", strings.ToLower(k8sEvent.Note),
	)
	return ""
}
Loading