Skip to content

Commit 48bedf7

Browse files
authored
Merge branch 'main' into init-tweaks
2 parents 6979011 + 28523cc commit 48bedf7

File tree

92 files changed

+8831
-778
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

92 files changed

+8831
-778
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,3 +134,8 @@ artifacts/
134134
ui/node_modules/
135135
ui/ui-kit/node_modules/
136136
ui/test-results/
137+
138+
# SSL certificates (generated locally)
139+
monitoring/nginx/certs/
140+
config.yaml
141+
eval_reports

config.yaml.example

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# Redis SRE Agent Configuration
2+
# Copy this file to config.yaml and customize for your environment.
3+
#
4+
# Settings are loaded from the following sources, in priority order:
5+
# 1. Environment variables (highest priority)
6+
# 2. .env file
7+
# 3. config.yaml (this file)
8+
# 4. Default values (lowest priority)
9+
#
10+
# Set SRE_AGENT_CONFIG environment variable to use a custom path.
11+
12+
# Application settings
13+
# debug: false
14+
# log_level: INFO
15+
16+
# Server settings
17+
# host: "0.0.0.0"
18+
# port: 8000
19+
20+
# MCP (Model Context Protocol) servers configuration
21+
# This is the primary use case for YAML config - complex nested structures
22+
mcp_servers:
23+
# Memory server for long-term agent memory
24+
redis-memory-server:
25+
command: uv
26+
args:
27+
- tool
28+
- run
29+
- --from
30+
- agent-memory-server
31+
- agent-memory
32+
- mcp
33+
env:
34+
REDIS_URL: redis://localhost:6399
35+
tools:
36+
get_current_datetime:
37+
description: |
38+
Get the current date and time. Use this when you need to
39+
record timestamps for Redis instance events or incidents.
40+
41+
{original}
42+
create_long_term_memories:
43+
description: |
44+
Save long-term memories about Redis instances. Use this to
45+
record: past incidents and their resolutions, configuration
46+
changes, performance baselines, known issues, maintenance
47+
history, and lessons learned. Always include the instance_id
48+
in the memory text for future retrieval.
49+
50+
{original}
51+
search_long_term_memory:
52+
description: |
53+
Search saved memories about Redis instances. ALWAYS use this
54+
before troubleshooting a Redis instance to recall past issues,
55+
solutions, and context. Search by instance_id, error patterns,
56+
or symptoms.
57+
58+
{original}
59+
get_long_term_memory:
60+
description: |
61+
Retrieve a specific memory by ID. Use this to get full details
62+
of a memory found via search.
63+
64+
{original}
65+
edit_long_term_memory:
66+
description: |
67+
Update an existing memory. Use this to add new information to
68+
a past incident record, update resolution status, or correct
69+
outdated information.
70+
71+
{original}
72+
delete_long_term_memories:
73+
description: |
74+
Delete memories that are no longer relevant. Use sparingly -
75+
prefer editing to add context rather than deleting.
76+
77+
{original}
78+
79+
# GitHub MCP server for repository operations
80+
# Option 1: Local Docker (requires Docker to be running)
81+
github:
82+
command: docker
83+
args:
84+
- run
85+
- -i
86+
- --rm
87+
- -e
88+
- GITHUB_PERSONAL_ACCESS_TOKEN
89+
- ghcr.io/github/github-mcp-server
90+
env:
91+
# Set your GitHub Personal Access Token here or via environment variable
92+
GITHUB_PERSONAL_ACCESS_TOKEN: ${GITHUB_PERSONAL_ACCESS_TOKEN}
93+
94+
# Option 2: Remote GitHub MCP server (recommended, no Docker needed)
95+
# Uncomment the following and comment out the local Docker option above:
96+
# github:
97+
# url: "https://api.githubcopilot.com/mcp/"
98+
# headers:
99+
# Authorization: "Bearer ${GITHUB_PERSONAL_ACCESS_TOKEN}"
100+
# # transport: streamable_http # default, uses Streamable HTTP protocol
101+
102+
# Tool providers configuration (fully qualified class paths)
103+
# tool_providers:
104+
# - redis_sre_agent.tools.metrics.prometheus.provider.PrometheusToolProvider
105+
# - redis_sre_agent.tools.diagnostics.redis_command.provider.RedisCommandToolProvider
106+
# - redis_sre_agent.tools.logs.loki.provider.LokiToolProvider
107+
# - redis_sre_agent.tools.host_telemetry.provider.HostTelemetryToolProvider

docker-compose.test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ services:
8787
context: .
8888
dockerfile: Dockerfile
8989
ports:
90-
- "8000:8000"
90+
- "8080:8000"
9191
environment:
9292
- REDIS_URL=redis://redis-demo:6379/0 # Internal container port stays 6379
9393
- PROMETHEUS_URL=http://prometheus:9090

docker-compose.yml

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ services:
99
- ./monitoring/redis.conf:/usr/local/etc/redis/redis.conf
1010
command: redis-server /usr/local/etc/redis/redis.conf
1111
healthcheck:
12-
test: ["CMD", "redis-cli", "ping"]
13-
interval: 10s
12+
# Wait for Redis to finish loading before marking healthy
13+
test: ["CMD-SHELL", "redis-cli ping | grep -q PONG && redis-cli INFO persistence | grep -q 'loading:0'"]
14+
interval: 5s
1415
timeout: 5s
15-
retries: 3
16+
retries: 10
17+
start_period: 10s
1618
networks:
1719
- sre-network
1820

@@ -273,7 +275,7 @@ services:
273275
context: .
274276
dockerfile: Dockerfile
275277
ports:
276-
- "8000:8000"
278+
- "8080:8000"
277279
environment:
278280
- REDIS_URL=redis://redis:6379/0 # Internal container port stays 6379
279281
- TOOLS_PROMETHEUS_URL=http://prometheus:9090
@@ -327,6 +329,71 @@ services:
327329
networks:
328330
- sre-network
329331

332+
# GitHub MCP Server - Exposes GitHub tools via MCP
333+
# This runs the GitHub MCP server behind an SSE/HTTP proxy so the sre-worker
334+
# can connect to it without needing Docker-in-Docker permissions.
335+
github-mcp:
336+
image: ghcr.io/sparfenyuk/mcp-proxy:latest
337+
ports:
338+
- "8082:8082"
339+
environment:
340+
- GITHUB_PERSONAL_ACCESS_TOKEN=${GITHUB_PERSONAL_ACCESS_TOKEN}
341+
command: >
342+
--pass-environment
343+
--port=8082
344+
--host=0.0.0.0
345+
docker run -i --rm -e GITHUB_PERSONAL_ACCESS_TOKEN ghcr.io/github/github-mcp-server
346+
volumes:
347+
- /var/run/docker.sock:/var/run/docker.sock
348+
networks:
349+
- sre-network
350+
profiles:
351+
- mcp # Start with: docker compose --profile mcp up
352+
353+
# SRE Agent MCP Server - Exposes agent capabilities via Model Context Protocol
354+
# Connect Claude to this via: Settings > Connectors > Add Custom Connector
355+
# HTTP: http://localhost:8081/mcp
356+
# HTTPS: https://localhost:8450/mcp (requires running scripts/generate-mcp-certs.sh first)
357+
sre-mcp:
358+
build:
359+
context: .
360+
dockerfile: Dockerfile
361+
ports:
362+
- "8081:8081"
363+
environment:
364+
- REDIS_URL=redis://redis:6379/0
365+
- REDIS_SRE_MASTER_KEY=${REDIS_SRE_MASTER_KEY}
366+
- TOOLS_PROMETHEUS_URL=http://prometheus:9090
367+
- TOOLS_LOKI_URL=http://loki:3100
368+
depends_on:
369+
redis:
370+
condition: service_healthy
371+
volumes:
372+
- .env:/app/.env
373+
- ./redis_sre_agent:/app/redis_sre_agent
374+
command: uv run redis-sre-agent mcp serve --transport http --host 0.0.0.0 --port 8081
375+
networks:
376+
- sre-network
377+
profiles:
378+
- mcp # Start with: docker compose --profile mcp up
379+
- ssl # Or with SSL: docker compose --profile ssl up
380+
381+
# MCP SSL Proxy - HTTPS termination for MCP server
382+
# Run scripts/generate-mcp-certs.sh first to generate self-signed certs
383+
sre-mcp-ssl:
384+
image: nginx:alpine
385+
ports:
386+
- "8450:443"
387+
volumes:
388+
- ./monitoring/nginx/mcp-ssl.conf:/etc/nginx/conf.d/default.conf:ro
389+
- ./monitoring/nginx/certs:/etc/nginx/certs:ro
390+
depends_on:
391+
- sre-mcp
392+
networks:
393+
- sre-network
394+
profiles:
395+
- ssl # Only start with: docker compose --profile ssl up
396+
330397
# SRE Agent UI
331398
sre-ui:
332399
build:

docs/concepts/core.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ This section explains the core ideas behind Redis SRE Agent and how pieces fit t
2727
When you create a task, the API creates or reuses a thread to store the execution history. You can:
2828
- Poll the task for status: `GET /api/v1/tasks/{task_id}`
2929
- Read the thread for results: `GET /api/v1/threads/{thread_id}`
30-
- Stream updates via WebSocket: `ws://localhost:8000/api/v1/ws/tasks/{thread_id}`
30+
- Stream updates via WebSocket: `ws://localhost:8080/api/v1/ws/tasks/{thread_id}` (Docker Compose; use port 8000 when running locally)
3131

3232
- **Jobs**
3333
- Ad-hoc jobs: On-demand via CLI or API. Each run creates a task and streams results to a thread.

docs/how-to/api.md

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ This guide shows how to use the HTTP API end-to-end: check health, add an instan
66
- Services running (Docker Compose or local uvicorn + worker)
77
- If you enabled auth in your environment, include your API key header as needed
88

9+
**Port Note**: Docker Compose exposes the API on port **8080**, while local uvicorn uses port **8000**. The examples below use port 8080 (Docker Compose); replace 8080 with 8000 if you are running locally.
10+
911
### 1) Start services (choose one)
1012
- Docker Compose
1113
```bash
@@ -26,20 +28,21 @@ uv run redis-sre-agent worker --concurrency 4
2628
### 2) Health and readiness
2729
```bash
2830
# Root health (fast)
29-
curl -fsS http://localhost:8000/
31+
# Use port 8080 for Docker Compose, port 8000 for local uvicorn
32+
curl -fsS http://localhost:8080/
3033

3134
# Detailed health (Redis, vector index, workers)
32-
curl -fsS http://localhost:8000/api/v1/health | jq
35+
curl -fsS http://localhost:8080/api/v1/health | jq
3336

3437
# Prometheus metrics (scrape this)
35-
curl -fsS http://localhost:8000/api/v1/metrics | head -n 20
38+
curl -fsS http://localhost:8080/api/v1/metrics | head -n 20
3639
```
3740

3841
### 3) Manage Redis instances
3942
Create the instance the agent will triage, then verify a connection.
4043
```bash
4144
# Create instance
42-
curl -fsS -X POST http://localhost:8000/api/v1/instances \
45+
curl -fsS -X POST http://localhost:8080/api/v1/instances \
4346
-H 'Content-Type: application/json' \
4447
-d '{
4548
"name": "prod-cache",
@@ -50,14 +53,14 @@ curl -fsS -X POST http://localhost:8000/api/v1/instances \
5053
}' | jq
5154

5255
# List & inspect
53-
curl -fsS http://localhost:8000/api/v1/instances | jq
54-
curl -fsS http://localhost:8000/api/v1/instances/<id> | jq
56+
curl -fsS http://localhost:8080/api/v1/instances | jq
57+
curl -fsS http://localhost:8080/api/v1/instances/<id> | jq
5558

5659
# Test connection (by ID)
57-
curl -fsS -X POST http://localhost:8000/api/v1/instances/<id>/test-connection | jq
60+
curl -fsS -X POST http://localhost:8080/api/v1/instances/<id>/test-connection | jq
5861

5962
# Test a raw URL (without saving)
60-
curl -fsS -X POST http://localhost:8000/api/v1/instances/test-connection-url \
63+
curl -fsS -X POST http://localhost:8080/api/v1/instances/test-connection-url \
6164
-H 'Content-Type: application/json' \
6265
-d '{"connection_url": "redis://host:6379/0"}' | jq
6366
```
@@ -69,14 +72,16 @@ curl -fsS -X POST http://localhost:8000/api/v1/instances/test-connection-url \
6972

7073
### 4) Triage with tasks and threads
7174
Simplest: create a task with your question. The API will create a thread if you omit `thread_id`.
75+
76+
> **Note**: Triage performs comprehensive analysis (metrics, logs, knowledge base, multi-topic recommendations) and typically takes **2-10 minutes** to complete. Poll the task status or connect to the WebSocket endpoint for real-time updates.
7277
```bash
7378
# Create a task (no instance)
74-
curl -fsS -X POST http://localhost:8000/api/v1/tasks \
79+
curl -fsS -X POST http://localhost:8080/api/v1/tasks \
7580
-H 'Content-Type: application/json' \
7681
-d '{"message": "Explain high memory usage signals in Redis"}' | jq
7782

7883
# Create a task (target a specific instance)
79-
curl -fsS -X POST http://localhost:8000/api/v1/tasks \
84+
curl -fsS -X POST http://localhost:8080/api/v1/tasks \
8085
-H 'Content-Type: application/json' \
8186
-d '{
8287
"message": "Check memory pressure and slow ops",
@@ -86,15 +91,15 @@ curl -fsS -X POST http://localhost:8000/api/v1/tasks \
8691
Poll task or inspect the thread:
8792
```bash
8893
# Poll task status
89-
curl -fsS http://localhost:8000/api/v1/tasks/<task_id> | jq
94+
curl -fsS http://localhost:8080/api/v1/tasks/<task_id> | jq
9095

9196
# Get the thread state (messages, updates, result)
92-
curl -fsS http://localhost:8000/api/v1/threads/<thread_id> | jq
97+
curl -fsS http://localhost:8080/api/v1/threads/<thread_id> | jq
9398
```
9499
Real-time updates via WebSocket:
95100
```bash
96101
# Requires a thread_id; use any ws client (wscat, websocat)
97-
wscat -c ws://localhost:8000/api/v1/ws/tasks/<thread_id>
102+
wscat -c ws://localhost:8080/api/v1/ws/tasks/<thread_id>
98103
# You will receive an initial_state event and subsequent progress updates
99104
```
100105

@@ -103,12 +108,12 @@ wscat -c ws://localhost:8000/api/v1/ws/tasks/<thread_id>
103108
Alternative flow: create a thread first, then submit a task on that thread.
104109
```bash
105110
# Create thread
106-
curl -fsS -X POST http://localhost:8000/api/v1/threads \
111+
curl -fsS -X POST http://localhost:8080/api/v1/threads \
107112
-H 'Content-Type: application/json' \
108113
-d '{"user_id": "u1", "subject": "Prod triage"}' | jq
109114

110115
# Submit a task to that thread
111-
curl -fsS -X POST http://localhost:8000/api/v1/tasks \
116+
curl -fsS -X POST http://localhost:8080/api/v1/tasks \
112117
-H 'Content-Type: application/json' \
113118
-d '{
114119
"thread_id": "<thread_id>",
@@ -121,20 +126,20 @@ curl -fsS -X POST http://localhost:8000/api/v1/tasks \
121126
Run an ingestion job, then search to confirm content is available.
122127
```bash
123128
# Start pipeline job (ingest existing artifacts or run full if configured)
124-
curl -fsS -X POST http://localhost:8000/api/v1/knowledge/ingest/pipeline \
129+
curl -fsS -X POST http://localhost:8080/api/v1/knowledge/ingest/pipeline \
125130
-H 'Content-Type: application/json' \
126131
-d '{"operation": "ingest", "artifacts_path": "./artifacts"}' | jq
127132

128133
# List jobs & check individual job status
129-
curl -fsS http://localhost:8000/api/v1/knowledge/jobs | jq
130-
curl -fsS http://localhost:8000/api/v1/knowledge/jobs/<job_id> | jq
134+
curl -fsS http://localhost:8080/api/v1/knowledge/jobs | jq
135+
curl -fsS http://localhost:8080/api/v1/knowledge/jobs/<job_id> | jq
131136

132137
# Search knowledge
133-
curl -fsS 'http://localhost:8000/api/v1/knowledge/search?query=redis%20eviction%20policy' | jq
138+
curl -fsS 'http://localhost:8080/api/v1/knowledge/search?query=redis%20eviction%20policy' | jq
134139
```
135140
Optional single-document ingestion:
136141
```bash
137-
curl -fsS -X POST http://localhost:8000/api/v1/knowledge/ingest/document \
142+
curl -fsS -X POST http://localhost:8080/api/v1/knowledge/ingest/document \
138143
-H 'Content-Type: application/json' \
139144
-d '{
140145
"title": "Redis memory troubleshooting",
@@ -148,7 +153,7 @@ curl -fsS -X POST http://localhost:8000/api/v1/knowledge/ingest/document \
148153
Create a schedule to run instructions periodically, optionally bound to an instance.
149154
```bash
150155
# Create schedule (daily)
151-
curl -fsS -X POST http://localhost:8000/api/v1/schedules/ \
156+
curl -fsS -X POST http://localhost:8080/api/v1/schedules/ \
152157
-H 'Content-Type: application/json' \
153158
-d '{
154159
"name": "daily-triage",
@@ -161,20 +166,20 @@ curl -fsS -X POST http://localhost:8000/api/v1/schedules/ \
161166
}' | jq
162167

163168
# List/get
164-
curl -fsS http://localhost:8000/api/v1/schedules/ | jq
165-
curl -fsS http://localhost:8000/api/v1/schedules/<schedule_id> | jq
169+
curl -fsS http://localhost:8080/api/v1/schedules/ | jq
170+
curl -fsS http://localhost:8080/api/v1/schedules/<schedule_id> | jq
166171

167172
# Trigger now (manual run)
168-
curl -fsS -X POST http://localhost:8000/api/v1/schedules/<schedule_id>/trigger | jq
173+
curl -fsS -X POST http://localhost:8080/api/v1/schedules/<schedule_id>/trigger | jq
169174

170175
# View runs for a schedule
171-
curl -fsS http://localhost:8000/api/v1/schedules/<schedule_id>/runs | jq
176+
curl -fsS http://localhost:8080/api/v1/schedules/<schedule_id>/runs | jq
172177
```
173178

174179
### 7) Tasks, threads, and streaming
175180
- Tasks: `GET /api/v1/tasks/{task_id}`
176181
- Threads: `GET /api/v1/threads`, `GET /api/v1/threads/{thread_id}`
177-
- WebSocket: `ws://localhost:8000/api/v1/ws/tasks/{thread_id}`
182+
- WebSocket: `ws://localhost:8080/api/v1/ws/tasks/{thread_id}`
178183

179184
### 8) Observability
180185
- Prometheus scrape: `GET /api/v1/metrics`

0 commit comments

Comments
 (0)