redis-applied-ai · abrookins · Dec 16, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 17, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -51,21 +51,16 @@ FROM python:3.12-slim
 # commands that use `uv run` work correctly.
 COPY --from=builder /bin/uv /bin/uv
 
-# Copy the uv-managed Python installation used by the virtualenv in /app/.venv.
-# Without this, /app/.venv/bin/python points at a non-existent interpreter under
-# /root/.local/share/uv, which causes `uv run` and even direct venv usage to
-# fail with "permission denied" when running as the non-root `app` user.
-COPY --from=builder /root/.local/share/uv /root/.local/share/uv
-
-# Make the uv Python tree readable and traversable by the unprivileged `app`
-# user so that symlinks in /app/.venv/bin/python* can be resolved.
-RUN chmod 755 /root && chmod -R 755 /root/.local/share/uv || true
+# Note: UV_LINK_MODE=copy (builder stage) makes uv copy packages into /app/.venv
+# rather than link them from its cache. NOTE(review): confirm .venv/bin/python
+# resolves to this image's own Python, not one under /root/.local/share/uv.
 
 WORKDIR /app
 
 # Install ONLY runtime system dependencies
 # We repeat the Docker/Redis install here because they are needed at runtime.
 RUN apt-get update && apt-get install -y \
+    git \
     curl \
     ca-certificates \
     redis-tools \

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -293,6 +293,7 @@ services:
         condition: service_started
     volumes:
       - .env:/app/.env
+      - ./config.yaml:/app/config.yaml  # Mount config so edits apply on restart without rebuilding the image
       - ./artifacts:/app/artifacts  # For data pipeline
       - ./redis_sre_agent:/app/redis_sre_agent  # Mount source code for development
       - ./tests:/app/tests  # Mount tests for development
@@ -318,6 +319,7 @@ services:
         condition: service_healthy
     volumes:
       - .env:/app/.env
+      - ./config.yaml:/app/config.yaml  # Mount config so edits apply on restart without rebuilding the image
       - ./artifacts:/app/artifacts
       - /var/run/docker.sock:/var/run/docker.sock  # Mount Docker socket for log access
       - ./redis_sre_agent:/app/redis_sre_agent  # Mount source code for development

diff --git a/docs/operations/observability.md b/docs/operations/observability.md
@@ -77,14 +77,66 @@ Both the API and worker will automatically instrument and export spans when this
 
 ### What gets traced
 - **FastAPI requests** (excluding health/metrics endpoints)
-- **Redis operations** (via RedisInstrumentor)
+- **Redis operations** (via RedisInstrumentor with custom hooks)
 - **HTTP clients** (HTTPX, AioHTTP)
 - **OpenAI API calls** (via OpenAIInstrumentor)
 - **LangGraph nodes**: Each node in the agent workflow gets a custom span with attributes:
   - `langgraph.graph` - which graph (e.g., `sre_agent`, `knowledge`, `runbook`)
   - `langgraph.node` - which node (e.g., `agent`, `tools`, `reasoning`)
 - **LLM calls**: Token usage and latency are added as span attributes
 
+### Span Categories for Filtering
+
+All spans include a `sre_agent.category` attribute to help you filter out noise (especially Redis commands) and focus on application logic. Available categories:
+
+| Category | Description |
+|----------|-------------|
+| `llm` | LLM API calls |
+| `tool` | Tool invocations |
+| `graph_node` | LangGraph node execution |
+| `agent` | High-level agent operations |
+| `knowledge` | Knowledge base operations |
+| `redis` | Redis commands (filter these out to reduce noise) |
+
+Redis spans also include:
+- `redis.command` - the Redis command (GET, SET, HSET, etc.)
+- `redis.is_infrastructure` - `true` for internal ops (PING, INFO, etc.)
+- `redis.key_prefix` - the key prefix (before first `:`) for grouping
+
+### TraceQL Queries for Grafana/Tempo
+
+Use these queries in Grafana's Tempo Explore view to filter traces:
+
+```traceql
+# Hide all Redis spans - see only app logic
+{ span.sre_agent.category != "redis" }
+
+# Show only LLM calls
+{ span.sre_agent.category = "llm" }
+
+# Slow LLM calls (> 5 seconds)
+{ span.sre_agent.category = "llm" && duration > 5s }
+
+# Show only tool invocations
+{ span.sre_agent.category = "tool" }
+
+# Filter by LangGraph graph name
+{ span.langgraph.graph = "sre_agent" }
+
+# Show app-level Redis ops only (hide PING, INFO, etc.)
+{ span.sre_agent.category = "redis" && span.redis.is_infrastructure = false }
+```
+
+### SRE Agent Traces Dashboard
+
+A pre-built Grafana dashboard is available at `monitoring/grafana/provisioning/dashboards/json/agent-traces.json` that provides:
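+- TraceQL filtering guide and a recent-traces list for the API and worker services
+- LLM call duration (p95) by component
+- Token usage rates by component
+- LLM request and token totals (last hour), with tokens broken down by model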
+
+Access it in Grafana under Dashboards → SRE Agent Traces.
+
 ### Example: Tempo (local dev)
 The docker-compose stack includes Tempo as an OTLP collector:
 ```yaml

diff --git a/monitoring/grafana/provisioning/dashboards/json/agent-traces.json b/monitoring/grafana/provisioning/dashboards/json/agent-traces.json
@@ -0,0 +1,133 @@
+{
+  "id": null,
+  "uid": "agent-traces",
+  "title": "SRE Agent Traces",
+  "description": "OpenTelemetry traces for the Redis SRE Agent - filter out Redis noise to see LLM calls, tool invocations, and graph node execution",
+  "tags": ["traces", "tempo", "agent", "llm"],
+  "timezone": "",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "service",
+        "label": "Service",
+        "type": "custom",
+        "options": [
+          { "text": "All", "value": "", "selected": true },
+          { "text": "API", "value": "redis-sre-agent" },
+          { "text": "Worker", "value": "redis-sre-worker" }
+        ],
+        "current": { "text": "All", "value": "" }
+      },
+      {
+        "name": "category",
+        "label": "Span Category",
+        "type": "custom",
+        "options": [
+          { "text": "All (no Redis)", "value": "exclude_redis", "selected": true },
+          { "text": "LLM Calls", "value": "llm" },
+          { "text": "Tool Calls", "value": "tool" },
+          { "text": "Graph Nodes", "value": "graph_node" },
+          { "text": "Agent", "value": "agent" },
+          { "text": "Knowledge", "value": "knowledge" },
+          { "text": "All (including Redis)", "value": "all" }
+        ],
+        "current": { "text": "All (no Redis)", "value": "exclude_redis" }
+      }
+    ]
+  },
+  "panels": [
+    {
+      "type": "text",
+      "title": "Trace Filtering Guide",
+      "gridPos": { "x": 0, "y": 0, "w": 24, "h": 3 },
+      "options": {
+        "mode": "markdown",
+        "content": "## TraceQL Queries for Filtering\n\n| Query | Description |\n|-------|-------------|\n| `{ span.sre_agent.category != \"redis\" }` | Hide all Redis spans |\n| `{ span.sre_agent.category = \"llm\" }` | Show only LLM calls |\n| `{ span.sre_agent.category = \"tool\" }` | Show only tool invocations |\n| `{ span.langgraph.graph = \"sre_agent\" }` | Filter by specific graph |\n| `{ duration > 5s && span.sre_agent.category = \"llm\" }` | Slow LLM calls |\n| `{ span.redis.is_infrastructure = false }` | App-level Redis ops only |"
+      }
+    },
+    {
+      "type": "traces",
+      "title": "Recent Agent Traces",
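+      "description": "Recent traces from the redis-sre-* services. The Service and Span Category variables are not yet applied to this query; use Tempo Explore for category filtering and advanced queries.",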
+      "gridPos": { "x": 0, "y": 3, "w": 24, "h": 12 },
+      "datasource": { "type": "tempo", "uid": "tempo" },
+      "targets": [
+        {
+          "refId": "A",
+          "queryType": "traceqlSearch",
+          "limit": 50,
+          "tableType": "traces",
+          "filters": [
+            { "id": "service-name", "tag": "service.name", "operator": "=~", "value": ["redis-sre-.*"], "scope": "resource" }
+          ]
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "LLM Call Duration (p95)",
+      "gridPos": { "x": 0, "y": 15, "w": 8, "h": 8 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(sre_agent_llm_duration_seconds_bucket[5m])) by (le, component))",
+          "legendFormat": "{{component}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "type": "timeseries",
+      "title": "LLM Token Usage (rate/min)",
+      "gridPos": { "x": 8, "y": 15, "w": 8, "h": 8 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(sre_agent_llm_tokens_total[1m])) by (component) * 60",
+          "legendFormat": "{{component}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "tokens/min" } }
+    },
+    {
+      "type": "stat",
+      "title": "LLM Requests (last hour)",
+      "gridPos": { "x": 16, "y": 15, "w": 4, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_requests_total[1h]))"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "requests" } }
+    },
+    {
+      "type": "stat",
+      "title": "Total Tokens (last hour)",
+      "gridPos": { "x": 20, "y": 15, "w": 4, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_tokens_total[1h]))"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "tokens" } }
+    },
+    {
+      "type": "piechart",
+      "title": "Tokens by Model",
+      "gridPos": { "x": 16, "y": 19, "w": 8, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_tokens_total[1h])) by (model)",
+          "legendFormat": "{{model}}"
+        }
+      ]
+    }
+  ]
+}
diff --git a/redis_sre_agent/api/app.py b/redis_sre_agent/api/app.py
@@ -1,22 +1,11 @@
 """Main FastAPI application for Redis SRE Agent."""
 
 import logging
-import os
 from contextlib import asynccontextmanager
 
 from fastapi import FastAPI
 from fastapi.responses import PlainTextResponse
-from opentelemetry import trace
-from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
-from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.openai import OpenAIInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
 
 from redis_sre_agent.api.health import router as health_router
 from redis_sre_agent.api.instances import router as instances_router
@@ -29,6 +18,7 @@
 from redis_sre_agent.api.websockets import router as websockets_router
 from redis_sre_agent.core.config import settings
 from redis_sre_agent.core.redis import initialize_redis
+from redis_sre_agent.observability.tracing import setup_tracing as setup_base_tracing
 
 # Configure logging with consistent format
 # Note: When running via uvicorn with --log-config, this is overridden by logging_config.yaml
@@ -49,34 +39,12 @@
 def setup_tracing(app: FastAPI) -> None:
     """Initialize OpenTelemetry tracing if an OTLP endpoint is configured.
 
-    If OTEL_EXPORTER_OTLP_ENDPOINT is not set, tracing is disabled and this
-    function logs an info message and returns.
+    Uses the centralized tracing module for consistent span attributes
+    and Redis filtering hooks.
     """
-    otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
-    if not otlp_endpoint:
-        logger.info("OpenTelemetry tracing disabled (no OTEL_EXPORTER_OTLP_ENDPOINT)")
-        return
-
-    resource = Resource.create(
-        {
-            "service.name": settings.app_name,
-            "service.version": "0.1.0",
-        }
-    )
-    provider = TracerProvider(resource=resource)
-    exporter = OTLPSpanExporter(
-        endpoint=otlp_endpoint,
-        headers=os.environ.get("OTEL_EXPORTER_OTLP_HEADERS"),
-    )
-    provider.add_span_processor(BatchSpanProcessor(exporter))
-    trace.set_tracer_provider(provider)
-
-    # Instrument libraries
-    RedisInstrumentor().instrument()
-    HTTPXClientInstrumentor().instrument()
-    AioHttpClientInstrumentor().instrument()
-    AsyncioInstrumentor().instrument()
-    OpenAIInstrumentor().instrument()
+    # Setup base tracing (Redis with hooks, HTTP clients, OpenAI)
+    if not setup_base_tracing(settings.app_name, "0.1.0"):
+        return  # Tracing not enabled
 
     # Instrument FastAPI (exclude common health/docs paths)
     excluded = ",".join(

diff --git a/redis_sre_agent/api/threads.py b/redis_sre_agent/api/threads.py
@@ -40,25 +40,32 @@ async def list_threads(
         tm = ThreadManager(redis_client=rc)
         summaries = await tm.list_threads(user_id=user_id, limit=limit, offset=offset)
 
-        # Enrich with message_count for UI display. Be defensive about failures.
+        # Enrich with message_count and latest_message for UI display.
         enriched: List[Dict[str, Any]] = []
         for s in summaries or []:
             s_out = dict(s)
-            if "message_count" not in s_out:
-                try:
-                    state = await tm.get_thread(s_out.get("thread_id"))
-                    msgs = []
-                    if state is not None:
-                        ctx = state.context or {}
-                        msgs = ctx.get("messages", []) or []
-                    # Only count user/assistant messages (exclude tools/system)
-                    s_out["message_count"] = sum(
-                        1
-                        for m in msgs
-                        if isinstance(m, dict) and m.get("role") in ("user", "assistant")
-                    )
-                except Exception:
-                    # If we cannot fetch the state, default to 0 rather than failing the list
+            try:
+                state = await tm.get_thread(s_out.get("thread_id"))
+                if state is not None:
+                    # Get messages from the Thread.messages list (primary storage)
+                    msgs = state.messages or []
+                    # Count user/assistant messages
+                    user_assistant_msgs = [m for m in msgs if m.role in ("user", "assistant")]
+                    s_out["message_count"] = len(user_assistant_msgs)
+
+                    # Get latest message content from the last assistant or user message
+                    if user_assistant_msgs:
+                        last_msg = user_assistant_msgs[-1]
+                        content = last_msg.content or ""
+                        # Truncate for preview
+                        s_out["latest_message"] = (
+                            content[:100] + "..." if len(content) > 100 else content
+                        )
+                else:
+                    s_out["message_count"] = 0
+            except Exception:
+                # If we cannot fetch the state, default to 0 rather than failing the list
+                if "message_count" not in s_out:
                     s_out["message_count"] = 0
             enriched.append(s_out)
         return enriched