From bfa068727ce46f81149dad766be6688628b7dc19 Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Mon, 15 Dec 2025 16:53:35 -0800
Subject: [PATCH 1/3] Add span categories and filtering for OpenTelemetry
 traces

- Create centralized tracing module with span categories (llm, tool, graph_node, agent, knowledge, redis)
- Add Redis instrumentation hooks to tag spans with command type and infrastructure flag
- Add Grafana dashboard for agent traces with TraceQL filter examples
- Update observability docs with TraceQL queries to filter out Redis noise
- Add comprehensive unit tests for tracing module
---
 docs/operations/observability.md              |  54 ++-
 .../dashboards/json/agent-traces.json         | 133 ++++++
 redis_sre_agent/api/app.py                    |  43 +-
 redis_sre_agent/api/threads.py                |  41 +-
 redis_sre_agent/cli/worker.py                 |  41 +-
 redis_sre_agent/observability/tracing.py      | 229 +++++++++++
 .../api/test_threads_list_message_count.py    |  10 +-
 tests/unit/observability/__init__.py          |   1 +
 tests/unit/observability/test_tracing.py      | 384 ++++++++++++++++++
 9 files changed, 843 insertions(+), 93 deletions(-)
 create mode 100644 monitoring/grafana/provisioning/dashboards/json/agent-traces.json
 create mode 100644 redis_sre_agent/observability/tracing.py
 create mode 100644 tests/unit/observability/__init__.py
 create mode 100644 tests/unit/observability/test_tracing.py

diff --git a/docs/operations/observability.md b/docs/operations/observability.md
index b6c14881..66fef356 100644
--- a/docs/operations/observability.md
+++ b/docs/operations/observability.md
@@ -77,7 +77,7 @@ Both the API and worker will automatically instrument and export spans when this
 
 ### What gets traced
 - **FastAPI requests** (excluding health/metrics endpoints)
-- **Redis operations** (via RedisInstrumentor)
+- **Redis operations** (via RedisInstrumentor with custom hooks)
 - **HTTP clients** (HTTPX, AioHTTP)
 - **OpenAI API calls** (via OpenAIInstrumentor)
 - **LangGraph nodes**: Each node in the agent workflow gets a custom span with attributes:
@@ -85,6 +85,58 @@ Both the API and worker will automatically instrument and export spans when this
   - `langgraph.node` - which node (e.g., `agent`, `tools`, `reasoning`)
 - **LLM calls**: Token usage and latency are added as span attributes
 
+### Span Categories for Filtering
+
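+Redis spans and spans created by the agent's tracing helpers carry a `sre_agent.category` attribute, so you can filter out noise (especially Redis commands) and focus on application logic. Spans from other auto-instrumentation (FastAPI, HTTPX, OpenAI) may not have it. Available categories: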
+
+| Category | Description |
+|----------|-------------|
+| `llm` | LLM API calls |
+| `tool` | Tool invocations |
+| `graph_node` | LangGraph node execution |
+| `agent` | High-level agent operations |
+| `knowledge` | Knowledge base operations |
+| `redis` | Redis commands (filter these out to reduce noise) |
+
+Redis spans also include:
+- `redis.command` - the Redis command (GET, SET, HSET, etc.)
+- `redis.is_infrastructure` - `true` for internal ops (PING, INFO, etc.)
+- `redis.key_prefix` - the key prefix (before first `:`) for grouping
+
+### TraceQL Queries for Grafana/Tempo
+
+Use these queries in Grafana's Tempo Explore view to filter traces:
+
+```traceql
+# Hide all Redis spans - see only app logic
+{ span.sre_agent.category != "redis" }
+
+# Show only LLM calls
+{ span.sre_agent.category = "llm" }
+
+# Slow LLM calls (> 5 seconds)
+{ span.sre_agent.category = "llm" && duration > 5s }
+
+# Show only tool invocations
+{ span.sre_agent.category = "tool" }
+
+# Filter by LangGraph graph name
+{ span.langgraph.graph = "sre_agent" }
+
+# Show app-level Redis ops only (hide PING, INFO, etc.)
+{ span.sre_agent.category = "redis" && span.redis.is_infrastructure = false }
+```
+
+### SRE Agent Traces Dashboard
+
+A pre-built Grafana dashboard is available at `monitoring/grafana/provisioning/dashboards/json/agent-traces.json` that provides:
+- A TraceQL filtering guide and a list of recent traces from the API and worker
+- LLM call duration (p95) by component
+- Token usage rates by component
+- LLM request and token totals for the last hour, plus a tokens-by-model breakdown
+
+Access it in Grafana under Dashboards → SRE Agent Traces.
+
 ### Example: Tempo (local dev)
 The docker-compose stack includes Tempo as an OTLP collector:
 ```yaml
diff --git a/monitoring/grafana/provisioning/dashboards/json/agent-traces.json b/monitoring/grafana/provisioning/dashboards/json/agent-traces.json
new file mode 100644
index 00000000..5b3eb392
--- /dev/null
+++ b/monitoring/grafana/provisioning/dashboards/json/agent-traces.json
@@ -0,0 +1,133 @@
+{
+  "id": null,
+  "uid": "agent-traces",
+  "title": "SRE Agent Traces",
+  "description": "OpenTelemetry traces for the Redis SRE Agent - filter out Redis noise to see LLM calls, tool invocations, and graph node execution",
+  "tags": ["traces", "tempo", "agent", "llm"],
+  "timezone": "",
+  "schemaVersion": 39,
+  "version": 1,
+  "refresh": "30s",
+  "time": { "from": "now-1h", "to": "now" },
+  "templating": {
+    "list": [
+      {
+        "name": "service",
+        "label": "Service",
+        "type": "custom",
+        "options": [
+          { "text": "All", "value": "", "selected": true },
+          { "text": "API", "value": "redis-sre-agent" },
+          { "text": "Worker", "value": "redis-sre-worker" }
+        ],
+        "current": { "text": "All", "value": "" }
+      },
+      {
+        "name": "category",
+        "label": "Span Category",
+        "type": "custom",
+        "options": [
+          { "text": "All (no Redis)", "value": "exclude_redis", "selected": true },
+          { "text": "LLM Calls", "value": "llm" },
+          { "text": "Tool Calls", "value": "tool" },
+          { "text": "Graph Nodes", "value": "graph_node" },
+          { "text": "Agent", "value": "agent" },
+          { "text": "Knowledge", "value": "knowledge" },
+          { "text": "All (including Redis)", "value": "all" }
+        ],
+        "current": { "text": "All (no Redis)", "value": "exclude_redis" }
+      }
+    ]
+  },
+  "panels": [
+    {
+      "type": "text",
+      "title": "Trace Filtering Guide",
+      "gridPos": { "x": 0, "y": 0, "w": 24, "h": 3 },
+      "options": {
+        "mode": "markdown",
+        "content": "## TraceQL Queries for Filtering\n\n| Query | Description |\n|-------|-------------|\n| `{ span.sre_agent.category != \"redis\" }` | Hide all Redis spans |\n| `{ span.sre_agent.category = \"llm\" }` | Show only LLM calls |\n| `{ span.sre_agent.category = \"tool\" }` | Show only tool invocations |\n| `{ span.langgraph.graph = \"sre_agent\" }` | Filter by specific graph |\n| `{ duration > 5s && span.sre_agent.category = \"llm\" }` | Slow LLM calls |\n| `{ span.redis.is_infrastructure = false }` | App-level Redis ops only |"
+      }
+    },
+    {
+      "type": "traces",
+      "title": "Recent Agent Traces",
+      "description": "Recent traces from the redis-sre-* services. Use Tempo Explore with the queries above to filter by span category.",
+      "gridPos": { "x": 0, "y": 3, "w": 24, "h": 12 },
+      "datasource": { "type": "tempo", "uid": "tempo" },
+      "targets": [
+        {
+          "refId": "A",
+          "queryType": "traceqlSearch",
+          "limit": 50,
+          "tableType": "traces",
+          "filters": [
+            { "id": "service-name", "tag": "service.name", "operator": "=~", "value": ["redis-sre-.*"], "scope": "resource" }
+          ]
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "LLM Call Duration (p95)",
+      "gridPos": { "x": 0, "y": 15, "w": 8, "h": 8 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum(rate(sre_agent_llm_duration_seconds_bucket[5m])) by (le, component))",
+          "legendFormat": "{{component}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "s" } }
+    },
+    {
+      "type": "timeseries",
+      "title": "LLM Token Usage (rate/min)",
+      "gridPos": { "x": 8, "y": 15, "w": 8, "h": 8 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(rate(sre_agent_llm_tokens_total[1m])) by (component) * 60",
+          "legendFormat": "{{component}}"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "tokens/min" } }
+    },
+    {
+      "type": "stat",
+      "title": "LLM Requests (last hour)",
+      "gridPos": { "x": 16, "y": 15, "w": 4, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_requests_total[1h]))"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "requests" } }
+    },
+    {
+      "type": "stat",
+      "title": "Total Tokens (last hour)",
+      "gridPos": { "x": 20, "y": 15, "w": 4, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_tokens_total[1h]))"
+        }
+      ],
+      "fieldConfig": { "defaults": { "unit": "tokens" } }
+    },
+    {
+      "type": "piechart",
+      "title": "Tokens by Model",
+      "gridPos": { "x": 16, "y": 19, "w": 8, "h": 4 },
+      "datasource": { "type": "prometheus", "uid": "prometheus" },
+      "targets": [
+        {
+          "expr": "sum(increase(sre_agent_llm_tokens_total[1h])) by (model)",
+          "legendFormat": "{{model}}"
+        }
+      ]
+    }
+  ]
+}
diff --git a/redis_sre_agent/api/app.py b/redis_sre_agent/api/app.py
index 39b69c34..f4544190 100644
--- a/redis_sre_agent/api/app.py
+++ b/redis_sre_agent/api/app.py
@@ -6,17 +6,7 @@
 
 from fastapi import FastAPI
 from fastapi.responses import PlainTextResponse
-from opentelemetry import trace
-from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
-from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.openai import OpenAIInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
 
 from redis_sre_agent.api.health import router as health_router
 from redis_sre_agent.api.instances import router as instances_router
@@ -29,6 +19,7 @@
 from redis_sre_agent.api.websockets import router as websockets_router
 from redis_sre_agent.core.config import settings
 from redis_sre_agent.core.redis import initialize_redis
+from redis_sre_agent.observability.tracing import setup_tracing as setup_base_tracing
 
 # Configure logging with consistent format
 # Note: When running via uvicorn with --log-config, this is overridden by logging_config.yaml
@@ -49,34 +40,12 @@
 def setup_tracing(app: FastAPI) -> None:
     """Initialize OpenTelemetry tracing if an OTLP endpoint is configured.
 
-    If OTEL_EXPORTER_OTLP_ENDPOINT is not set, tracing is disabled and this
-    function logs an info message and returns.
+    Uses the centralized tracing module for consistent span attributes
+    and Redis filtering hooks.
     """
-    otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
-    if not otlp_endpoint:
-        logger.info("OpenTelemetry tracing disabled (no OTEL_EXPORTER_OTLP_ENDPOINT)")
-        return
-
-    resource = Resource.create(
-        {
-            "service.name": settings.app_name,
-            "service.version": "0.1.0",
-        }
-    )
-    provider = TracerProvider(resource=resource)
-    exporter = OTLPSpanExporter(
-        endpoint=otlp_endpoint,
-        headers=os.environ.get("OTEL_EXPORTER_OTLP_HEADERS"),
-    )
-    provider.add_span_processor(BatchSpanProcessor(exporter))
-    trace.set_tracer_provider(provider)
-
-    # Instrument libraries
-    RedisInstrumentor().instrument()
-    HTTPXClientInstrumentor().instrument()
-    AioHttpClientInstrumentor().instrument()
-    AsyncioInstrumentor().instrument()
-    OpenAIInstrumentor().instrument()
+    # Setup base tracing (Redis with hooks, HTTP clients, OpenAI)
+    if not setup_base_tracing(settings.app_name, "0.1.0"):
+        return  # Tracing not enabled
 
     # Instrument FastAPI (exclude common health/docs paths)
     excluded = ",".join(
diff --git a/redis_sre_agent/api/threads.py b/redis_sre_agent/api/threads.py
index d56ed926..337db473 100644
--- a/redis_sre_agent/api/threads.py
+++ b/redis_sre_agent/api/threads.py
@@ -40,25 +40,34 @@ async def list_threads(
         tm = ThreadManager(redis_client=rc)
         summaries = await tm.list_threads(user_id=user_id, limit=limit, offset=offset)
 
-        # Enrich with message_count for UI display. Be defensive about failures.
+        # Enrich with message_count and latest_message for UI display.
         enriched: List[Dict[str, Any]] = []
         for s in summaries or []:
             s_out = dict(s)
-            if "message_count" not in s_out:
-                try:
-                    state = await tm.get_thread(s_out.get("thread_id"))
-                    msgs = []
-                    if state is not None:
-                        ctx = state.context or {}
-                        msgs = ctx.get("messages", []) or []
-                    # Only count user/assistant messages (exclude tools/system)
-                    s_out["message_count"] = sum(
-                        1
-                        for m in msgs
-                        if isinstance(m, dict) and m.get("role") in ("user", "assistant")
-                    )
-                except Exception:
-                    # If we cannot fetch the state, default to 0 rather than failing the list
+            try:
+                state = await tm.get_thread(s_out.get("thread_id"))
+                if state is not None:
+                    # Get messages from the Thread.messages list (primary storage)
+                    msgs = state.messages or []
+                    # Count user/assistant messages
+                    user_assistant_msgs = [
+                        m for m in msgs if m.role in ("user", "assistant")
+                    ]
+                    s_out["message_count"] = len(user_assistant_msgs)
+
+                    # Get latest message content from the last assistant or user message
+                    if user_assistant_msgs:
+                        last_msg = user_assistant_msgs[-1]
+                        content = last_msg.content or ""
+                        # Truncate for preview
+                        s_out["latest_message"] = (
+                            content[:100] + "..." if len(content) > 100 else content
+                        )
+                else:
+                    s_out["message_count"] = 0
+            except Exception:
+                # If we cannot fetch the state, default to 0 rather than failing the list
+                if "message_count" not in s_out:
                     s_out["message_count"] = 0
             enriched.append(s_out)
         return enriched
diff --git a/redis_sre_agent/cli/worker.py b/redis_sre_agent/cli/worker.py
index f9b04da7..627b0c3f 100644
--- a/redis_sre_agent/cli/worker.py
+++ b/redis_sre_agent/cli/worker.py
@@ -3,23 +3,13 @@
 from __future__ import annotations
 
 import asyncio
-import os
 
 import click
 from docket import Worker
-from opentelemetry import trace
-from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
-from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
-from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
-from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
-from opentelemetry.instrumentation.openai import OpenAIInstrumentor
-from opentelemetry.instrumentation.redis import RedisInstrumentor
-from opentelemetry.sdk.resources import Resource
-from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import BatchSpanProcessor
 
 from redis_sre_agent.core.config import settings
 from redis_sre_agent.core.docket_tasks import register_sre_tasks
+from redis_sre_agent.observability.tracing import setup_tracing
 
 
 # TODO: rename start
@@ -49,33 +39,8 @@ async def _worker():
         redis_url = settings.redis_url.get_secret_value()
         logger.info("Starting SRE Docket worker connected to Redis")
 
-        # OpenTelemetry tracing (enabled when OTEL endpoint is present)
-        otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
-        if not otlp_endpoint:
-            logger.info("OTel tracing disabled in worker (no OTEL_EXPORTER_OTLP_ENDPOINT)")
-        else:
-            resource = Resource.create(
-                {
-                    "service.name": "redis-sre-worker",
-                    "service.version": "0.1.0",
-                }
-            )
-            provider = TracerProvider(resource=resource)
-            exporter = OTLPSpanExporter(
-                endpoint=otlp_endpoint,
-                headers=os.environ.get("OTEL_EXPORTER_OTLP_HEADERS"),
-            )
-            provider.add_span_processor(BatchSpanProcessor(exporter))
-            trace.set_tracer_provider(provider)
-
-            # Libraries
-            RedisInstrumentor().instrument()
-            HTTPXClientInstrumentor().instrument()
-            AioHttpClientInstrumentor().instrument()
-            AsyncioInstrumentor().instrument()
-            OpenAIInstrumentor().instrument()
-
-            logger.info("OTel tracing enabled in worker (redis/httpx/aiohttp/asyncio)")
+        # OpenTelemetry tracing with centralized setup (includes Redis hooks for filtering)
+        setup_tracing("redis-sre-worker", "0.1.0")
 
         # Start a Prometheus metrics HTTP server to expose worker metrics (incl. LLM tokens)
         try:
diff --git a/redis_sre_agent/observability/tracing.py b/redis_sre_agent/observability/tracing.py
new file mode 100644
index 00000000..3ef415a5
--- /dev/null
+++ b/redis_sre_agent/observability/tracing.py
@@ -0,0 +1,229 @@
+"""Centralized OpenTelemetry tracing configuration and helpers.
+
+This module provides:
+- Unified tracer setup for API and worker
+- Span category constants for filtering in Grafana/Tempo
+- Redis instrumentation with request/response hooks
+- Helper decorators for consistent span attributes
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from enum import Enum
+from functools import wraps
+from typing import Any, Callable, Optional, TypeVar
+
+from opentelemetry import trace
+from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
+from opentelemetry.instrumentation.aiohttp_client import AioHttpClientInstrumentor
+from opentelemetry.instrumentation.asyncio import AsyncioInstrumentor
+from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
+from opentelemetry.instrumentation.openai import OpenAIInstrumentor
+from opentelemetry.instrumentation.redis import RedisInstrumentor
+from opentelemetry.sdk.resources import Resource
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import BatchSpanProcessor
+
+logger = logging.getLogger(__name__)
+
+# Type var for decorators
+F = TypeVar("F", bound=Callable[..., Any])
+
+
+class SpanCategory(str, Enum):
+    """Values of the ``sre_agent.category`` span attribute, for filtering in Grafana/Tempo.
+
+    Example TraceQL: { span.sre_agent.category = "llm" }
+    """
+    LLM = "llm"
+    TOOL = "tool"
+    GRAPH_NODE = "graph_node"
+    AGENT = "agent"
+    KNOWLEDGE = "knowledge"
+    HTTP = "http"
+    REDIS = "redis"  # Set by the Redis request hook; infrastructure noise that can be filtered out
+
+
+# Span attribute keys set by the hooks and decorators in this module
+ATTR_CATEGORY = "sre_agent.category"
+ATTR_COMPONENT = "sre_agent.component"
+ATTR_GRAPH_NAME = "langgraph.graph"
+ATTR_NODE_NAME = "langgraph.node"
+
+# Redis commands treated as "noisy" infrastructure (flagged via redis.is_infrastructure)
+REDIS_INFRA_COMMANDS = frozenset({
+    "PING", "SELECT", "INFO", "CONFIG", "CLIENT", "DEBUG",
+    "SLOWLOG", "MEMORY", "DBSIZE", "LASTSAVE", "TIME",
+    "SCAN", "HSCAN", "SSCAN", "ZSCAN",  # Iteration commands
+    "WATCH", "MULTI", "EXEC", "DISCARD",  # Transaction commands
+})
+
+
+def _redis_request_hook(span: trace.Span, instance: Any, args: tuple, kwargs: dict) -> None:
+    """Tag a Redis span with category, command, infrastructure flag and key prefix.
+
+    ``args`` is the command tuple: ``args[0]`` is the command name, ``args[1]`` (if any) the
+    first key. Only the key prefix is recorded; values are never attached to the span.
+    """
+    if not span or not span.is_recording():
+        return
+    span.set_attribute(ATTR_CATEGORY, SpanCategory.REDIS.value)
+
+    # Normalise the command name; bytes names are decoded so the lookup below can match.
+    raw = args[0] if args else "UNKNOWN"
+    command = (raw.decode("utf-8", errors="replace") if isinstance(raw, bytes) else str(raw)).upper()
+    span.set_attribute("redis.command", command)
+
+    # Multi-word commands ("CONFIG GET", "CLIENT SETNAME") are classified by their first word.
+    words = command.split()
+    span.set_attribute("redis.is_infrastructure", bool(words) and words[0] in REDIS_INFRA_COMMANDS)
+
+    if len(args) > 1 and isinstance(args[1], (str, bytes)):
+        key = args[1] if isinstance(args[1], str) else args[1].decode("utf-8", errors="replace")
+        span.set_attribute("redis.key_prefix", key.split(":")[0][:50])  # text before first ":"
+
+
+def _redis_response_hook(span: trace.Span, instance: Any, response: Any) -> None:
+    """Tag a recording Redis span with the Python type name of the response."""
+    recording = bool(span) and span.is_recording()
+    if not recording or response is None:
+        return
+
+    # Only the type name is recorded (never the payload), purely as a debugging aid.
+    # e.g. "bytes", "int", "list", "bool"
+    span.set_attribute("redis.response_type", type(response).__name__)
+
+
+def setup_tracing(service_name: str, service_version: str = "0.1.0") -> bool:
+    """Configure OTLP trace export and library instrumentation for one process.
+
+    Does nothing unless OTEL_EXPORTER_OTLP_ENDPOINT is set. ``service_name`` and
+    ``service_version`` become the ``service.name`` / ``service.version`` resource attributes.
+
+    Returns True if tracing was enabled, False otherwise.
+    """
+    otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
+    if not otlp_endpoint:
+        logger.info(f"OpenTelemetry tracing disabled for {service_name} (no OTEL_EXPORTER_OTLP_ENDPOINT)")
+        return False
+
+    resource = Resource.create({
+        "service.name": service_name,
+        "service.version": service_version,
+    })
+    provider = TracerProvider(resource=resource)
+
+    # OTEL_EXPORTER_OTLP_HEADERS is a raw "k=v,k2=v2" string; the exporter wants a dict and
+    # parses that variable itself when ``headers`` is omitted, so it must not be passed here.
+    exporter = OTLPSpanExporter(endpoint=otlp_endpoint)
+    provider.add_span_processor(BatchSpanProcessor(exporter))
+    trace.set_tracer_provider(provider)
+
+    # Instrument Redis with custom hooks for filtering
+    RedisInstrumentor().instrument(
+        request_hook=_redis_request_hook,
+        response_hook=_redis_response_hook,
+    )
+
+    # Instrument outbound HTTP clients, asyncio and the OpenAI SDK
+    HTTPXClientInstrumentor().instrument()
+    AioHttpClientInstrumentor().instrument()
+    AsyncioInstrumentor().instrument()
+    OpenAIInstrumentor().instrument()
+
+    logger.info(f"OpenTelemetry tracing initialized for {service_name}")
+    return True
+
+
+def get_tracer(name: str) -> trace.Tracer:
+    """Return the OpenTelemetry tracer registered under ``name`` (thin wrapper over trace.get_tracer)."""
+    return trace.get_tracer(name)
+
+
+def trace_graph_node(graph_name: str, node_name: str):
+    """Build a decorator that runs an async LangGraph node inside a ``graph_node`` span."""
+    span_name = f"langgraph.{graph_name}.{node_name}"
+    span_attrs = {
+        ATTR_CATEGORY: SpanCategory.GRAPH_NODE.value,
+        ATTR_GRAPH_NAME: graph_name,
+        ATTR_NODE_NAME: node_name,
+    }
+
+    def decorator(fn: F) -> F:
+        tracer = get_tracer(fn.__module__)
+
+        @wraps(fn)
+        async def wrapper(*args, **kwargs):
+            with tracer.start_as_current_span(span_name, attributes=span_attrs):
+                return await fn(*args, **kwargs)
+        return wrapper  # type: ignore
+    return decorator
+
+
+def trace_tool(tool_name: str, component: Optional[str] = None):
+    """Build a decorator that runs an async tool function inside a ``tool`` span."""
+    def decorator(fn: F) -> F:
+        tracer = get_tracer(fn.__module__)
+
+        @wraps(fn)
+        async def wrapper(*args, **kwargs):
+            # ``sre_agent.component`` is attached only when a truthy component was given.
+            span_attrs = {
+                ATTR_CATEGORY: SpanCategory.TOOL.value,
+                "tool.name": tool_name,
+                **({ATTR_COMPONENT: component} if component else {}),
+            }
+            with tracer.start_as_current_span(f"tool.{tool_name}", attributes=span_attrs):
+                return await fn(*args, **kwargs)
+        return wrapper  # type: ignore
+    return decorator
+
+
+def trace_llm(component: str):
+    """Build a decorator that runs an async LLM call inside an ``llm`` span.
+
+    Only the span is created here; use record_llm_call_metrics() to add token usage.
+    """
+    span_name = f"llm.{component}"
+    span_attrs = {
+        ATTR_CATEGORY: SpanCategory.LLM.value,
+        ATTR_COMPONENT: component,
+    }
+
+    def decorator(fn: F) -> F:
+        tracer = get_tracer(fn.__module__)
+
+        @wraps(fn)
+        async def wrapper(*args, **kwargs):
+            with tracer.start_as_current_span(span_name, attributes=span_attrs):
+                return await fn(*args, **kwargs)
+        return wrapper  # type: ignore
+    return decorator
+
+
+def add_span_attributes(attrs: dict[str, Any]) -> None:
+    """Best-effort: set each ``attrs`` item on the current span when it is recording."""
+    span = trace.get_current_span()
+    if span and span.is_recording():
+        for key, value in attrs.items():
+            try:
+                span.set_attribute(key, value)
+            except Exception:
+                logger.debug("Skipping span attribute %r (set_attribute raised)", key, exc_info=True)
+
+
+# Ready-made TraceQL queries for Grafana/Tempo, keyed by a short name.
+# These can be used as saved queries or in dashboard panels
+TRACEQL_QUERIES = {
+    "agent_turns": '{ span.name = "agent.turn" }',
+    "llm_calls": '{ span.sre_agent.category = "llm" }',
+    "tool_calls": '{ span.sre_agent.category = "tool" }',
+    "graph_nodes": '{ span.sre_agent.category = "graph_node" }',
+    "exclude_redis": '{ span.sre_agent.category != "redis" }',
+    "slow_llm": '{ span.sre_agent.category = "llm" && duration > 5s }',
+    "knowledge_ops": '{ span.sre_agent.category = "knowledge" }',
+    # Non-Redis spans plus app-level Redis ops (i.e. drops Redis infrastructure commands)
+    "app_only": '{ span.sre_agent.category != "redis" || span.redis.is_infrastructure = false }',
+}
diff --git a/tests/unit/api/test_threads_list_message_count.py b/tests/unit/api/test_threads_list_message_count.py
index 0fc3940c..bf29bf25 100644
--- a/tests/unit/api/test_threads_list_message_count.py
+++ b/tests/unit/api/test_threads_list_message_count.py
@@ -11,8 +11,16 @@
 
 
 def make_state_with_messages(n: int):
+    """Create a mock state with messages in the Thread.messages list (primary storage)."""
+
+    class MockMessage:
+        def __init__(self, role: str, content: str):
+            self.role = role
+            self.content = content
+
     class State:
-        context = {"messages": [{"role": "user", "content": f"m{i}"} for i in range(n)]}
+        messages = [MockMessage("user", f"m{i}") for i in range(n)]
+        context = {}
         metadata = MagicMock()
 
     return State()
diff --git a/tests/unit/observability/__init__.py b/tests/unit/observability/__init__.py
new file mode 100644
index 00000000..b2ea4de4
--- /dev/null
+++ b/tests/unit/observability/__init__.py
@@ -0,0 +1 @@
+# Observability tests
diff --git a/tests/unit/observability/test_tracing.py b/tests/unit/observability/test_tracing.py
new file mode 100644
index 00000000..547ae430
--- /dev/null
+++ b/tests/unit/observability/test_tracing.py
@@ -0,0 +1,384 @@
+"""Tests for the centralized tracing module.
+
+Tests cover:
+- SpanCategory enum values
+- Redis request/response hooks
+- Tracing decorators (trace_graph_node, trace_tool, trace_llm)
+- add_span_attributes helper
+- setup_tracing behavior (with and without OTLP endpoint)
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from redis_sre_agent.observability.tracing import (
+    ATTR_CATEGORY,
+    ATTR_COMPONENT,
+    ATTR_GRAPH_NAME,
+    ATTR_NODE_NAME,
+    REDIS_INFRA_COMMANDS,
+    TRACEQL_QUERIES,
+    SpanCategory,
+    _redis_request_hook,
+    _redis_response_hook,
+    add_span_attributes,
+    get_tracer,
+    setup_tracing,
+    trace_graph_node,
+    trace_llm,
+    trace_tool,
+)
+
+
+class TestSpanCategory:
+    """Test SpanCategory enum."""
+
+    def test_span_categories_are_strings(self):
+        """All span categories should be string values."""
+        assert SpanCategory.LLM.value == "llm"
+        assert SpanCategory.TOOL.value == "tool"
+        assert SpanCategory.GRAPH_NODE.value == "graph_node"
+        assert SpanCategory.AGENT.value == "agent"
+        assert SpanCategory.KNOWLEDGE.value == "knowledge"
+        assert SpanCategory.HTTP.value == "http"
+        assert SpanCategory.REDIS.value == "redis"
+
+    def test_span_category_is_string_subclass(self):
+        """SpanCategory should be usable as a string."""
+        assert isinstance(SpanCategory.LLM, str)
+        assert SpanCategory.LLM == "llm"
+
+
+class TestAttributeConstants:
+    """Test attribute key constants."""
+
+    def test_attribute_keys_defined(self):
+        """Verify attribute key constants are defined."""
+        assert ATTR_CATEGORY == "sre_agent.category"
+        assert ATTR_COMPONENT == "sre_agent.component"
+        assert ATTR_GRAPH_NAME == "langgraph.graph"
+        assert ATTR_NODE_NAME == "langgraph.node"
+
+
+class TestRedisInfraCommands:
+    """Test Redis infrastructure command detection."""
+
+    def test_ping_is_infra(self):
+        """PING should be an infrastructure command."""
+        assert "PING" in REDIS_INFRA_COMMANDS
+
+    def test_info_is_infra(self):
+        """INFO should be an infrastructure command."""
+        assert "INFO" in REDIS_INFRA_COMMANDS
+
+    def test_scan_commands_are_infra(self):
+        """SCAN variants should be infrastructure commands."""
+        assert "SCAN" in REDIS_INFRA_COMMANDS
+        assert "HSCAN" in REDIS_INFRA_COMMANDS
+        assert "SSCAN" in REDIS_INFRA_COMMANDS
+        assert "ZSCAN" in REDIS_INFRA_COMMANDS
+
+    def test_get_set_not_infra(self):
+        """GET and SET should not be infrastructure commands."""
+        assert "GET" not in REDIS_INFRA_COMMANDS
+        assert "SET" not in REDIS_INFRA_COMMANDS
+        assert "HSET" not in REDIS_INFRA_COMMANDS
+
+
+class TestRedisRequestHook:
+    """Test the Redis request hook."""
+
+    def test_hook_sets_category_attribute(self):
+        """Hook should set sre_agent.category to redis."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("GET", "mykey"), {})
+
+        mock_span.set_attribute.assert_any_call(ATTR_CATEGORY, "redis")
+
+    def test_hook_sets_command_attribute(self):
+        """Hook should set redis.command attribute."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("HSET", "myhash", "field", "value"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.command", "HSET")
+
+    def test_hook_marks_infra_commands(self):
+        """Hook should mark infrastructure commands."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("PING",), {})
+
+        mock_span.set_attribute.assert_any_call("redis.is_infrastructure", True)
+
+    def test_hook_marks_non_infra_commands(self):
+        """Hook should mark non-infrastructure commands."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("GET", "mykey"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.is_infrastructure", False)
+
+    def test_hook_extracts_key_prefix(self):
+        """Hook should extract key prefix before colon."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("GET", "user:123:profile"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.key_prefix", "user")
+
+    def test_hook_handles_key_without_colon(self):
+        """Hook should handle keys without colons."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("GET", "simplekey"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.key_prefix", "simplekey")
+
+    def test_hook_handles_bytes_key(self):
+        """Hook should handle bytes keys."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, ("GET", b"mykey:123"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.key_prefix", "mykey")
+
+    def test_hook_skips_non_recording_span(self):
+        """Hook should skip if span is not recording."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = False
+
+        _redis_request_hook(mock_span, None, ("GET", "mykey"), {})
+
+        mock_span.set_attribute.assert_not_called()
+
+    def test_hook_handles_none_span(self):
+        """Hook should handle None span gracefully."""
+        # Should not raise
+        _redis_request_hook(None, None, ("GET", "mykey"), {})
+
+    def test_hook_handles_empty_args(self):
+        """Hook should handle empty args gracefully."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, (), {})
+
+        mock_span.set_attribute.assert_any_call("redis.command", "UNKNOWN")
+
+
+class TestRedisResponseHook:
+    """Test the Redis response hook."""
+
+    def test_hook_sets_response_type(self):
+        """Hook should set redis.response_type attribute."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_response_hook(mock_span, None, "OK")
+
+        mock_span.set_attribute.assert_called_with("redis.response_type", "str")
+
+    def test_hook_handles_list_response(self):
+        """Hook should handle list responses."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_response_hook(mock_span, None, [b"item1", b"item2"])
+
+        mock_span.set_attribute.assert_called_with("redis.response_type", "list")
+
+    def test_hook_skips_none_response(self):
+        """Hook should skip None responses."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_response_hook(mock_span, None, None)
+
+        mock_span.set_attribute.assert_not_called()
+
+    def test_hook_skips_non_recording_span(self):
+        """Hook should skip if span is not recording."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = False
+
+        _redis_response_hook(mock_span, None, "OK")
+
+        mock_span.set_attribute.assert_not_called()
+
+
+class TestSetupTracing:
+    """Test setup_tracing function."""
+
+    def test_returns_false_without_endpoint(self):
+        """Should return False when OTLP endpoint not set."""
+        with patch.dict("os.environ", {}, clear=True):
+            # Remove OTEL_EXPORTER_OTLP_ENDPOINT if present
+            import os
+            os.environ.pop("OTEL_EXPORTER_OTLP_ENDPOINT", None)
+            result = setup_tracing("test-service")
+            assert result is False
+
+    @patch("redis_sre_agent.observability.tracing.RedisInstrumentor")
+    @patch("redis_sre_agent.observability.tracing.HTTPXClientInstrumentor")
+    @patch("redis_sre_agent.observability.tracing.AioHttpClientInstrumentor")
+    @patch("redis_sre_agent.observability.tracing.AsyncioInstrumentor")
+    @patch("redis_sre_agent.observability.tracing.OpenAIInstrumentor")
+    @patch("redis_sre_agent.observability.tracing.trace")
+    @patch("redis_sre_agent.observability.tracing.BatchSpanProcessor")
+    @patch("redis_sre_agent.observability.tracing.OTLPSpanExporter")
+    @patch("redis_sre_agent.observability.tracing.TracerProvider")
+    def test_returns_true_with_endpoint(
+        self,
+        mock_provider,
+        mock_exporter,
+        mock_processor,
+        mock_trace,
+        mock_openai,
+        mock_asyncio,
+        mock_aiohttp,
+        mock_httpx,
+        mock_redis,
+    ):
+        """Should return True when OTLP endpoint is set."""
+        with patch.dict("os.environ", {"OTEL_EXPORTER_OTLP_ENDPOINT": "http://localhost:4318"}):
+            result = setup_tracing("test-service", "1.0.0")
+            assert result is True
+            mock_redis.return_value.instrument.assert_called_once()
+
+
+class TestGetTracer:
+    """Test get_tracer function."""
+
+    @patch("redis_sre_agent.observability.tracing.trace")
+    def test_returns_tracer(self, mock_trace):
+        """Should return a tracer for the given name."""
+        mock_tracer = MagicMock()
+        mock_trace.get_tracer.return_value = mock_tracer
+
+        result = get_tracer("test.module")
+
+        mock_trace.get_tracer.assert_called_with("test.module")
+        assert result == mock_tracer
+
+
+class TestTraceGraphNodeDecorator:
+    """Test trace_graph_node decorator."""
+
+    @pytest.mark.asyncio
+    async def test_decorator_wraps_function(self):
+        """Decorator should preserve function behavior."""
+        @trace_graph_node("test_graph", "test_node")
+        async def my_node(state):
+            return {"processed": True}
+
+        result = await my_node({})
+        assert result == {"processed": True}
+
+    def test_decorator_preserves_function_name(self):
+        """Decorator should preserve function name."""
+        @trace_graph_node("test_graph", "test_node")
+        async def my_special_node(state):
+            return state
+
+        assert my_special_node.__name__ == "my_special_node"
+
+
+class TestTraceToolDecorator:
+    """Test trace_tool decorator."""
+
+    @pytest.mark.asyncio
+    async def test_decorator_wraps_function(self):
+        """Decorator should preserve function behavior."""
+        @trace_tool("test_tool")
+        async def my_tool(query):
+            return f"result for {query}"
+
+        result = await my_tool("test")
+        assert result == "result for test"
+
+    @pytest.mark.asyncio
+    async def test_decorator_with_component(self):
+        """Decorator should work with component parameter."""
+        @trace_tool("search", component="knowledge")
+        async def search_knowledge(query):
+            return ["doc1", "doc2"]
+
+        result = await search_knowledge("redis")
+        assert result == ["doc1", "doc2"]
+
+
+class TestTraceLlmDecorator:
+    """Test trace_llm decorator."""
+
+    @pytest.mark.asyncio
+    async def test_decorator_wraps_function(self):
+        """Decorator should preserve function behavior."""
+        @trace_llm("router")
+        async def call_llm(prompt):
+            return "response"
+
+        result = await call_llm("test prompt")
+        assert result == "response"
+
+
+class TestAddSpanAttributes:
+    """Test add_span_attributes helper."""
+
+    @patch("redis_sre_agent.observability.tracing.trace")
+    def test_adds_attributes_to_current_span(self, mock_trace):
+        """Should add attributes to current span."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+        mock_trace.get_current_span.return_value = mock_span
+
+        add_span_attributes({"key1": "value1", "key2": 42})
+
+        mock_span.set_attribute.assert_any_call("key1", "value1")
+        mock_span.set_attribute.assert_any_call("key2", 42)
+
+    @patch("redis_sre_agent.observability.tracing.trace")
+    def test_skips_non_recording_span(self, mock_trace):
+        """Should skip if span is not recording."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = False
+        mock_trace.get_current_span.return_value = mock_span
+
+        add_span_attributes({"key1": "value1"})
+
+        mock_span.set_attribute.assert_not_called()
+
+    @patch("redis_sre_agent.observability.tracing.trace")
+    def test_handles_no_current_span(self, mock_trace):
+        """Should handle None current span gracefully."""
+        mock_trace.get_current_span.return_value = None
+
+        # Should not raise
+        add_span_attributes({"key1": "value1"})
+
+
+class TestTraceQLQueries:
+    """Test TraceQL query constants."""
+
+    def test_queries_defined(self):
+        """TraceQL queries should be defined."""
+        assert "agent_turns" in TRACEQL_QUERIES
+        assert "llm_calls" in TRACEQL_QUERIES
+        assert "tool_calls" in TRACEQL_QUERIES
+        assert "exclude_redis" in TRACEQL_QUERIES
+        assert "app_only" in TRACEQL_QUERIES
+
+    def test_queries_are_valid_traceql_format(self):
+        """Queries should have TraceQL braces format."""
+        for name, query in TRACEQL_QUERIES.items():
+            assert query.startswith("{"), f"{name} should start with {{"
+            assert query.endswith("}"), f"{name} should end with }}"

From 96f6e3a092dc9a341133fbc4775ab349a49d40fa Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Mon, 15 Dec 2025 17:57:12 -0800
Subject: [PATCH 2/3] Fix: handle bytes command names in Redis request hook

---
 redis_sre_agent/observability/tracing.py | 12 ++++++++++--
 tests/unit/observability/test_tracing.py |  9 +++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/redis_sre_agent/observability/tracing.py b/redis_sre_agent/observability/tracing.py
index 3ef415a5..0c379eb8 100644
--- a/redis_sre_agent/observability/tracing.py
+++ b/redis_sre_agent/observability/tracing.py
@@ -69,8 +69,16 @@ def _redis_request_hook(span: trace.Span, instance: Any, args: tuple, kwargs: di
     # Add category for filtering
     span.set_attribute(ATTR_CATEGORY, SpanCategory.REDIS.value)
 
-    # Extract command name for filtering
-    command = args[0].upper() if args else "UNKNOWN"
+    # Extract command name for filtering (handle bytes and string)
+    if args:
+        cmd = args[0]
+        if isinstance(cmd, bytes):
+            cmd_str = cmd.decode("utf-8", errors="replace")
+        else:
+            cmd_str = str(cmd)
+        command = cmd_str.upper()
+    else:
+        command = "UNKNOWN"
     span.set_attribute("redis.command", command)
 
     # Mark infrastructure commands for easy filtering
diff --git a/tests/unit/observability/test_tracing.py b/tests/unit/observability/test_tracing.py
index 547ae430..436a194e 100644
--- a/tests/unit/observability/test_tracing.py
+++ b/tests/unit/observability/test_tracing.py
@@ -175,6 +175,15 @@ def test_hook_handles_empty_args(self):
 
         mock_span.set_attribute.assert_any_call("redis.command", "UNKNOWN")
 
+    def test_hook_handles_bytes_command(self):
+        """Hook should handle bytes command names."""
+        mock_span = MagicMock()
+        mock_span.is_recording.return_value = True
+
+        _redis_request_hook(mock_span, None, (b"GET", b"mykey"), {})
+
+        mock_span.set_attribute.assert_any_call("redis.command", "GET")
+
 
 class TestRedisResponseHook:
     """Test the Redis response hook."""

From 8fd447b30150028470c9f00fda600c7c8648bf7c Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Tue, 16 Dec 2025 17:59:40 -0800
Subject: [PATCH 3/3] lint and docker fixes

---
 Dockerfile                               | 13 ++----
 docker-compose.yml                       |  2 +
 redis_sre_agent/api/app.py               |  1 -
 redis_sre_agent/api/threads.py           |  4 +-
 redis_sre_agent/observability/tracing.py | 53 +++++++++++++++++++-----
 tests/unit/observability/test_tracing.py |  6 +++
 ui/ui-kit/package.json                   |  2 +-
 7 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 420eadb4..4d75a5b1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -51,21 +51,16 @@ FROM python:3.12-slim
 # commands that use `uv run` work correctly.
 COPY --from=builder /bin/uv /bin/uv
 
-# Copy the uv-managed Python installation used by the virtualenv in /app/.venv.
-# Without this, /app/.venv/bin/python points at a non-existent interpreter under
-# /root/.local/share/uv, which causes `uv run` and even direct venv usage to
-# fail with "permission denied" when running as the non-root `app` user.
-COPY --from=builder /root/.local/share/uv /root/.local/share/uv
-
-# Make the uv Python tree readable and traversable by the unprivileged `app`
-# user so that symlinks in /app/.venv/bin/python* can be resolved.
-RUN chmod 755 /root && chmod -R 755 /root/.local/share/uv || true
+# Note: With UV_LINK_MODE=copy set in the builder stage, the virtualenv at
+# /app/.venv is self-contained and doesn't require a shared Python installation
+# under /root/.local/share/uv. All necessary files are copied directly into the venv.
 
 WORKDIR /app
 
 # Install ONLY runtime system dependencies
 # We repeat the Docker/Redis install here because they are needed at runtime.
 RUN apt-get update && apt-get install -y \
+    git \
     curl \
     ca-certificates \
     redis-tools \
diff --git a/docker-compose.yml b/docker-compose.yml
index 999e1c7d..a3672b6f 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -293,6 +293,7 @@ services:
         condition: service_started
     volumes:
       - .env:/app/.env
+      - ./config.yaml:/app/config.yaml  # Mount config so edits are picked up on container restart
       - ./artifacts:/app/artifacts  # For data pipeline
       - ./redis_sre_agent:/app/redis_sre_agent  # Mount source code for development
       - ./tests:/app/tests  # Mount tests for development
@@ -318,6 +319,7 @@ services:
         condition: service_healthy
     volumes:
       - .env:/app/.env
+      - ./config.yaml:/app/config.yaml  # Mount config so edits are picked up on container restart
       - ./artifacts:/app/artifacts
       - /var/run/docker.sock:/var/run/docker.sock  # Mount Docker socket for log access
       - ./redis_sre_agent:/app/redis_sre_agent  # Mount source code for development
diff --git a/redis_sre_agent/api/app.py b/redis_sre_agent/api/app.py
index f4544190..104b588c 100644
--- a/redis_sre_agent/api/app.py
+++ b/redis_sre_agent/api/app.py
@@ -1,7 +1,6 @@
 """Main FastAPI application for Redis SRE Agent."""
 
 import logging
-import os
 from contextlib import asynccontextmanager
 
 from fastapi import FastAPI
diff --git a/redis_sre_agent/api/threads.py b/redis_sre_agent/api/threads.py
index 337db473..bed3dbd0 100644
--- a/redis_sre_agent/api/threads.py
+++ b/redis_sre_agent/api/threads.py
@@ -50,9 +50,7 @@ async def list_threads(
                     # Get messages from the Thread.messages list (primary storage)
                     msgs = state.messages or []
                     # Count user/assistant messages
-                    user_assistant_msgs = [
-                        m for m in msgs if m.role in ("user", "assistant")
-                    ]
+                    user_assistant_msgs = [m for m in msgs if m.role in ("user", "assistant")]
                     s_out["message_count"] = len(user_assistant_msgs)
 
                     # Get latest message content from the last assistant or user message
diff --git a/redis_sre_agent/observability/tracing.py b/redis_sre_agent/observability/tracing.py
index 0c379eb8..b1f258b5 100644
--- a/redis_sre_agent/observability/tracing.py
+++ b/redis_sre_agent/observability/tracing.py
@@ -37,6 +37,7 @@ class SpanCategory(str, Enum):
 
     Use TraceQL like: {span.sre_agent.category = "llm"}
     """
+
     LLM = "llm"
     TOOL = "tool"
     GRAPH_NODE = "graph_node"
@@ -53,12 +54,29 @@ class SpanCategory(str, Enum):
 ATTR_NODE_NAME = "langgraph.node"
 
 # Redis commands that are "noisy" infrastructure - useful for filtering
-REDIS_INFRA_COMMANDS = frozenset({
-    "PING", "SELECT", "INFO", "CONFIG", "CLIENT", "DEBUG",
-    "SLOWLOG", "MEMORY", "DBSIZE", "LASTSAVE", "TIME",
-    "SCAN", "HSCAN", "SSCAN", "ZSCAN",  # Iteration commands
-    "WATCH", "MULTI", "EXEC", "DISCARD",  # Transaction commands
-})
+REDIS_INFRA_COMMANDS = frozenset(
+    {
+        "PING",
+        "SELECT",
+        "INFO",
+        "CONFIG",
+        "CLIENT",
+        "DEBUG",
+        "SLOWLOG",
+        "MEMORY",
+        "DBSIZE",
+        "LASTSAVE",
+        "TIME",
+        "SCAN",
+        "HSCAN",
+        "SSCAN",
+        "ZSCAN",  # Iteration commands: SCAN, HSCAN, SSCAN, ZSCAN
+        "WATCH",
+        "MULTI",
+        "EXEC",
+        "DISCARD",  # Transaction commands: WATCH, MULTI, EXEC, DISCARD
+    }
+)
 
 
 def _redis_request_hook(span: trace.Span, instance: Any, args: tuple, kwargs: dict) -> None:
@@ -114,13 +132,17 @@ def setup_tracing(
     """
     otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
     if not otlp_endpoint:
-        logger.info(f"OpenTelemetry tracing disabled for {service_name} (no OTEL_EXPORTER_OTLP_ENDPOINT)")
+        logger.info(
+            f"OpenTelemetry tracing disabled for {service_name} (no OTEL_EXPORTER_OTLP_ENDPOINT)"
+        )
         return False
 
-    resource = Resource.create({
-        "service.name": service_name,
-        "service.version": service_version,
-    })
+    resource = Resource.create(
+        {
+            "service.name": service_name,
+            "service.version": service_version,
+        }
+    )
     provider = TracerProvider(resource=resource)
     exporter = OTLPSpanExporter(
         endpoint=otlp_endpoint,
@@ -152,6 +174,7 @@ def get_tracer(name: str) -> trace.Tracer:
 
 def trace_graph_node(graph_name: str, node_name: str):
     """Decorator to trace LangGraph node execution with standard attributes."""
+
     def decorator(fn: F) -> F:
         tracer = get_tracer(fn.__module__)
 
@@ -166,12 +189,15 @@ async def wrapper(*args, **kwargs):
                 },
             ):
                 return await fn(*args, **kwargs)
+
         return wrapper  # type: ignore
+
     return decorator
 
 
 def trace_tool(tool_name: str, component: Optional[str] = None):
     """Decorator to trace tool execution with standard attributes."""
+
     def decorator(fn: F) -> F:
         tracer = get_tracer(fn.__module__)
 
@@ -185,7 +211,9 @@ async def wrapper(*args, **kwargs):
                 attrs[ATTR_COMPONENT] = component
             with tracer.start_as_current_span(f"tool.{tool_name}", attributes=attrs):
                 return await fn(*args, **kwargs)
+
         return wrapper  # type: ignore
+
     return decorator
 
 
@@ -194,6 +222,7 @@ def trace_llm(component: str):
 
     Note: This creates the span; use record_llm_call_metrics() to add token usage.
     """
+
     def decorator(fn: F) -> F:
         tracer = get_tracer(fn.__module__)
 
@@ -207,7 +236,9 @@ async def wrapper(*args, **kwargs):
                 },
             ):
                 return await fn(*args, **kwargs)
+
         return wrapper  # type: ignore
+
     return decorator
 
 
diff --git a/tests/unit/observability/test_tracing.py b/tests/unit/observability/test_tracing.py
index 436a194e..316efbd7 100644
--- a/tests/unit/observability/test_tracing.py
+++ b/tests/unit/observability/test_tracing.py
@@ -233,6 +233,7 @@ def test_returns_false_without_endpoint(self):
         with patch.dict("os.environ", {}, clear=True):
             # Remove OTEL_EXPORTER_OTLP_ENDPOINT if present
             import os
+
             os.environ.pop("OTEL_EXPORTER_OTLP_ENDPOINT", None)
             result = setup_tracing("test-service")
             assert result is False
@@ -286,6 +287,7 @@ class TestTraceGraphNodeDecorator:
     @pytest.mark.asyncio
     async def test_decorator_wraps_function(self):
         """Decorator should preserve function behavior."""
+
         @trace_graph_node("test_graph", "test_node")
         async def my_node(state):
             return {"processed": True}
@@ -295,6 +297,7 @@ async def my_node(state):
 
     def test_decorator_preserves_function_name(self):
         """Decorator should preserve function name."""
+
         @trace_graph_node("test_graph", "test_node")
         async def my_special_node(state):
             return state
@@ -308,6 +311,7 @@ class TestTraceToolDecorator:
     @pytest.mark.asyncio
     async def test_decorator_wraps_function(self):
         """Decorator should preserve function behavior."""
+
         @trace_tool("test_tool")
         async def my_tool(query):
             return f"result for {query}"
@@ -318,6 +322,7 @@ async def my_tool(query):
     @pytest.mark.asyncio
     async def test_decorator_with_component(self):
         """Decorator should work with component parameter."""
+
         @trace_tool("search", component="knowledge")
         async def search_knowledge(query):
             return ["doc1", "doc2"]
@@ -332,6 +337,7 @@ class TestTraceLlmDecorator:
     @pytest.mark.asyncio
     async def test_decorator_wraps_function(self):
         """Decorator should preserve function behavior."""
+
         @trace_llm("router")
         async def call_llm(prompt):
             return "response"
diff --git a/ui/ui-kit/package.json b/ui/ui-kit/package.json
index cbf5257d..5494cbfc 100644
--- a/ui/ui-kit/package.json
+++ b/ui/ui-kit/package.json
@@ -102,4 +102,4 @@
     "react": ">=18.0.0",
     "react-dom": ">=18.0.0"
   }
-}
\ No newline at end of file
+}