From 0d5a6ef7e8f9433a6fb07779baac2d66b293f1b7 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:33:03 +0900 Subject: [PATCH 01/15] feat: add examples auto-run skill and refresh example scripts --- .codex/skills/examples-auto-run/SKILL.md | 66 +++ .../skills/examples-auto-run/scripts/run.sh | 200 +++++++++ .gitignore | 1 + examples/agent_patterns/agents_as_tools.py | 6 +- .../agents_as_tools_conditional.py | 8 +- examples/agent_patterns/deterministic.py | 6 +- examples/agent_patterns/input_guardrails.py | 9 +- examples/agent_patterns/llm_as_a_judge.py | 15 +- examples/agent_patterns/parallelization.py | 6 +- examples/agent_patterns/routing.py | 11 +- examples/auto_mode.py | 37 ++ examples/basic/agent_lifecycle_example.py | 3 +- examples/basic/lifecycle_example.py | 3 +- examples/basic/previous_response_id.py | 3 +- examples/customer_service/main.py | 9 +- examples/financial_research_agent/main.py | 7 +- examples/hosted_mcp/approvals.py | 5 +- examples/mcp/git_example/main.py | 6 +- examples/mcp/prompt_server/README.md | 5 +- examples/mcp/prompt_server/main.py | 26 +- examples/mcp/prompt_server/server.py | 7 +- examples/mcp/sse_example/server.py | 14 +- .../README.md | 3 +- .../main.py | 28 +- .../server.py | 6 +- examples/mcp/streamablehttp_example/README.md | 2 +- examples/mcp/streamablehttp_example/main.py | 28 +- examples/mcp/streamablehttp_example/server.py | 20 +- examples/model_providers/litellm_provider.py | 18 +- examples/reasoning_content/main.py | 2 +- examples/reasoning_content/runner_example.py | 2 +- examples/research_bot/main.py | 7 +- examples/run_examples.py | 402 +++++++++++++++--- examples/tools/apply_patch.py | 2 +- examples/tools/shell.py | 2 +- 35 files changed, 871 insertions(+), 104 deletions(-) create mode 100644 .codex/skills/examples-auto-run/SKILL.md create mode 100755 .codex/skills/examples-auto-run/scripts/run.sh create mode 100644 examples/auto_mode.py diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md new file mode 100644 index 0000000000..4e3abc7545 --- /dev/null +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -0,0 +1,66 @@ +--- +name: examples-auto-run +description: Run python examples in auto mode with logging, rerun helpers, and background control. +--- + +# examples-auto-run + +## What it does + +- Runs `uv run examples/run_examples.py` with: + - `EXAMPLES_INTERACTIVE_MODE=auto` (auto-input/auto-approve). + - Per-example logs under `.tmp/examples-start-logs/`. + - Main summary log path passed via `--main-log` (also under `.tmp/examples-start-logs/`). + - Generates a rerun list of failures at `.tmp/examples-rerun.txt` when `--write-rerun` is set. +- Provides start/stop/status/logs/tail/collect/rerun helpers via `run.sh`. +- Background option keeps the process running with a pidfile; `stop` cleans it up. 
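
For reference, a foreground `start` boils down to roughly this invocation (a sketch; the script also exports the auto-approve defaults listed below and tees stdout to a log under `.tmp/examples-start-logs/`):

```bash
EXAMPLES_INTERACTIVE_MODE=auto uv run examples/run_examples.py \
  --auto-mode --write-rerun \
  --main-log .tmp/examples-start-logs/main_<timestamp>.log \
  --logs-dir .tmp/examples-start-logs
```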
+ +## Usage + +```bash +# Start (auto mode; interactive included by default) +.codex/skills/examples-auto-run/scripts/run.sh start [extra args to run_examples.py] +# Examples: +.codex/skills/examples-auto-run/scripts/run.sh start --filter basic +.codex/skills/examples-auto-run/scripts/run.sh start --include-server --include-audio + +# Check status +.codex/skills/examples-auto-run/scripts/run.sh status + +# Stop running job +.codex/skills/examples-auto-run/scripts/run.sh stop + +# List logs +.codex/skills/examples-auto-run/scripts/run.sh logs + +# Tail latest log (or specify one) +.codex/skills/examples-auto-run/scripts/run.sh tail +.codex/skills/examples-auto-run/scripts/run.sh tail main_20260113-123000.log + +# Collect rerun list from a main log (defaults to latest main_*.log) +.codex/skills/examples-auto-run/scripts/run.sh collect + +# Rerun only failed entries from rerun file (auto mode) +.codex/skills/examples-auto-run/scripts/run.sh rerun +``` + +## Defaults (overridable via env) + +- `EXAMPLES_INTERACTIVE_MODE=auto` +- `EXAMPLES_INCLUDE_INTERACTIVE=1` +- `EXAMPLES_INCLUDE_SERVER=0` +- `EXAMPLES_INCLUDE_AUDIO=0` +- `EXAMPLES_INCLUDE_EXTERNAL=0` +- Auto-approvals in auto mode: `APPLY_PATCH_AUTO_APPROVE=1`, `SHELL_AUTO_APPROVE=1`, `AUTO_APPROVE_MCP=1` + +## Log locations + +- Main logs: `.tmp/examples-start-logs/main_*.log` +- Per-example logs (from `run_examples.py`): `.tmp/examples-start-logs/.log` +- Rerun list: `.tmp/examples-rerun.txt` + +## Notes + +- The runner delegates to `uv run examples/run_examples.py`, which already writes per-example logs and supports `--collect`, `--rerun-file`, and `--print-auto-skip`. +- `start` uses `--write-rerun` so failures are captured automatically. +- If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default. diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh new file mode 100755 index 0000000000..d36270c915 --- /dev/null +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -0,0 +1,200 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)" +PID_FILE="$ROOT/.tmp/examples-auto-run.pid" +LOG_DIR="$ROOT/.tmp/examples-start-logs" +RERUN_FILE="$ROOT/.tmp/examples-rerun.txt" + +ensure_dirs() { + mkdir -p "$LOG_DIR" "$ROOT/.tmp" +} + +is_running() { + local pid="$1" + [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1 +} + +cmd_start() { + ensure_dirs + local background=0 + if [[ "${1:-}" == "--background" ]]; then + background=1 + shift + fi + + local ts log_file + ts="$(date +%Y%m%d-%H%M%S)" + log_file="$LOG_DIR/main_${ts}.log" + + local run_cmd=( + uv run examples/run_examples.py + --auto-mode + --write-rerun + --main-log "$log_file" + --logs-dir "$LOG_DIR" + ) + + if [[ "$background" -eq 1 ]]; then + if [[ -f "$PID_FILE" ]]; then + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if is_running "$pid"; then + echo "examples/run_examples.py already running (pid=$pid)." 
+ exit 1 + fi + fi + ( + trap '' HUP + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" + export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" + export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" + export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" + cd "$ROOT" + "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" >/dev/null + ) & + local pid=$! + echo "$pid" >"$PID_FILE" + echo "Started run_examples.py (pid=$pid)" + echo "Main log: $log_file" + return 0 + fi + + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" + export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" + export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" + export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" + cd "$ROOT" + "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" +} + +cmd_stop() { + if [[ ! -f "$PID_FILE" ]]; then + echo "No pid file; nothing to stop." + return 0 + fi + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if [[ -z "$pid" ]]; then + rm -f "$PID_FILE" + echo "Pid file empty; cleaned." + return 0 + fi + if ! is_running "$pid"; then + rm -f "$PID_FILE" + echo "Process $pid not running; cleaned pid file." + return 0 + fi + echo "Stopping pid $pid ..." + kill "$pid" 2>/dev/null || true + sleep 1 + if is_running "$pid"; then + echo "Sending SIGKILL to $pid ..." + kill -9 "$pid" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +cmd_status() { + if [[ -f "$PID_FILE" ]]; then + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if is_running "$pid"; then + echo "Running (pid=$pid)" + return 0 + fi + fi + echo "Not running." +} + +cmd_logs() { + ensure_dirs + ls -1t "$LOG_DIR" +} + +cmd_tail() { + ensure_dirs + local file="${1:-}" + if [[ -z "$file" ]]; then + file="$(ls -1t "$LOG_DIR" | head -n1)" + fi + if [[ -z "$file" ]]; then + echo "No log files yet." + exit 1 + fi + tail -f "$LOG_DIR/$file" +} + +collect_rerun() { + ensure_dirs + local log_file="${1:-}" + if [[ -z "$log_file" ]]; then + log_file="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" + fi + if [[ -z "$log_file" ]] || [[ ! -f "$log_file" ]]; then + echo "No main log file found." + exit 1 + fi + cd "$ROOT" + uv run examples/run_examples.py --collect "$log_file" --output "$RERUN_FILE" +} + +cmd_rerun() { + ensure_dirs + local file="${1:-$RERUN_FILE}" + if [[ ! -s "$file" ]]; then + echo "Rerun list is empty: $file" + exit 0 + fi + cd "$ROOT" + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun +} + +usage() { + cat <<'EOF' +Usage: run.sh [args...] + +Commands: + start [--filter ... | other args] Run examples in auto mode (foreground). Pass --background to run detached. + stop Kill the running auto-run (if any). + status Show whether it is running. 
+ logs List log files (.tmp/examples-start-logs). + tail [logfile] Tail the latest (or specified) log. + collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. + rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. + +Environment overrides: + EXAMPLES_INTERACTIVE_MODE (default auto) + EXAMPLES_INCLUDE_SERVER/INTERACTIVE/AUDIO/EXTERNAL (defaults: 0/1/0/0) + APPLY_PATCH_AUTO_APPROVE, SHELL_AUTO_APPROVE, AUTO_APPROVE_MCP (default 1 in auto mode) +EOF +} + +default_cmd="start" +if [[ $# -eq 0 && -s "$RERUN_FILE" ]]; then + default_cmd="rerun" +fi + +case "${1:-$default_cmd}" in + start) shift || true; cmd_start "$@" ;; + stop) shift || true; cmd_stop ;; + status) shift || true; cmd_status ;; + logs) shift || true; cmd_logs ;; + tail) shift; cmd_tail "${1:-}" ;; + collect) shift || true; collect_rerun "${1:-}" ;; + rerun) shift || true; cmd_rerun "${1:-}" ;; + *) usage; exit 1 ;; +esac diff --git a/.gitignore b/.gitignore index 60782274e9..ac32a2998d 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ htmlcov/ .coverage .coverage.* .cache +.tmp/ nosetests.xml coverage.xml *.cover diff --git a/examples/agent_patterns/agents_as_tools.py b/examples/agent_patterns/agents_as_tools.py index 9fd118efb3..b670e2fe06 100644 --- a/examples/agent_patterns/agents_as_tools.py +++ b/examples/agent_patterns/agents_as_tools.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, ItemHelpers, MessageOutputItem, Runner, trace +from examples.auto_mode import input_with_fallback """ This example shows the agents-as-tools pattern. The frontline agent receives a user message and @@ -56,7 +57,10 @@ async def main(): - msg = input("Hi! What would you like translated, and to which languages? ") + msg = input_with_fallback( + "Hi! What would you like translated, and to which languages? ", + "Translate 'Hello, world!' to French and Spanish.", + ) # Run the entire orchestration in a single trace with trace("Orchestrator evaluator"): diff --git a/examples/agent_patterns/agents_as_tools_conditional.py b/examples/agent_patterns/agents_as_tools_conditional.py index e00f56d5e3..87533721d3 100644 --- a/examples/agent_patterns/agents_as_tools_conditional.py +++ b/examples/agent_patterns/agents_as_tools_conditional.py @@ -3,6 +3,7 @@ from pydantic import BaseModel from agents import Agent, AgentBase, RunContextWrapper, Runner, trace +from examples.auto_mode import input_with_fallback """ This example demonstrates the agents-as-tools pattern with conditional tool enabling. @@ -81,7 +82,7 @@ async def main(): print("2. French and Spanish (2 tools)") print("3. 
European languages (3 tools)") - choice = input("\nSelect option (1-3): ").strip() + choice = input_with_fallback("\nSelect option (1-3): ", "2").strip() preference_map = {"1": "spanish_only", "2": "french_spanish", "3": "european"} language_preference = preference_map.get(choice, "spanish_only") @@ -95,7 +96,10 @@ async def main(): print(f"The LLM will only see and can use these {len(available_tools)} tools\n") # Get user request - user_request = input("Ask a question and see responses in available languages:\n") + user_request = input_with_fallback( + "Ask a question and see responses in available languages:\n", + "How do you say good morning?", + ) # Run with LLM interaction print("\nProcessing request...") diff --git a/examples/agent_patterns/deterministic.py b/examples/agent_patterns/deterministic.py index 0c163afe9e..30bef35e25 100644 --- a/examples/agent_patterns/deterministic.py +++ b/examples/agent_patterns/deterministic.py @@ -3,6 +3,7 @@ from pydantic import BaseModel from agents import Agent, Runner, trace +from examples.auto_mode import input_with_fallback """ This example demonstrates a deterministic flow, where each step is performed by an agent. @@ -39,7 +40,10 @@ class OutlineCheckerOutput(BaseModel): async def main(): - input_prompt = input("What kind of story do you want? ") + input_prompt = input_with_fallback( + "What kind of story do you want? ", + "Write a short sci-fi story.", + ) # Ensure the entire workflow is a single trace with trace("Deterministic story flow"): diff --git a/examples/agent_patterns/input_guardrails.py b/examples/agent_patterns/input_guardrails.py index 18ab9d2a75..7e4210d6af 100644 --- a/examples/agent_patterns/input_guardrails.py +++ b/examples/agent_patterns/input_guardrails.py @@ -13,6 +13,7 @@ TResponseInputItem, input_guardrail, ) +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows how to use guardrails. @@ -68,9 +69,13 @@ async def main(): ) input_data: list[TResponseInputItem] = [] + auto_mode = is_auto_mode() while True: - user_input = input("Enter a message: ") + user_input = input_with_fallback( + "Enter a message: ", + "What's the capital of California?", + ) input_data.append( { "role": "user", @@ -93,6 +98,8 @@ async def main(): "content": message, } ) + if auto_mode: + break # Sample run: # Enter a message: What's the capital of California? diff --git a/examples/agent_patterns/llm_as_a_judge.py b/examples/agent_patterns/llm_as_a_judge.py index 39a55c4630..1ee4915e18 100644 --- a/examples/agent_patterns/llm_as_a_judge.py +++ b/examples/agent_patterns/llm_as_a_judge.py @@ -5,6 +5,7 @@ from typing import Literal from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows the LLM as a judge pattern. The first agent generates an outline for a story. @@ -39,10 +40,16 @@ class EvaluationFeedback: async def main() -> None: - msg = input("What kind of story would you like to hear? ") + msg = input_with_fallback( + "What kind of story would you like to hear? 
", + "A detective story in space.", + ) input_items: list[TResponseInputItem] = [{"content": msg, "role": "user"}] latest_outline: str | None = None + auto_mode = is_auto_mode() + max_rounds = 3 if auto_mode else None + rounds = 0 # We'll run the entire workflow in a single trace with trace("LLM as a judge"): @@ -65,6 +72,12 @@ async def main() -> None: print("Story outline is good enough, exiting.") break + if auto_mode: + rounds += 1 + if max_rounds is not None and rounds >= max_rounds: + print("Auto mode: stopping after limited rounds.") + break + print("Re-running with feedback") input_items.append({"content": f"Feedback: {result.feedback}", "role": "user"}) diff --git a/examples/agent_patterns/parallelization.py b/examples/agent_patterns/parallelization.py index fe2a8ecd0b..60dcfbe07f 100644 --- a/examples/agent_patterns/parallelization.py +++ b/examples/agent_patterns/parallelization.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, ItemHelpers, Runner, trace +from examples.auto_mode import input_with_fallback """ This example shows the parallelization pattern. We run the agent three times in parallel, and pick @@ -19,7 +20,10 @@ async def main(): - msg = input("Hi! Enter a message, and we'll translate it to Spanish.\n\n") + msg = input_with_fallback( + "Hi! Enter a message, and we'll translate it to Spanish.\n\n", + "Good morning!", + ) # Ensure the entire workflow is a single trace with trace("Parallel translation"): diff --git a/examples/agent_patterns/routing.py b/examples/agent_patterns/routing.py index 3dcaefa980..4d0a49ab74 100644 --- a/examples/agent_patterns/routing.py +++ b/examples/agent_patterns/routing.py @@ -4,6 +4,7 @@ from openai.types.responses import ResponseContentPartDoneEvent, ResponseTextDeltaEvent from agents import Agent, RawResponsesStreamEvent, Runner, TResponseInputItem, trace +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows the handoffs/routing pattern. The triage agent receives the first message, and @@ -37,9 +38,13 @@ async def main(): # We'll create an ID for this conversation, so we can link each trace conversation_id = str(uuid.uuid4().hex[:16]) - msg = input("Hi! We speak French, Spanish and English. How can I help? ") + msg = input_with_fallback( + "Hi! We speak French, Spanish and English. How can I help? ", + "Hello, how do I say good evening in French?", + ) agent = triage_agent inputs: list[TResponseInputItem] = [{"content": msg, "role": "user"}] + auto_mode = is_auto_mode() while True: # Each conversation turn is a single trace. Normally, each input from the user would be an @@ -61,7 +66,9 @@ async def main(): inputs = result.to_input_list() print("\n") - user_msg = input("Enter a message: ") + if auto_mode: + break + user_msg = input_with_fallback("Enter a message: ", "Thanks!") inputs.append({"content": user_msg, "role": "user"}) agent = result.current_agent diff --git a/examples/auto_mode.py b/examples/auto_mode.py new file mode 100644 index 0000000000..9a7b71fe71 --- /dev/null +++ b/examples/auto_mode.py @@ -0,0 +1,37 @@ +"""Utilities for running examples in automated mode. + +When ``EXAMPLES_INTERACTIVE_MODE=auto`` is set, these helpers provide +deterministic inputs and confirmations so examples can run without manual +interaction. The helpers are intentionally lightweight to avoid adding +dependencies to example code. 
+""" + +from __future__ import annotations + +import os + + +def is_auto_mode() -> bool: + """Return True when examples should bypass interactive prompts.""" + return os.environ.get("EXAMPLES_INTERACTIVE_MODE", "").lower() == "auto" + + +def input_with_fallback(prompt: str, fallback: str) -> str: + """Return the fallback text in auto mode, otherwise defer to input().""" + if is_auto_mode(): + print(f"[auto-input] {prompt.strip()} -> {fallback}") + return fallback + return input(prompt) + + +def confirm_with_fallback(prompt: str, default: bool = True) -> bool: + """Return default in auto mode; otherwise ask the user.""" + if is_auto_mode(): + choice = "yes" if default else "no" + print(f"[auto-confirm] {prompt.strip()} -> {choice}") + return default + + answer = input(prompt).strip().lower() + if not answer: + return default + return answer in {"y", "yes"} diff --git a/examples/basic/agent_lifecycle_example.py b/examples/basic/agent_lifecycle_example.py index 96238fe2ea..d135b8f452 100644 --- a/examples/basic/agent_lifecycle_example.py +++ b/examples/basic/agent_lifecycle_example.py @@ -13,6 +13,7 @@ Tool, function_tool, ) +from examples.auto_mode import input_with_fallback class CustomAgentHooks(AgentHooks): @@ -98,7 +99,7 @@ class FinalResult(BaseModel): async def main() -> None: - user_input = input("Enter a max number: ") + user_input = input_with_fallback("Enter a max number: ", "50") try: max_number = int(user_input) await Runner.run( diff --git a/examples/basic/lifecycle_example.py b/examples/basic/lifecycle_example.py index 76529c56b1..5ecd3a6b75 100644 --- a/examples/basic/lifecycle_example.py +++ b/examples/basic/lifecycle_example.py @@ -17,6 +17,7 @@ ) from agents.items import ModelResponse, TResponseInputItem from agents.tool_context import ToolContext +from examples.auto_mode import input_with_fallback class LoggingHooks(AgentHooks[Any]): @@ -146,7 +147,7 @@ class FinalResult(BaseModel): async def main() -> None: - user_input = input("Enter a max number: ") + user_input = input_with_fallback("Enter a max number: ", "50") try: max_number = int(user_input) await Runner.run( diff --git a/examples/basic/previous_response_id.py b/examples/basic/previous_response_id.py index b00bf3aa64..21c354219d 100644 --- a/examples/basic/previous_response_id.py +++ b/examples/basic/previous_response_id.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, Runner +from examples.auto_mode import input_with_fallback """This demonstrates usage of the `previous_response_id` parameter to continue a conversation. The second run passes the previous response ID to the model, which allows it to continue the @@ -59,7 +60,7 @@ async def main_stream(): if __name__ == "__main__": - is_stream = input("Run in stream mode? (y/n): ") + is_stream = input_with_fallback("Run in stream mode? 
(y/n): ", "n") if is_stream == "y": asyncio.run(main_stream()) else: diff --git a/examples/customer_service/main.py b/examples/customer_service/main.py index 266a7e6118..65191559c3 100644 --- a/examples/customer_service/main.py +++ b/examples/customer_service/main.py @@ -21,6 +21,7 @@ trace, ) from agents.extensions.handoff_prompt import RECOMMENDED_PROMPT_PREFIX +from examples.auto_mode import input_with_fallback, is_auto_mode ### CONTEXT @@ -143,13 +144,17 @@ async def main(): current_agent: Agent[AirlineAgentContext] = triage_agent input_items: list[TResponseInputItem] = [] context = AirlineAgentContext() + auto_mode = is_auto_mode() # Normally, each input from the user would be an API request to your app, and you can wrap the request in a trace() # Here, we'll just use a random UUID for the conversation ID conversation_id = uuid.uuid4().hex[:16] while True: - user_input = input("Enter your message: ") + user_input = input_with_fallback( + "Enter your message: ", + "What are your store hours?", + ) with trace("Customer service", group_id=conversation_id): input_items.append({"content": user_input, "role": "user"}) result = await Runner.run(current_agent, input_items, context=context) @@ -170,6 +175,8 @@ async def main(): print(f"{agent_name}: Skipping item: {new_item.__class__.__name__}") input_items = result.to_input_list() current_agent = result.last_agent + if auto_mode: + break if __name__ == "__main__": diff --git a/examples/financial_research_agent/main.py b/examples/financial_research_agent/main.py index b5b6cfdfd3..23b6d71d6b 100644 --- a/examples/financial_research_agent/main.py +++ b/examples/financial_research_agent/main.py @@ -1,5 +1,7 @@ import asyncio +from examples.auto_mode import input_with_fallback + from .manager import FinancialResearchManager @@ -8,7 +10,10 @@ # financial research query, for example: # "Write up an analysis of Apple Inc.'s most recent quarter." async def main() -> None: - query = input("Enter a financial research query: ") + query = input_with_fallback( + "Enter a financial research query: ", + "Write up an analysis of Apple Inc.'s most recent quarter.", + ) mgr = FinancialResearchManager() await mgr.run(query) diff --git a/examples/hosted_mcp/approvals.py b/examples/hosted_mcp/approvals.py index c3de0db447..2aa73c1ebc 100644 --- a/examples/hosted_mcp/approvals.py +++ b/examples/hosted_mcp/approvals.py @@ -8,14 +8,15 @@ MCPToolApprovalRequest, Runner, ) +from examples.auto_mode import confirm_with_fallback """This example demonstrates how to use the hosted MCP support in the OpenAI Responses API, with approval callbacks.""" def approval_callback(request: MCPToolApprovalRequest) -> MCPToolApprovalFunctionResult: - answer = input(f"Approve running the tool `{request.data.name}`? (y/n) ") - result: MCPToolApprovalFunctionResult = {"approve": answer == "y"} + approve = confirm_with_fallback(f"Approve running the tool `{request.data.name}`? 
(y/n) ", True) + result: MCPToolApprovalFunctionResult = {"approve": approve} if not result["approve"]: result["reason"] = "User denied" return result diff --git a/examples/mcp/git_example/main.py b/examples/mcp/git_example/main.py index ab229e8550..8a62744d18 100644 --- a/examples/mcp/git_example/main.py +++ b/examples/mcp/git_example/main.py @@ -3,6 +3,7 @@ from agents import Agent, Runner, trace from agents.mcp import MCPServer, MCPServerStdio +from examples.auto_mode import input_with_fallback async def run(mcp_server: MCPServer, directory_path: str): @@ -27,7 +28,10 @@ async def run(mcp_server: MCPServer, directory_path: str): async def main(): # Ask the user for the directory path - directory_path = input("Please enter the path to the git repository: ") + directory_path = input_with_fallback( + "Please enter the path to the git repository: ", + ".", + ) async with MCPServerStdio( cache_tools_list=True, # Cache the tools list, for demonstration diff --git a/examples/mcp/prompt_server/README.md b/examples/mcp/prompt_server/README.md index c1b1c3b376..c1eaa632df 100644 --- a/examples/mcp/prompt_server/README.md +++ b/examples/mcp/prompt_server/README.md @@ -10,7 +10,8 @@ uv run python examples/mcp/prompt_server/main.py ## Details -The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The server runs in a sub-process at `http://localhost:8000/mcp` and provides user-controlled prompts that generate agent instructions. +The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The script auto-selects an open localhost port (or honors `STREAMABLE_HTTP_PORT`) and runs the server at `http://:/mcp`, providing user-controlled prompts that generate agent instructions. +If you need a specific address, set `STREAMABLE_HTTP_PORT` and `STREAMABLE_HTTP_HOST`. The server exposes prompts like `generate_code_review_instructions` that take parameters such as focus area and programming language. The agent calls these prompts to dynamically generate its system instructions based on user-provided parameters. @@ -26,4 +27,4 @@ The example demonstrates two key functions: - Runs the agent against vulnerable sample code (command injection via `os.system`) - The agent analyzes the code and provides security-focused feedback using available tools -This pattern allows users to dynamically configure agent behavior through MCP prompts rather than hardcoded instructions. \ No newline at end of file +This pattern allows users to dynamically configure agent behavior through MCP prompts rather than hardcoded instructions. 
diff --git a/examples/mcp/prompt_server/main.py b/examples/mcp/prompt_server/main.py index 4caa95d888..543ec273c4 100644 --- a/examples/mcp/prompt_server/main.py +++ b/examples/mcp/prompt_server/main.py @@ -1,6 +1,7 @@ import asyncio import os import shutil +import socket import subprocess import time from typing import Any @@ -9,6 +10,22 @@ from agents.mcp import MCPServer, MCPServerStreamableHttp from agents.model_settings import ModelSettings +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") + + +def _choose_port() -> int: + env_port = os.getenv("STREAMABLE_HTTP_PORT") + if env_port: + return int(env_port) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((STREAMABLE_HTTP_HOST, 0)) + return s.getsockname()[1] + + +STREAMABLE_HTTP_PORT = _choose_port() +os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) +STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp" + async def get_instructions_from_prompt(mcp_server: MCPServer, prompt_name: str, **kwargs) -> str: """Get agent instructions by calling MCP prompt endpoint (user-controlled)""" @@ -75,7 +92,7 @@ async def show_available_prompts(mcp_server: MCPServer): async def main(): async with MCPServerStreamableHttp( name="Simple Prompt Server", - params={"url": "http://localhost:8000/mcp"}, + params={"url": STREAMABLE_HTTP_URL}, ) as server: trace_id = gen_trace_id() with trace(workflow_name="Simple Prompt Demo", trace_id=trace_id): @@ -94,8 +111,11 @@ async def main(): this_dir = os.path.dirname(os.path.abspath(__file__)) server_file = os.path.join(this_dir, "server.py") - print("Starting Simple Prompt Server...") - process = subprocess.Popen(["uv", "run", server_file]) + print(f"Starting Simple Prompt Server at {STREAMABLE_HTTP_URL} ...") + env = os.environ.copy() + env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST) + env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) + process = subprocess.Popen(["uv", "run", server_file], env=env) time.sleep(3) print("Server started\n") except Exception as e: diff --git a/examples/mcp/prompt_server/server.py b/examples/mcp/prompt_server/server.py index 01dcbac346..7d6629acd7 100644 --- a/examples/mcp/prompt_server/server.py +++ b/examples/mcp/prompt_server/server.py @@ -1,7 +1,12 @@ +import os + from mcp.server.fastmcp import FastMCP +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") +STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080")) + # Create server -mcp = FastMCP("Prompt Server") +mcp = FastMCP("Prompt Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT) # Instruction-generating prompts (user-controlled) diff --git a/examples/mcp/sse_example/server.py b/examples/mcp/sse_example/server.py index df364aa3af..2e4fe2db86 100644 --- a/examples/mcp/sse_example/server.py +++ b/examples/mcp/sse_example/server.py @@ -23,10 +23,16 @@ def get_secret_word() -> str: @mcp.tool() def get_current_weather(city: str) -> str: print(f"[debug-server] get_current_weather({city})") - - endpoint = "https://wttr.in" - response = requests.get(f"{endpoint}/{city}") - return response.text + # Avoid slow or flaky network calls during automated runs. + try: + endpoint = "https://wttr.in" + response = requests.get(f"{endpoint}/{city}", timeout=2) + if response.ok: + return response.text + except Exception: + pass + # Fallback keeps the tool responsive even when offline. + return f"Weather data unavailable right now; assume clear skies in {city}." 
 
 
 if __name__ == "__main__":
diff --git a/examples/mcp/streamablehttp_custom_client_example/README.md b/examples/mcp/streamablehttp_custom_client_example/README.md
index 1569b3c28c..fc269a0644 100644
--- a/examples/mcp/streamablehttp_custom_client_example/README.md
+++ b/examples/mcp/streamablehttp_custom_client_example/README.md
@@ -38,7 +38,7 @@ def create_custom_http_client() -> httpx.AsyncClient:
 async with MCPServerStreamableHttp(
     name="Custom Client Server",
     params={
-        "url": "http://localhost:8000/mcp",
+        "url": "http://localhost:<port>/mcp",
         "httpx_client_factory": create_custom_http_client,
     },
 ) as server:
@@ -60,3 +60,4 @@ async with MCPServerStreamableHttp(
 - **Performance**: Optimize timeouts and connection settings for your use case
 - **Compatibility**: Work with corporate proxies and network restrictions
 
+This example will auto-pick a free localhost port unless you set `STREAMABLE_HTTP_PORT`; use `STREAMABLE_HTTP_HOST` to change the bind address.
diff --git a/examples/mcp/streamablehttp_custom_client_example/main.py b/examples/mcp/streamablehttp_custom_client_example/main.py
index 41e26ec35d..9c45812009 100644
--- a/examples/mcp/streamablehttp_custom_client_example/main.py
+++ b/examples/mcp/streamablehttp_custom_client_example/main.py
@@ -7,6 +7,7 @@
 import asyncio
 import os
 import shutil
+import socket
 import subprocess
 import time
 from typing import Any
@@ -17,6 +18,22 @@
 import httpx
 
 from agents import Agent, Runner, gen_trace_id, trace
 from agents.mcp import MCPServer, MCPServerStreamableHttp
 from agents.model_settings import ModelSettings
 
+STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1")
+
+
+def _choose_port() -> int:
+    env_port = os.getenv("STREAMABLE_HTTP_PORT")
+    if env_port:
+        return int(env_port)
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind((STREAMABLE_HTTP_HOST, 0))
+        return s.getsockname()[1]
+
+
+STREAMABLE_HTTP_PORT = _choose_port()
+os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT))
+STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp"
+
 
 def create_custom_http_client(
     headers: dict[str, str] | None = None,
@@ -73,7 +90,7 @@
     async with MCPServerStreamableHttp(
         name="Streamable HTTP with Custom Client",
         params={
-            "url": "http://localhost:8000/mcp",
+            "url": STREAMABLE_HTTP_URL,
             "httpx_client_factory": create_custom_http_client,
         },
     ) as server:
@@ -91,16 +108,19 @@
     )
 
     # We'll run the Streamable HTTP server in a subprocess. Usually this would be a remote server, but for this
-    # demo, we'll run it locally at http://localhost:8000/mcp
+    # demo, we'll run it locally at STREAMABLE_HTTP_URL
     process: subprocess.Popen[Any] | None = None
     try:
         this_dir = os.path.dirname(os.path.abspath(__file__))
         server_file = os.path.join(this_dir, "server.py")
 
-        print("Starting Streamable HTTP server at http://localhost:8000/mcp ...")
+        print(f"Starting Streamable HTTP server at {STREAMABLE_HTTP_URL} ...")
 
         # Run `uv run server.py` to start the Streamable HTTP server
-        process = subprocess.Popen(["uv", "run", server_file])
+        env = os.environ.copy()
+        env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST)
+        env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT))
+        process = subprocess.Popen(["uv", "run", server_file], env=env)
 
         # Give it 3 seconds to start
         time.sleep(3)
diff --git a/examples/mcp/streamablehttp_custom_client_example/server.py b/examples/mcp/streamablehttp_custom_client_example/server.py
index a078ee00fa..dd0d468753 100644
--- a/examples/mcp/streamablehttp_custom_client_example/server.py
+++ b/examples/mcp/streamablehttp_custom_client_example/server.py
@@ -1,9 +1,13 @@
+import os
 import random
 
 from mcp.server.fastmcp import FastMCP
 
+STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1")
+STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080"))
+
 # Create server
-mcp = FastMCP("Echo Server")
+mcp = FastMCP("Echo Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT)
diff --git a/examples/mcp/streamablehttp_example/README.md b/examples/mcp/streamablehttp_example/README.md
index a07fe19be3..83cae670b6 100644
--- a/examples/mcp/streamablehttp_example/README.md
+++ b/examples/mcp/streamablehttp_example/README.md
@@ -10,4 +10,4 @@ uv run python examples/mcp/streamablehttp_example/main.py
 
 ## Details
 
-The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The server runs in a sub-process at `https://localhost:8000/mcp`.
+The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The script picks an open localhost port automatically (or honors `STREAMABLE_HTTP_PORT` if you set it) and starts the server at `http://<host>:<port>/mcp`. Set `STREAMABLE_HTTP_HOST` if you need a different bind address.
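+
+For instance, to run this example on a fixed port (a sketch; the same variables are passed through to the spawned `server.py`):
+
+```bash
+STREAMABLE_HTTP_HOST=127.0.0.1 STREAMABLE_HTTP_PORT=9000 \
+  uv run python examples/mcp/streamablehttp_example/main.py
+```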
diff --git a/examples/mcp/streamablehttp_example/main.py b/examples/mcp/streamablehttp_example/main.py index cc95e798b6..fd1140f98e 100644 --- a/examples/mcp/streamablehttp_example/main.py +++ b/examples/mcp/streamablehttp_example/main.py @@ -1,6 +1,7 @@ import asyncio import os import shutil +import socket import subprocess import time from typing import Any @@ -9,6 +10,22 @@ from agents.mcp import MCPServer, MCPServerStreamableHttp from agents.model_settings import ModelSettings +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") + + +def _choose_port() -> int: + env_port = os.getenv("STREAMABLE_HTTP_PORT") + if env_port: + return int(env_port) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((STREAMABLE_HTTP_HOST, 0)) + return s.getsockname()[1] + + +STREAMABLE_HTTP_PORT = _choose_port() +os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) +STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp" + async def run(mcp_server: MCPServer): agent = Agent( @@ -41,7 +58,7 @@ async def main(): async with MCPServerStreamableHttp( name="Streamable HTTP Python Server", params={ - "url": "http://localhost:8000/mcp", + "url": STREAMABLE_HTTP_URL, }, ) as server: trace_id = gen_trace_id() @@ -58,16 +75,19 @@ async def main(): ) # We'll run the Streamable HTTP server in a subprocess. Usually this would be a remote server, but for this - # demo, we'll run it locally at http://localhost:8000/mcp + # demo, we'll run it locally at STREAMABLE_HTTP_URL process: subprocess.Popen[Any] | None = None try: this_dir = os.path.dirname(os.path.abspath(__file__)) server_file = os.path.join(this_dir, "server.py") - print("Starting Streamable HTTP server at http://localhost:8000/mcp ...") + print(f"Starting Streamable HTTP server at {STREAMABLE_HTTP_URL} ...") # Run `uv run server.py` to start the Streamable HTTP server - process = subprocess.Popen(["uv", "run", server_file]) + env = os.environ.copy() + env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST) + env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) + process = subprocess.Popen(["uv", "run", server_file], env=env) # Give it 3 seconds to start time.sleep(3) diff --git a/examples/mcp/streamablehttp_example/server.py b/examples/mcp/streamablehttp_example/server.py index d8f839652a..d73ab895b6 100644 --- a/examples/mcp/streamablehttp_example/server.py +++ b/examples/mcp/streamablehttp_example/server.py @@ -1,10 +1,14 @@ +import os import random import requests from mcp.server.fastmcp import FastMCP +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") +STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080")) + # Create server -mcp = FastMCP("Echo Server") +mcp = FastMCP("Echo Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT) @mcp.tool() @@ -23,10 +27,16 @@ def get_secret_word() -> str: @mcp.tool() def get_current_weather(city: str) -> str: print(f"[debug-server] get_current_weather({city})") - - endpoint = "https://wttr.in" - response = requests.get(f"{endpoint}/{city}") - return response.text + # Avoid slow or flaky network calls during automated runs. + try: + endpoint = "https://wttr.in" + response = requests.get(f"{endpoint}/{city}", timeout=2) + if response.ok: + return response.text + except Exception: + pass + # Fallback keeps the tool responsive even when offline. + return f"Weather data unavailable right now; assume clear skies in {city}." 
if __name__ == "__main__": diff --git a/examples/model_providers/litellm_provider.py b/examples/model_providers/litellm_provider.py index 4a1a696fcb..ea5f09ab32 100644 --- a/examples/model_providers/litellm_provider.py +++ b/examples/model_providers/litellm_provider.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os from agents import Agent, Runner, function_tool, set_tracing_disabled from agents.extensions.models.litellm_model import LitellmModel @@ -24,6 +25,9 @@ def get_weather(city: str): async def main(model: str, api_key: str): + if api_key == "dummy": + print("Skipping run because no valid LITELLM_API_KEY was provided.") + return agent = Agent( name="Assistant", instructions="You only respond in haikus.", @@ -36,7 +40,7 @@ async def main(model: str, api_key: str): if __name__ == "__main__": - # First try to get model/api key from args + # Prefer non-interactive defaults in auto mode to avoid blocking. import argparse parser = argparse.ArgumentParser() @@ -44,12 +48,12 @@ async def main(model: str, api_key: str): parser.add_argument("--api-key", type=str, required=False) args = parser.parse_args() - model = args.model - if not model: - model = input("Enter a model name for Litellm: ") + model = args.model or os.environ.get("LITELLM_MODEL", "openai/gpt-4o-mini") + api_key = args.api_key or os.environ.get("LITELLM_API_KEY", "dummy") - api_key = args.api_key - if not api_key: - api_key = input("Enter an API key for Litellm: ") + if not args.model: + print(f"Using default model: {model}") + if not args.api_key: + print("Using LITELLM_API_KEY from environment (or dummy placeholder).") asyncio.run(main(model, api_key)) diff --git a/examples/reasoning_content/main.py b/examples/reasoning_content/main.py index 7ccbab01b8..3db5d5cee6 100644 --- a/examples/reasoning_content/main.py +++ b/examples/reasoning_content/main.py @@ -20,7 +20,7 @@ from agents.models.interface import ModelTracing from agents.models.openai_provider import OpenAIProvider -MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5" +MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5.2" async def stream_with_reasoning_content(): diff --git a/examples/reasoning_content/runner_example.py b/examples/reasoning_content/runner_example.py index 3546da3502..e3c3d22506 100644 --- a/examples/reasoning_content/runner_example.py +++ b/examples/reasoning_content/runner_example.py @@ -17,7 +17,7 @@ from agents import Agent, ModelSettings, Runner, trace from agents.items import ReasoningItem -MODEL_NAME = os.getenv("EXAMPLE_MODEL_NAME") or "gpt-5.2" +MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5.2" async def main(): diff --git a/examples/research_bot/main.py b/examples/research_bot/main.py index a0fd43dca8..b70bc8e483 100644 --- a/examples/research_bot/main.py +++ b/examples/research_bot/main.py @@ -1,10 +1,15 @@ import asyncio +from examples.auto_mode import input_with_fallback + from .manager import ResearchManager async def main() -> None: - query = input("What would you like to research? ") + query = input_with_fallback( + "What would you like to research? ", + "Impact of electric vehicles on the grid.", + ) await ResearchManager().run(query) diff --git a/examples/run_examples.py b/examples/run_examples.py index 0d51a028f1..1b52cdec21 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -1,30 +1,26 @@ -"""Run multiple example entry points in this repository. 
- -This script locates Python files under ``examples/`` that contain a -``__main__`` guard and executes them one by one. By default it skips -interactive, server-like, audio-heavy, and external-service examples so -that automated validation does not hang waiting for input or require -hardware. Use flags to opt into those categories when you want to run -them. - -Usage examples: - - uv run examples/run_examples.py --dry-run - uv run examples/run_examples.py --filter basic - uv run examples/run_examples.py --include-interactive --include-server - -By default the script keeps running even if an example fails; use -``--fail-fast`` to stop on the first failure. +"""Run multiple example entry points with optional auto mode and logging. + +Features: +* Discovers ``__main__``-guarded example files under ``examples/``. +* Skips interactive/server/audio/external examples unless explicitly included. +* Auto mode (``EXAMPLES_INTERACTIVE_MODE=auto``) enables deterministic inputs, + auto-approvals, and turns on interactive examples by default. +* Writes per-example logs to ``.tmp/examples-start-logs`` and a main summary log. +* Generates a rerun list of failures at ``.tmp/examples-rerun.txt``. """ from __future__ import annotations import argparse +import datetime +import os import re import shlex import subprocess import sys +import threading from collections.abc import Iterable, Sequence +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from pathlib import Path @@ -32,6 +28,27 @@ EXAMPLES_DIR = ROOT_DIR / "examples" MAIN_PATTERN = re.compile(r"__name__\s*==\s*['\"]__main__['\"]") +LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" +RERUN_FILE_DEFAULT = ROOT_DIR / ".tmp" / "examples-rerun.txt" +DEFAULT_MAIN_LOG = LOG_DIR_DEFAULT / f"main_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.log" + +# Examples that are noisy, require extra credentials, or hang in auto runs. 
+DEFAULT_AUTO_SKIP = { + "examples/agent_patterns/llm_as_a_judge.py", + "examples/agent_patterns/routing.py", + "examples/customer_service/main.py", + "examples/hosted_mcp/connectors.py", + "examples/mcp/git_example/main.py", + "examples/model_providers/custom_example_agent.py", + "examples/model_providers/custom_example_global.py", + "examples/model_providers/custom_example_provider.py", + "examples/realtime/app/server.py", + "examples/realtime/cli/demo.py", + "examples/realtime/twilio/server.py", + "examples/voice/static/main.py", + "examples/voice/streamed/main.py", +} + @dataclass class ExampleScript: @@ -53,6 +70,15 @@ def command(self) -> list[str]: return ["uv", "run", "python", "-m", self.module] +@dataclass +class ExampleResult: + script: ExampleScript + status: str + reason: str = "" + log_path: Path | None = None + exit_code: int | None = None + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run example scripts sequentially.") parser.add_argument( @@ -95,6 +121,55 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Show detected tags for each example entry.", ) + parser.add_argument( + "--logs-dir", + default=str(LOG_DIR_DEFAULT), + help="Directory for per-example logs and main log.", + ) + parser.add_argument( + "--main-log", + default=str(DEFAULT_MAIN_LOG), + help="Path to write the main summary log.", + ) + parser.add_argument( + "--rerun-file", + help="Only run examples listed in this file (one relative path per line).", + ) + parser.add_argument( + "--write-rerun", + action="store_true", + help="Write failures to .tmp/examples-rerun.txt after the run.", + ) + parser.add_argument( + "--collect", + help="Parse a previous main log to emit a rerun list instead of running examples.", + ) + parser.add_argument( + "--output", + help="Output path for --collect rerun list (defaults to stdout).", + ) + parser.add_argument( + "--print-auto-skip", + action="store_true", + help="Show the current auto-skip list and exit.", + ) + parser.add_argument( + "--auto-mode", + action="store_true", + help="Force EXAMPLES_INTERACTIVE_MODE=auto for this run.", + ) + parser.add_argument( + "--jobs", + "-j", + type=int, + default=int(os.environ.get("EXAMPLES_JOBS", "4")), + help="Number of examples to run in parallel (default: 4). Use 1 to force serial execution.", + ) + parser.add_argument( + "--no-buffer-output", + action="store_true", + help="Stream each example's stdout directly (may interleave). 
By default output is buffered per example to reduce interleaving.", + ) return parser.parse_args() @@ -103,7 +178,7 @@ def detect_tags(path: Path, source: str) -> set[str]: lower_source = source.lower() lower_parts = [part.lower() for part in path.parts] - if re.search(r"\binput\s*\(", source): + if re.search(r"\binput\s*\(", source) or "input_with_fallback(" in lower_source: tags.add("interactive") if "prompt_toolkit" in lower_source or "questionary" in lower_source: tags.add("interactive") @@ -153,9 +228,17 @@ def discover_examples(filters: Iterable[str]) -> list[ExampleScript]: return sorted(examples, key=lambda item: item.relpath) -def should_skip(tags: set[str], allowed_overrides: set[str]) -> tuple[bool, set[str]]: +def should_skip( + tags: set[str], + allowed_overrides: set[str], + auto_skip_set: set[str], + relpath: str, + auto_mode: bool, +) -> tuple[bool, set[str]]: blocked = {"interactive", "server", "audio", "external"} - allowed_overrides active_blockers = tags & blocked + if auto_mode and relpath in auto_skip_set: + active_blockers = active_blockers | {"auto-skip"} return (len(active_blockers) > 0, active_blockers) @@ -163,60 +246,281 @@ def format_command(cmd: Sequence[str]) -> str: return shlex.join(cmd) +def env_flag(name: str) -> bool | None: + raw = os.environ.get(name) + if raw is None: + return None + return raw.strip().lower() in {"1", "true", "yes", "on"} + + +def load_auto_skip() -> set[str]: + env_value = os.environ.get("EXAMPLES_AUTO_SKIP", "") + if env_value.strip(): + parts = re.split(r"[\s,]+", env_value.strip()) + return {p for p in parts if p} + return set(DEFAULT_AUTO_SKIP) + + +def write_main_log_line(handle, line: str) -> None: + handle.write(line + "\n") + handle.flush() + + +def ensure_dirs(path: Path) -> None: + if path.suffix: + path.parent.mkdir(parents=True, exist_ok=True) + else: + path.mkdir(parents=True, exist_ok=True) + + +def parse_rerun_from_log(log_path: Path) -> list[str]: + if not log_path.exists(): + raise FileNotFoundError(log_path) + rerun: list[str] = [] + with log_path.open("r", encoding="utf-8") as handle: + for line in handle: + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + parts = stripped.split() + if len(parts) < 2: + continue + status, relpath = parts[0].upper(), parts[1] + if status in {"FAILED", "ERROR", "UNKNOWN"}: + rerun.append(relpath) + return rerun + + def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> int: overrides: set[str] = set() - if args.include_interactive: + if args.include_interactive or env_flag("EXAMPLES_INCLUDE_INTERACTIVE"): overrides.add("interactive") - if args.include_server: + if args.include_server or env_flag("EXAMPLES_INCLUDE_SERVER"): overrides.add("server") - if args.include_audio: + if args.include_audio or env_flag("EXAMPLES_INCLUDE_AUDIO"): overrides.add("audio") - if args.include_external: + if args.include_external or env_flag("EXAMPLES_INCLUDE_EXTERNAL"): overrides.add("external") + logs_dir = Path(args.logs_dir).resolve() + main_log_path = Path(args.main_log).resolve() + auto_mode = args.auto_mode or os.environ.get("EXAMPLES_INTERACTIVE_MODE", "").lower() == "auto" + auto_skip_set = load_auto_skip() + + if auto_mode and "interactive" not in overrides: + overrides.add("interactive") + + ensure_dirs(logs_dir) + ensure_dirs(main_log_path) + rerun_entries: list[str] = [] + if not examples: print("No example entry points found that match the filters.") return 0 + print(f"Interactive mode: {'auto' if auto_mode else 'prompt'}") 
print(f"Found {len(examples)} example entry points under examples/.") executed = 0 skipped = 0 failed = 0 + results: list[ExampleResult] = [] + + jobs = max(1, args.jobs) + if args.fail_fast and jobs > 1: + # Preserve fail-fast semantics by forcing serial execution. + jobs = 1 + + output_lock = threading.Lock() + main_log_lock = threading.Lock() + buffer_output = not args.no_buffer_output and os.environ.get( + "EXAMPLES_BUFFER_OUTPUT", "1" + ).lower() not in {"0", "false", "no", "off"} + + def safe_write_main(line: str) -> None: + with main_log_lock: + write_main_log_line(main_log, line) + + def run_single(example: ExampleScript) -> ExampleResult: + relpath = example.relpath + log_filename = f"{relpath.replace('/', '__')}.log" + log_path = logs_dir / log_filename + ensure_dirs(log_path) + + env = os.environ.copy() + if auto_mode: + env.setdefault("EXAMPLES_INTERACTIVE_MODE", "auto") + env.setdefault("APPLY_PATCH_AUTO_APPROVE", "1") + env.setdefault("SHELL_AUTO_APPROVE", "1") + env.setdefault("AUTO_APPROVE_MCP", "1") + + proc = subprocess.Popen( + example.command, + cwd=ROOT_DIR, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=env, + ) + assert proc.stdout is not None + buffer_lines: list[str] = [] + + with log_path.open("w", encoding="utf-8") as per_log: + for line in proc.stdout: + per_log.write(line) + if buffer_output: + buffer_lines.append(line) + else: + with output_lock: + sys.stdout.write(f"[{relpath}] {line}") + proc.wait() + exit_code = proc.returncode + + if buffer_output and buffer_lines: + with output_lock: + for line in buffer_lines: + sys.stdout.write(f"[{relpath}] {line}") + + if exit_code == 0: + safe_write_main(f"PASSED {relpath} exit=0 log={log_path.relative_to(ROOT_DIR)}") + return ExampleResult( + script=example, + status="passed", + log_path=log_path, + exit_code=exit_code, + ) + + info = f"exit={exit_code}" + with output_lock: + print(f" !! 
{relpath} exited with {exit_code}") + safe_write_main(f"FAILED {relpath} exit={exit_code} log={log_path.relative_to(ROOT_DIR)}") + return ExampleResult( + script=example, + status="failed", + reason=info, + log_path=log_path, + exit_code=exit_code, + ) + + with main_log_path.open("w", encoding="utf-8") as main_log: + safe_write_main(f"# run started {datetime.datetime.now().isoformat()}") + safe_write_main(f"# filters: {args.filter or '-'}") + safe_write_main(f"# include: {sorted(overrides)}") + safe_write_main(f"# auto_mode: {auto_mode}") + safe_write_main(f"# logs_dir: {logs_dir}") + safe_write_main(f"# jobs: {jobs}") + safe_write_main(f"# buffer_output: {buffer_output}") + + run_list: list[ExampleScript] = [] + + for example in examples: + relpath = example.relpath + skip, reasons = should_skip(example.tags, overrides, auto_skip_set, relpath, auto_mode) + tag_label = f" [tags: {', '.join(sorted(example.tags))}]" if args.verbose else "" + + if skip: + reason_label = f" (skipped: {', '.join(sorted(reasons))})" if reasons else "" + print(f"- SKIP {relpath}{tag_label}{reason_label}") + safe_write_main(f"SKIPPED {relpath} reasons={','.join(sorted(reasons))}") + skipped += 1 + results.append( + ExampleResult(script=example, status="skipped", reason=",".join(reasons)) + ) + continue + + print(f"- RUN {relpath}{tag_label}") + print(f" cmd: {format_command(example.command)}") + + if args.dry_run: + safe_write_main(f"DRYRUN {relpath}") + results.append(ExampleResult(script=example, status="dry-run")) + continue + + run_list.append(example) + + run_results: dict[str, ExampleResult] = {} + if run_list: + with ThreadPoolExecutor(max_workers=jobs) as executor: + future_map = {executor.submit(run_single, ex): ex for ex in run_list} + for future in as_completed(future_map): + result = future.result() + run_results[result.script.relpath] = result + + for ex in run_list: + result = run_results[ex.relpath] + results.append(result) + if result.status == "passed": + executed += 1 + elif result.status == "failed": + failed += 1 + rerun_entries.append(ex.relpath) + if args.fail_fast: + safe_write_main("# fail-fast stop") + break + + safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") + + if args.write_rerun and rerun_entries: + ensure_dirs(RERUN_FILE_DEFAULT) + RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") + print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") + + print(f"Main log: {main_log_path}") + print(f"Done. Ran {executed} example(s), skipped {skipped}, failed {failed}.") - for example in examples: - skip, reasons = should_skip(example.tags, overrides) - tag_label = f" [tags: {', '.join(sorted(example.tags))}]" if args.verbose else "" - - if skip: - reason_label = f" (skipped: {', '.join(sorted(reasons))})" if reasons else "" - print(f"- SKIP {example.relpath}{tag_label}{reason_label}") - skipped += 1 - continue - - print(f"- RUN {example.relpath}{tag_label}") - print(f" cmd: {format_command(example.command)}") - - if args.dry_run: - continue - - result = subprocess.run(example.command, cwd=ROOT_DIR) - if result.returncode != 0: - print(f" !! 
{example.relpath} exited with {result.returncode}") - failed += 1 - if args.fail_fast: - return result.returncode - continue - - executed += 1 + # Summary table + status_w = 9 + name_w = 44 + info_w = 32 + print("\nResults:") + print(f"{'status'.ljust(status_w)} {'example'.ljust(name_w)} {'info'.ljust(info_w)} log") + print(f"{'-' * status_w} {'-' * name_w} {'-' * info_w} ---") + for result in results: + info = result.reason or ("exit 0" if result.status == "passed" else "") + log_disp = ( + str(result.log_path.relative_to(ROOT_DIR)) + if result.log_path and result.log_path.exists() + else "-" + ) + print( + f"{result.status.ljust(status_w)} {result.script.relpath.ljust(name_w)} {info.ljust(info_w)} {log_disp}" + ) - print(f"Done. Ran {executed} example(s), skipped {skipped}, failed {failed}.") return 0 if failed == 0 else 1 def main() -> int: args = parse_args() + if args.print_auto_skip: + for entry in sorted(load_auto_skip()): + print(entry) + return 0 + + if args.collect: + paths = parse_rerun_from_log(Path(args.collect)) + if args.output: + out = Path(args.output) + ensure_dirs(out) + out.write_text("\n".join(paths) + "\n", encoding="utf-8") + print(f"Wrote {len(paths)} entries to {out}") + else: + for p in paths: + print(p) + return 0 + examples = discover_examples(args.filter) + if args.rerun_file: + rerun_set = { + line.strip() + for line in Path(args.rerun_file).read_text(encoding="utf-8").splitlines() + if line.strip() + } + examples = [ex for ex in examples if ex.relpath in rerun_set] + if not examples: + print("Rerun list is empty; nothing to do.") + return 0 + print(f"Rerun mode: {len(examples)} example(s) from {args.rerun_file}") + return run_examples(examples, args) diff --git a/examples/tools/apply_patch.py b/examples/tools/apply_patch.py index 19d0cfb7dc..d93c4f3fb8 100644 --- a/examples/tools/apply_patch.py +++ b/examples/tools/apply_patch.py @@ -162,7 +162,7 @@ async def main(auto_approve: bool, model: str) -> None: ) parser.add_argument( "--model", - default="gpt-5.1", + default="gpt-5.2", help="Model ID to use for the agent.", ) args = parser.parse_args() diff --git a/examples/tools/shell.py b/examples/tools/shell.py index 7dcb133095..37e815178a 100644 --- a/examples/tools/shell.py +++ b/examples/tools/shell.py @@ -108,7 +108,7 @@ async def main(prompt: str, model: str) -> None: ) parser.add_argument( "--model", - default="gpt-5.1", + default="gpt-5.2", ) args = parser.parse_args() asyncio.run(main(args.prompt, args.model)) From 368fdbd1b9a287957ba1c64e09f71124cc777a64 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:39:02 +0900 Subject: [PATCH 02/15] fix --- examples/mcp/prompt_server/main.py | 5 +++-- examples/mcp/streamablehttp_custom_client_example/main.py | 5 +++-- examples/mcp/streamablehttp_example/main.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/mcp/prompt_server/main.py b/examples/mcp/prompt_server/main.py index 543ec273c4..3cd045e63b 100644 --- a/examples/mcp/prompt_server/main.py +++ b/examples/mcp/prompt_server/main.py @@ -4,7 +4,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast from agents import Agent, Runner, gen_trace_id, trace from agents.mcp import MCPServer, MCPServerStreamableHttp @@ -19,7 +19,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] 
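+
+# NOTE: _choose_port binds to port 0 so the OS picks a free ephemeral port;
+# getsockname() then reports which port was actually assigned.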
STREAMABLE_HTTP_PORT = _choose_port() diff --git a/examples/mcp/streamablehttp_custom_client_example/main.py b/examples/mcp/streamablehttp_custom_client_example/main.py index 9c45812009..20cbef1cdc 100644 --- a/examples/mcp/streamablehttp_custom_client_example/main.py +++ b/examples/mcp/streamablehttp_custom_client_example/main.py @@ -10,7 +10,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast import httpx @@ -27,7 +27,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] STREAMABLE_HTTP_PORT = _choose_port() diff --git a/examples/mcp/streamablehttp_example/main.py b/examples/mcp/streamablehttp_example/main.py index fd1140f98e..564a7bf98f 100644 --- a/examples/mcp/streamablehttp_example/main.py +++ b/examples/mcp/streamablehttp_example/main.py @@ -4,7 +4,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast from agents import Agent, Runner, gen_trace_id, trace from agents.mcp import MCPServer, MCPServerStreamableHttp @@ -19,7 +19,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] STREAMABLE_HTTP_PORT = _choose_port() From 18f24a28cd3174338499bd6f7841b120cb81d5fb Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:41:27 +0900 Subject: [PATCH 03/15] fix --- .codex/skills/examples-auto-run/scripts/run.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index d36270c915..28d502d60c 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -23,15 +23,16 @@ cmd_start() { shift fi - local ts log_file + local ts main_log stdout_log ts="$(date +%Y%m%d-%H%M%S)" - log_file="$LOG_DIR/main_${ts}.log" + main_log="$LOG_DIR/main_${ts}.log" + stdout_log="$LOG_DIR/stdout_${ts}.log" local run_cmd=( uv run examples/run_examples.py --auto-mode --write-rerun - --main-log "$log_file" + --main-log "$main_log" --logs-dir "$LOG_DIR" ) @@ -55,12 +56,13 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" - "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" >/dev/null + "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" >/dev/null ) & local pid=$! 
     echo "$pid" >"$PID_FILE"
     echo "Started run_examples.py (pid=$pid)"
-    echo "Main log: $log_file"
+    echo "Main log: $main_log"
+    echo "Stdout log: $stdout_log"
     return 0
   fi
@@ -73,7 +75,7 @@ cmd_start() {
   export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}"
   export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}"
   cd "$ROOT"
-  "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file"
+  "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log"
 }
 
 cmd_stop() {

From 7a4fa62677ac6fb8ddcee09364b91a051b828b51 Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera
Date: Tue, 13 Jan 2026 21:52:19 +0900
Subject: [PATCH 04/15] fix

---
 .codex/skills/examples-auto-run/SKILL.md      |  13 +
 .../skills/examples-auto-run/scripts/run.sh   |  48 ++-
 examples/behavioral_validation.py             | 285 ++++++++++++++++++
 examples/run_examples.py                      |  12 -
 4 files changed, 345 insertions(+), 13 deletions(-)
 create mode 100644 examples/behavioral_validation.py

diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md
index 4e3abc7545..9d554b426c 100644
--- a/.codex/skills/examples-auto-run/SKILL.md
+++ b/.codex/skills/examples-auto-run/SKILL.md
@@ -58,9 +58,22 @@ description: Run python examples in auto mode with logging, rerun helpers, and b
 - Main logs: `.tmp/examples-start-logs/main_*.log`
 - Per-example logs (from `run_examples.py`): `.tmp/examples-start-logs/<relpath>.log`
 - Rerun list: `.tmp/examples-rerun.txt`
+- Stdout logs: `.tmp/examples-start-logs/stdout_*.log`
 
 ## Notes
 
 - The runner delegates to `uv run examples/run_examples.py`, which already writes per-example logs and supports `--collect`, `--rerun-file`, and `--print-auto-skip`.
 - `start` uses `--write-rerun` so failures are captured automatically.
 - If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default.
+
+## Behavioral validation
+
+- After every foreground `start` or `rerun`, the script automatically runs `uv run examples/behavioral_validation.py` against the generated main log.
+- The validator:
+  1. Reads the example source to derive expected messages (print strings and prompt/message assignments).
+  2. Reads each passed example’s log and checks that those messages appeared.
+  3. Reports per-example status with the full matching log lines; missing expectations are flagged.
+- Background runs do not validate automatically; after they finish, run:
+  ```bash
+  .codex/skills/examples-auto-run/scripts/run.sh validate
+  ```
diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh
index 28d502d60c..4963c42ac5 100755
--- a/.codex/skills/examples-auto-run/scripts/run.sh
+++ b/.codex/skills/examples-auto-run/scripts/run.sh
@@ -15,6 +15,19 @@ is_running() {
   [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1
 }
 
+run_validation() {
+  local main_log="$1"
+  if [[ -z "$main_log" ]]; then
+    echo "Validation skipped: main log path is empty."
+    return 0
+  fi
+  if [[ ! -f "$main_log" ]]; then
+    echo "Validation skipped: main log not found: $main_log"
+    return 0
+  fi
+  uv run examples/behavioral_validation.py --main-log "$main_log" --logs-dir "$LOG_DIR" || true
+}
+
 cmd_start() {
   ensure_dirs
   local background=0
@@ -63,6 +76,7 @@ cmd_start() {
     echo "Started run_examples.py (pid=$pid)"
     echo "Main log: $main_log"
     echo "Stdout log: $stdout_log"
+    echo "Run '.codex/skills/examples-auto-run/scripts/run.sh validate \"$main_log\"' after it finishes."
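+    # Auto-validation is skipped for background runs; the validate
+    # subcommand below replays run_validation once the job finishes.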
return 0 fi @@ -75,7 +89,12 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" + set +e "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" + local run_status=${PIPESTATUS[0]} + set -e + run_validation "$main_log" + return "$run_status" } cmd_stop() { @@ -157,12 +176,37 @@ cmd_rerun() { echo "Rerun list is empty: $file" exit 0 fi + local ts main_log stdout_log + ts="$(date +%Y%m%d-%H%M%S)" + main_log="$LOG_DIR/main_${ts}.log" + stdout_log="$LOG_DIR/stdout_${ts}.log" cd "$ROOT" export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" - uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun + set +e + uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun --main-log "$main_log" --logs-dir "$LOG_DIR" 2>&1 | tee "$stdout_log" + local run_status=${PIPESTATUS[0]} + set -e + run_validation "$main_log" + return "$run_status" +} + +cmd_validate() { + ensure_dirs + local main_log="${1:-}" + if [[ -z "$main_log" ]]; then + main_log="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" + fi + if [[ -z "$main_log" ]]; then + echo "No main log found." + exit 1 + fi + if [[ "$main_log" != /* && -f "$LOG_DIR/$main_log" ]]; then + main_log="$LOG_DIR/$main_log" + fi + run_validation "$main_log" } usage() { @@ -177,6 +221,7 @@ Commands: tail [logfile] Tail the latest (or specified) log. collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. + validate [main_log] Run behavioral validation against the latest (or given) main log. Environment overrides: EXAMPLES_INTERACTIVE_MODE (default auto) @@ -198,5 +243,6 @@ case "${1:-$default_cmd}" in tail) shift; cmd_tail "${1:-}" ;; collect) shift || true; collect_rerun "${1:-}" ;; rerun) shift || true; cmd_rerun "${1:-}" ;; + validate) shift || true; cmd_validate "${1:-}" ;; *) usage; exit 1 ;; esac diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py new file mode 100644 index 0000000000..db94150cdb --- /dev/null +++ b/examples/behavioral_validation.py @@ -0,0 +1,285 @@ +"""Lightweight behavioral validation for example runs. + +Reads a main log emitted by `examples/run_examples.py`, inspects the source +files for each passed example to derive expected messages, and checks that the +per-example logs contain those messages. The goal is to provide quick evidence +that the observed behavior matches the intended flow without re-running code. 
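+
+Typical invocation (the log path is illustrative):
+
+    uv run examples/behavioral_validation.py --main-log \
+        .tmp/examples-start-logs/main_20260113-123000.log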
+""" + +from __future__ import annotations + +import argparse +import ast +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, Sequence + +ROOT_DIR = Path(__file__).resolve().parent.parent +LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" + +ENTRY_RE = re.compile(r"^(PASSED|FAILED|SKIPPED|DRYRUN)\s+(\S+)(?:.*log=([^\s]+))?") + + +@dataclass +class MainEntry: + status: str + relpath: str + log_path: Path | None + + +@dataclass +class ValidationHit: + expectation: str + lines: list[str] + + +@dataclass +class ValidationResult: + relpath: str + log_path: Path | None + status: str # ok, warn, fail + hits: list[ValidationHit] + missing: list[str] + notes: list[str] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate example behavior from logs.") + parser.add_argument( + "--main-log", + help="Path to the main log (defaults to latest main_*.log in logs dir).", + ) + parser.add_argument( + "--logs-dir", + default=str(LOG_DIR_DEFAULT), + help="Directory containing main and per-example logs.", + ) + parser.add_argument( + "--limit", + type=int, + default=5, + help="Maximum expectations to check per example (to keep output readable).", + ) + return parser.parse_args() + + +def find_latest_main_log(log_dir: Path) -> Path | None: + candidates = sorted(log_dir.glob("main_*.log"), key=lambda p: p.stat().st_mtime, reverse=True) + return candidates[0] if candidates else None + + +def parse_main_log(path: Path) -> list[MainEntry]: + entries: list[MainEntry] = [] + for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = raw_line.strip() + match = ENTRY_RE.match(line) + if not match: + continue + status, relpath, log_rel = match.groups() + log_path = ROOT_DIR / log_rel if log_rel else None + entries.append(MainEntry(status=status, relpath=relpath, log_path=log_path)) + return entries + + +def clean_text(value: str) -> str: + return " ".join(value.split()) + + +def _extract_from_print_calls(tree: ast.AST) -> list[str]: + texts: list[str] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": + for arg in node.args: + if isinstance(arg, ast.Constant) and isinstance(arg.value, str): + texts.append(arg.value) + return texts + + +def _extract_from_assignments(tree: ast.AST) -> list[str]: + texts: list[str] = [] + target_keywords = {"message", "prompt", "question", "instruction", "text"} + for node in ast.walk(tree): + if not isinstance(node, ast.Assign): + continue + if not isinstance(node.value, ast.Constant) or not isinstance(node.value.value, str): + continue + for target in node.targets: + if isinstance(target, ast.Name) and any( + key in target.id.lower() for key in target_keywords + ): + texts.append(node.value.value) + return texts + + +def derive_expectations(source: str, limit: int) -> list[str]: + try: + tree = ast.parse(source) + except SyntaxError: + return [] + + texts: list[str] = [] + texts.extend(_extract_from_print_calls(tree)) + texts.extend(_extract_from_assignments(tree)) + + cleaned: list[str] = [] + for text in texts: + normalized = clean_text(text) + if 8 <= len(normalized) <= 200: + cleaned.append(normalized) + + # Preserve order while removing duplicates. 
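+    # list(dict.fromkeys(cleaned)) would produce the same ordered,
+    # de-duplicated result; the explicit loop just keeps the intent obvious.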
+ seen = set() + ordered = [] + for item in cleaned: + if item not in seen: + seen.add(item) + ordered.append(item) + + return ordered[:limit] + + +def find_lines_with_snippet(lines: Sequence[str], snippet: str) -> list[str]: + hits: list[str] = [] + for line in lines: + if snippet in line: + hits.append(line.rstrip("\n")) + return hits + + +def validate_example(entry: MainEntry, limit: int) -> ValidationResult: + log_path = entry.log_path + notes: list[str] = [] + if log_path is None or not log_path.exists(): + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=["Log file not found."], + ) + + source_path = ROOT_DIR / entry.relpath + if not source_path.exists(): + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=["Source file not found."], + ) + + try: + source_text = source_path.read_text(encoding="utf-8") + except OSError as exc: + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=[f"Could not read source: {exc}"], + ) + + expectations = derive_expectations(source_text, limit=limit) + + try: + log_lines = log_path.read_text(encoding="utf-8", errors="replace").splitlines() + except OSError as exc: + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=[f"Could not read log: {exc}"], + ) + + if not expectations: + notes.append("No expectations derived from source (skip validation heuristics).") + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="warn", + hits=[], + missing=[], + notes=notes, + ) + + hits: list[ValidationHit] = [] + missing: list[str] = [] + + for expectation in expectations: + lines = find_lines_with_snippet(log_lines, expectation) + if lines: + hits.append(ValidationHit(expectation=expectation, lines=lines)) + else: + missing.append(expectation) + + if hits: + status = "ok" if not missing else "warn" + else: + status = "warn" + notes.append("No expected messages observed in log.") + + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status=status, + hits=hits, + missing=missing, + notes=notes, + ) + + +def format_result(result: ValidationResult) -> list[str]: + lines: list[str] = [] + header = f"{result.status.upper():<4} {result.relpath}" + lines.append(header) + if result.log_path: + lines.append(f" log: {result.log_path}") + for hit in result.hits: + for line in hit.lines: + lines.append(f" hit: {line}") + for miss in result.missing: + lines.append(f" missing: {miss}") + for note in result.notes: + lines.append(f" note: {note}") + return lines + + +def main() -> int: + args = parse_args() + log_dir = Path(args.logs_dir) + main_log = Path(args.main_log) if args.main_log else find_latest_main_log(log_dir) + + if main_log is None: + print(f"No main log found under {log_dir}") + return 1 + if not main_log.exists(): + print(f"Main log does not exist: {main_log}") + return 1 + + entries = parse_main_log(main_log) + passed = [e for e in entries if e.status == "PASSED"] + + print(f"Behavioral validation for {main_log} ({len(passed)} passed entries)") + + if not passed: + print("No passed entries to validate.") + return 0 + + results = [validate_example(entry, limit=args.limit) for entry in passed] + + for result in results: + for line in format_result(result): + print(line) + + failures = sum(1 for r in results if r.status == "fail") + return 1 
if failures else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/run_examples.py b/examples/run_examples.py index 1b52cdec21..a3a3498185 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -111,11 +111,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include examples that rely on extra services like Redis, Dapr, Twilio, or Playwright.", ) - parser.add_argument( - "--fail-fast", - action="store_true", - help="Stop after the first failing example.", - ) parser.add_argument( "--verbose", action="store_true", @@ -327,9 +322,6 @@ def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> results: list[ExampleResult] = [] jobs = max(1, args.jobs) - if args.fail_fast and jobs > 1: - # Preserve fail-fast semantics by forcing serial execution. - jobs = 1 output_lock = threading.Lock() main_log_lock = threading.Lock() @@ -454,10 +446,6 @@ def run_single(example: ExampleScript) -> ExampleResult: elif result.status == "failed": failed += 1 rerun_entries.append(ex.relpath) - if args.fail_fast: - safe_write_main("# fail-fast stop") - break - safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") if args.write_rerun and rerun_entries: From ef1af0fa7909b8ca7ce4cde393633ed579c5aa5a Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:53:51 +0900 Subject: [PATCH 05/15] fix --- .codex/skills/examples-auto-run/scripts/run.sh | 2 +- examples/behavioral_validation.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index 4963c42ac5..c861b55424 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -69,7 +69,7 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" - "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" >/dev/null + exec "${run_cmd[@]}" "$@" > >(tee "$stdout_log") 2>&1 ) & local pid=$! 
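+    # exec replaces the subshell, so the pid recorded below refers to the
+    # runner itself while tee still mirrors its output into the stdout log.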
echo "$pid" >"$PID_FILE" diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py index db94150cdb..154f87d9aa 100644 --- a/examples/behavioral_validation.py +++ b/examples/behavioral_validation.py @@ -11,9 +11,9 @@ import argparse import ast import re +from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path -from typing import Iterable, Sequence ROOT_DIR = Path(__file__).resolve().parent.parent LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" @@ -89,7 +89,11 @@ def clean_text(value: str) -> str: def _extract_from_print_calls(tree: ast.AST) -> list[str]: texts: list[str] = [] for node in ast.walk(tree): - if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "print" + ): for arg in node.args: if isinstance(arg, ast.Constant) and isinstance(arg.value, str): texts.append(arg.value) From ba42259da9775c805a8c889ff5d36b54d470fd13 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:00:26 +0900 Subject: [PATCH 06/15] fix --- examples/run_examples.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index a3a3498185..fcc13f8e65 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -241,6 +241,13 @@ def format_command(cmd: Sequence[str]) -> str: return shlex.join(cmd) +def display_path(path: Path) -> str: + try: + return str(path.relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + def env_flag(name: str) -> bool | None: raw = os.environ.get(name) if raw is None: @@ -374,7 +381,7 @@ def run_single(example: ExampleScript) -> ExampleResult: sys.stdout.write(f"[{relpath}] {line}") if exit_code == 0: - safe_write_main(f"PASSED {relpath} exit=0 log={log_path.relative_to(ROOT_DIR)}") + safe_write_main(f"PASSED {relpath} exit=0 log={display_path(log_path)}") return ExampleResult( script=example, status="passed", @@ -385,7 +392,7 @@ def run_single(example: ExampleScript) -> ExampleResult: info = f"exit={exit_code}" with output_lock: print(f" !! 
{relpath} exited with {exit_code}") - safe_write_main(f"FAILED {relpath} exit={exit_code} log={log_path.relative_to(ROOT_DIR)}") + safe_write_main(f"FAILED {relpath} exit={exit_code} log={display_path(log_path)}") return ExampleResult( script=example, status="failed", @@ -466,9 +473,7 @@ def run_single(example: ExampleScript) -> ExampleResult: for result in results: info = result.reason or ("exit 0" if result.status == "passed" else "") log_disp = ( - str(result.log_path.relative_to(ROOT_DIR)) - if result.log_path and result.log_path.exists() - else "-" + display_path(result.log_path) if result.log_path and result.log_path.exists() else "-" ) print( f"{result.status.ljust(status_w)} {result.script.relpath.ljust(name_w)} {info.ljust(info_w)} {log_disp}" From 285f391ac8e4baed9aa5537239c0c881be8131b9 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:09:39 +0900 Subject: [PATCH 07/15] fix --- .codex/skills/examples-auto-run/SKILL.md | 25 +- .../skills/examples-auto-run/scripts/run.sh | 33 -- examples/behavioral_validation.py | 289 ------------------ 3 files changed, 14 insertions(+), 333 deletions(-) delete mode 100644 examples/behavioral_validation.py diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md index 9d554b426c..9fdab22469 100644 --- a/.codex/skills/examples-auto-run/SKILL.md +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -66,14 +66,17 @@ description: Run python examples in auto mode with logging, rerun helpers, and b - `start` uses `--write-rerun` so failures are captured automatically. - If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default. -## Behavioral validation - -- After every foreground `start` or `rerun`, the script automatically runs `uv run examples/behavioral_validation.py` against the generated main log. -- The validator: - 1. Reads the example source to derive expected messages (print strings and prompt/message assignments). - 2. Reads each passed example’s log and checks those messages appeared. - 3. Reports per-example status with the full matching log lines; missing expectations are flagged. -- Background runs do not validate automatically; after they finish, run: - ```bash - .codex/skills/examples-auto-run/scripts/run.sh validate - ``` +## Behavioral validation (Codex/LLM responsibility) + +The runner no longer auto-runs `examples/behavioral_validation.py`. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: + +1. Read the example source (and comments) to infer intended flow, tools used, and expected key outputs. +2. Open the matching per-example log under `.tmp/examples-start-logs/`. +3. Confirm the intended actions/results occurred; flag omissions or divergences. +4. Do this for **all passed examples**, not just a sample. +5. Report immediately after the run with concise citations to the exact log lines that justify the validation. 
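+
+For example, a quick spot check for one passed entry might look like this
+(the log name and the expected string are illustrative):
+
+```bash
+# Hypothetical check: confirm the routing example's expected output reached its log.
+grep -n "Handing off" .tmp/examples-start-logs/examples__agent_patterns__routing.py.log
+```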
+ +If you still want the heuristic validator, you can run it manually: +```bash +.codex/skills/examples-auto-run/scripts/run.sh validate +``` diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index c861b55424..74258f8cac 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -15,19 +15,6 @@ is_running() { [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1 } -run_validation() { - local main_log="$1" - if [[ -z "$main_log" ]]; then - echo "Validation skipped: main log path is empty." - return 0 - fi - if [[ ! -f "$main_log" ]]; then - echo "Validation skipped: main log not found: $main_log" - return 0 - fi - uv run examples/behavioral_validation.py --main-log "$main_log" --logs-dir "$LOG_DIR" || true -} - cmd_start() { ensure_dirs local background=0 @@ -93,7 +80,6 @@ cmd_start() { "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" local run_status=${PIPESTATUS[0]} set -e - run_validation "$main_log" return "$run_status" } @@ -189,26 +175,9 @@ cmd_rerun() { uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun --main-log "$main_log" --logs-dir "$LOG_DIR" 2>&1 | tee "$stdout_log" local run_status=${PIPESTATUS[0]} set -e - run_validation "$main_log" return "$run_status" } -cmd_validate() { - ensure_dirs - local main_log="${1:-}" - if [[ -z "$main_log" ]]; then - main_log="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" - fi - if [[ -z "$main_log" ]]; then - echo "No main log found." - exit 1 - fi - if [[ "$main_log" != /* && -f "$LOG_DIR/$main_log" ]]; then - main_log="$LOG_DIR/$main_log" - fi - run_validation "$main_log" -} - usage() { cat <<'EOF' Usage: run.sh [args...] @@ -221,7 +190,6 @@ Commands: tail [logfile] Tail the latest (or specified) log. collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. - validate [main_log] Run behavioral validation against the latest (or given) main log. Environment overrides: EXAMPLES_INTERACTIVE_MODE (default auto) @@ -243,6 +211,5 @@ case "${1:-$default_cmd}" in tail) shift; cmd_tail "${1:-}" ;; collect) shift || true; collect_rerun "${1:-}" ;; rerun) shift || true; cmd_rerun "${1:-}" ;; - validate) shift || true; cmd_validate "${1:-}" ;; *) usage; exit 1 ;; esac diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py deleted file mode 100644 index 154f87d9aa..0000000000 --- a/examples/behavioral_validation.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Lightweight behavioral validation for example runs. - -Reads a main log emitted by `examples/run_examples.py`, inspects the source -files for each passed example to derive expected messages, and checks that the -per-example logs contain those messages. The goal is to provide quick evidence -that the observed behavior matches the intended flow without re-running code. 
-""" - -from __future__ import annotations - -import argparse -import ast -import re -from collections.abc import Sequence -from dataclasses import dataclass -from pathlib import Path - -ROOT_DIR = Path(__file__).resolve().parent.parent -LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" - -ENTRY_RE = re.compile(r"^(PASSED|FAILED|SKIPPED|DRYRUN)\s+(\S+)(?:.*log=([^\s]+))?") - - -@dataclass -class MainEntry: - status: str - relpath: str - log_path: Path | None - - -@dataclass -class ValidationHit: - expectation: str - lines: list[str] - - -@dataclass -class ValidationResult: - relpath: str - log_path: Path | None - status: str # ok, warn, fail - hits: list[ValidationHit] - missing: list[str] - notes: list[str] - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Validate example behavior from logs.") - parser.add_argument( - "--main-log", - help="Path to the main log (defaults to latest main_*.log in logs dir).", - ) - parser.add_argument( - "--logs-dir", - default=str(LOG_DIR_DEFAULT), - help="Directory containing main and per-example logs.", - ) - parser.add_argument( - "--limit", - type=int, - default=5, - help="Maximum expectations to check per example (to keep output readable).", - ) - return parser.parse_args() - - -def find_latest_main_log(log_dir: Path) -> Path | None: - candidates = sorted(log_dir.glob("main_*.log"), key=lambda p: p.stat().st_mtime, reverse=True) - return candidates[0] if candidates else None - - -def parse_main_log(path: Path) -> list[MainEntry]: - entries: list[MainEntry] = [] - for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): - line = raw_line.strip() - match = ENTRY_RE.match(line) - if not match: - continue - status, relpath, log_rel = match.groups() - log_path = ROOT_DIR / log_rel if log_rel else None - entries.append(MainEntry(status=status, relpath=relpath, log_path=log_path)) - return entries - - -def clean_text(value: str) -> str: - return " ".join(value.split()) - - -def _extract_from_print_calls(tree: ast.AST) -> list[str]: - texts: list[str] = [] - for node in ast.walk(tree): - if ( - isinstance(node, ast.Call) - and isinstance(node.func, ast.Name) - and node.func.id == "print" - ): - for arg in node.args: - if isinstance(arg, ast.Constant) and isinstance(arg.value, str): - texts.append(arg.value) - return texts - - -def _extract_from_assignments(tree: ast.AST) -> list[str]: - texts: list[str] = [] - target_keywords = {"message", "prompt", "question", "instruction", "text"} - for node in ast.walk(tree): - if not isinstance(node, ast.Assign): - continue - if not isinstance(node.value, ast.Constant) or not isinstance(node.value.value, str): - continue - for target in node.targets: - if isinstance(target, ast.Name) and any( - key in target.id.lower() for key in target_keywords - ): - texts.append(node.value.value) - return texts - - -def derive_expectations(source: str, limit: int) -> list[str]: - try: - tree = ast.parse(source) - except SyntaxError: - return [] - - texts: list[str] = [] - texts.extend(_extract_from_print_calls(tree)) - texts.extend(_extract_from_assignments(tree)) - - cleaned: list[str] = [] - for text in texts: - normalized = clean_text(text) - if 8 <= len(normalized) <= 200: - cleaned.append(normalized) - - # Preserve order while removing duplicates. 
- seen = set() - ordered = [] - for item in cleaned: - if item not in seen: - seen.add(item) - ordered.append(item) - - return ordered[:limit] - - -def find_lines_with_snippet(lines: Sequence[str], snippet: str) -> list[str]: - hits: list[str] = [] - for line in lines: - if snippet in line: - hits.append(line.rstrip("\n")) - return hits - - -def validate_example(entry: MainEntry, limit: int) -> ValidationResult: - log_path = entry.log_path - notes: list[str] = [] - if log_path is None or not log_path.exists(): - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=["Log file not found."], - ) - - source_path = ROOT_DIR / entry.relpath - if not source_path.exists(): - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=["Source file not found."], - ) - - try: - source_text = source_path.read_text(encoding="utf-8") - except OSError as exc: - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=[f"Could not read source: {exc}"], - ) - - expectations = derive_expectations(source_text, limit=limit) - - try: - log_lines = log_path.read_text(encoding="utf-8", errors="replace").splitlines() - except OSError as exc: - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=[f"Could not read log: {exc}"], - ) - - if not expectations: - notes.append("No expectations derived from source (skip validation heuristics).") - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="warn", - hits=[], - missing=[], - notes=notes, - ) - - hits: list[ValidationHit] = [] - missing: list[str] = [] - - for expectation in expectations: - lines = find_lines_with_snippet(log_lines, expectation) - if lines: - hits.append(ValidationHit(expectation=expectation, lines=lines)) - else: - missing.append(expectation) - - if hits: - status = "ok" if not missing else "warn" - else: - status = "warn" - notes.append("No expected messages observed in log.") - - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status=status, - hits=hits, - missing=missing, - notes=notes, - ) - - -def format_result(result: ValidationResult) -> list[str]: - lines: list[str] = [] - header = f"{result.status.upper():<4} {result.relpath}" - lines.append(header) - if result.log_path: - lines.append(f" log: {result.log_path}") - for hit in result.hits: - for line in hit.lines: - lines.append(f" hit: {line}") - for miss in result.missing: - lines.append(f" missing: {miss}") - for note in result.notes: - lines.append(f" note: {note}") - return lines - - -def main() -> int: - args = parse_args() - log_dir = Path(args.logs_dir) - main_log = Path(args.main_log) if args.main_log else find_latest_main_log(log_dir) - - if main_log is None: - print(f"No main log found under {log_dir}") - return 1 - if not main_log.exists(): - print(f"Main log does not exist: {main_log}") - return 1 - - entries = parse_main_log(main_log) - passed = [e for e in entries if e.status == "PASSED"] - - print(f"Behavioral validation for {main_log} ({len(passed)} passed entries)") - - if not passed: - print("No passed entries to validate.") - return 0 - - results = [validate_example(entry, limit=args.limit) for entry in passed] - - for result in results: - for line in format_result(result): - print(line) - - failures = sum(1 for r in results if r.status == "fail") - return 1 
if failures else 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From bd9dd6fcc332315f24e9a5344be9195174bc60a6 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:15:41 +0900 Subject: [PATCH 08/15] fix --- .codex/skills/examples-auto-run/SKILL.md | 7 +------ examples/run_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md index 9fdab22469..cca492b6a8 100644 --- a/.codex/skills/examples-auto-run/SKILL.md +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -68,15 +68,10 @@ description: Run python examples in auto mode with logging, rerun helpers, and b ## Behavioral validation (Codex/LLM responsibility) -The runner no longer auto-runs `examples/behavioral_validation.py`. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: +The runner does not perform any automated behavioral validation. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: 1. Read the example source (and comments) to infer intended flow, tools used, and expected key outputs. 2. Open the matching per-example log under `.tmp/examples-start-logs/`. 3. Confirm the intended actions/results occurred; flag omissions or divergences. 4. Do this for **all passed examples**, not just a sample. 5. Report immediately after the run with concise citations to the exact log lines that justify the validation. - -If you still want the heuristic validator, you can run it manually: -```bash -.codex/skills/examples-auto-run/scripts/run.sh validate -``` diff --git a/examples/run_examples.py b/examples/run_examples.py index fcc13f8e65..e828aacdd2 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -348,8 +348,8 @@ def run_single(example: ExampleScript) -> ExampleResult: env = os.environ.copy() if auto_mode: - env.setdefault("EXAMPLES_INTERACTIVE_MODE", "auto") - env.setdefault("APPLY_PATCH_AUTO_APPROVE", "1") + env["EXAMPLES_INTERACTIVE_MODE"] = "auto" + env["APPLY_PATCH_AUTO_APPROVE"] = "1" env.setdefault("SHELL_AUTO_APPROVE", "1") env.setdefault("AUTO_APPROVE_MCP", "1") From 2297757e4c5a69315ffbcd65cdb6b9c662e638b7 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:22:50 +0900 Subject: [PATCH 09/15] fix --- examples/run_examples.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index e828aacdd2..a15c9f32f7 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -268,11 +268,18 @@ def write_main_log_line(handle, line: str) -> None: handle.flush() -def ensure_dirs(path: Path) -> None: - if path.suffix: - path.parent.mkdir(parents=True, exist_ok=True) - else: - path.mkdir(parents=True, exist_ok=True) +def ensure_dirs(path: Path, is_file: bool | None = None) -> None: + """Create directories for a file or directory path. + + If `is_file` is True, always create the parent directory. If False, create the + directory itself. When None, treat paths with a suffix as files and others as + directories, but suffix-less file names should pass is_file=True to avoid + accidental directory creation. 
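+
+    For example, a suffix-less target such as Path(".tmp/rerun") would be
+    treated as a directory here unless the caller passes is_file=True.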
+ """ + if is_file is None: + is_file = bool(path.suffix) + target = path.parent if is_file else path + target.mkdir(parents=True, exist_ok=True) def parse_rerun_from_log(log_path: Path) -> list[str]: @@ -312,8 +319,8 @@ def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> if auto_mode and "interactive" not in overrides: overrides.add("interactive") - ensure_dirs(logs_dir) - ensure_dirs(main_log_path) + ensure_dirs(logs_dir, is_file=False) + ensure_dirs(main_log_path, is_file=True) rerun_entries: list[str] = [] if not examples: @@ -344,7 +351,7 @@ def run_single(example: ExampleScript) -> ExampleResult: relpath = example.relpath log_filename = f"{relpath.replace('/', '__')}.log" log_path = logs_dir / log_filename - ensure_dirs(log_path) + ensure_dirs(log_path, is_file=True) env = os.environ.copy() if auto_mode: @@ -456,7 +463,7 @@ def run_single(example: ExampleScript) -> ExampleResult: safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") if args.write_rerun and rerun_entries: - ensure_dirs(RERUN_FILE_DEFAULT) + ensure_dirs(RERUN_FILE_DEFAULT, is_file=True) RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") @@ -493,7 +500,7 @@ def main() -> int: paths = parse_rerun_from_log(Path(args.collect)) if args.output: out = Path(args.output) - ensure_dirs(out) + ensure_dirs(out, is_file=True) out.write_text("\n".join(paths) + "\n", encoding="utf-8") print(f"Wrote {len(paths)} entries to {out}") else: From 2d8b516962c4a12223999f293921dc5b5d21447b Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:29:20 +0900 Subject: [PATCH 10/15] fix --- examples/run_examples.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index a15c9f32f7..168b8245a3 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -462,9 +462,13 @@ def run_single(example: ExampleScript) -> ExampleResult: rerun_entries.append(ex.relpath) safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") - if args.write_rerun and rerun_entries: + if args.write_rerun: ensure_dirs(RERUN_FILE_DEFAULT, is_file=True) - RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") + if rerun_entries: + contents = "\n".join(rerun_entries) + "\n" + else: + contents = "" + RERUN_FILE_DEFAULT.write_text(contents, encoding="utf-8") print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") print(f"Main log: {main_log_path}") From 57b405e9c84c70fc03c985231027008ae6bcb9b3 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:31:59 +0900 Subject: [PATCH 11/15] fix --- examples/run_examples.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 168b8245a3..5b4285d598 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -369,20 +369,36 @@ def run_single(example: ExampleScript) -> ExampleResult: env=env, ) assert proc.stdout is not None + force_prompt_stream = (not auto_mode) and ("interactive" in example.tags) + buffer_output_local = buffer_output and not force_prompt_stream buffer_lines: list[str] = [] with log_path.open("w", encoding="utf-8") as per_log: - for line in proc.stdout: - per_log.write(line) - if buffer_output: - buffer_lines.append(line) - else: + if force_prompt_stream: + at_line_start = True + while True: + char = 
proc.stdout.read(1) + if char == "": + break + per_log.write(char) with output_lock: - sys.stdout.write(f"[{relpath}] {line}") + if at_line_start: + sys.stdout.write(f"[{relpath}] ") + sys.stdout.write(char) + sys.stdout.flush() + at_line_start = char == "\n" + else: + for line in proc.stdout: + per_log.write(line) + if buffer_output_local: + buffer_lines.append(line) + else: + with output_lock: + sys.stdout.write(f"[{relpath}] {line}") proc.wait() exit_code = proc.returncode - if buffer_output and buffer_lines: + if buffer_output_local and buffer_lines: with output_lock: for line in buffer_lines: sys.stdout.write(f"[{relpath}] {line}") From 4cf694873132dd70882c8591b93108aa3c646d6a Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:34:52 +0900 Subject: [PATCH 12/15] fix --- examples/run_examples.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 5b4285d598..bd0088a0c7 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -173,7 +173,11 @@ def detect_tags(path: Path, source: str) -> set[str]: lower_source = source.lower() lower_parts = [part.lower() for part in path.parts] - if re.search(r"\binput\s*\(", source) or "input_with_fallback(" in lower_source: + if ( + re.search(r"\binput\s*\(", source) + or "input_with_fallback(" in lower_source + or "confirm_with_fallback(" in lower_source + ): tags.add("interactive") if "prompt_toolkit" in lower_source or "questionary" in lower_source: tags.add("interactive") From 1dd7df3f654e36a6e7e03e6440aed1245d1c3f59 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:41:03 +0900 Subject: [PATCH 13/15] fix --- examples/run_examples.py | 15 ++++++++++----- examples/tools/apply_patch.py | 5 +++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index bd0088a0c7..edbf545bbf 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -22,7 +22,7 @@ from collections.abc import Iterable, Sequence from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field -from pathlib import Path +from pathlib import Path, PurePosixPath ROOT_DIR = Path(__file__).resolve().parent.parent EXAMPLES_DIR = ROOT_DIR / "examples" @@ -57,7 +57,7 @@ class ExampleScript: @property def relpath(self) -> str: - return str(self.path.relative_to(ROOT_DIR)) + return normalize_relpath(str(self.path.relative_to(ROOT_DIR))) @property def module(self) -> str: @@ -79,6 +79,11 @@ class ExampleResult: exit_code: int | None = None +def normalize_relpath(relpath: str) -> str: + normalized = relpath.replace("\\", "/") + return str(PurePosixPath(normalized)) + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run example scripts sequentially.") parser.add_argument( @@ -263,8 +268,8 @@ def load_auto_skip() -> set[str]: env_value = os.environ.get("EXAMPLES_AUTO_SKIP", "") if env_value.strip(): parts = re.split(r"[\s,]+", env_value.strip()) - return {p for p in parts if p} - return set(DEFAULT_AUTO_SKIP) + return {normalize_relpath(p) for p in parts if p} + return {normalize_relpath(p) for p in DEFAULT_AUTO_SKIP} def write_main_log_line(handle, line: str) -> None: @@ -300,7 +305,7 @@ def parse_rerun_from_log(log_path: Path) -> list[str]: continue status, relpath = parts[0].upper(), parts[1] if status in {"FAILED", "ERROR", "UNKNOWN"}: - rerun.append(relpath) + rerun.append(normalize_relpath(relpath)) return rerun 
diff --git a/examples/tools/apply_patch.py b/examples/tools/apply_patch.py index d93c4f3fb8..57a49755c6 100644 --- a/examples/tools/apply_patch.py +++ b/examples/tools/apply_patch.py @@ -7,6 +7,7 @@ from agents import Agent, ApplyPatchTool, ModelSettings, Runner, apply_diff, trace from agents.editor import ApplyPatchOperation, ApplyPatchResult +from examples.auto_mode import confirm_with_fallback, is_auto_mode class ApprovalTracker: @@ -89,8 +90,8 @@ def _require_approval(self, operation: ApplyPatchOperation, display_path: str) - if operation.diff: preview = operation.diff if len(operation.diff) < 400 else f"{operation.diff[:400]}…" print("- diff preview:\n", preview) - answer = input("Proceed? [y/N] ").strip().lower() - if answer not in {"y", "yes"}: + approved = confirm_with_fallback("Proceed? [y/N] ", default=is_auto_mode()) + if not approved: raise RuntimeError("Apply patch operation rejected by user.") self._approvals.remember(fingerprint) From 25e552d65d0224a718046e51b542eb2010c82605 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:42:25 +0900 Subject: [PATCH 14/15] fix --- examples/run_examples.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/run_examples.py b/examples/run_examples.py index edbf545bbf..4a15ecc662 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -469,6 +469,14 @@ def run_single(example: ExampleScript) -> ExampleResult: run_list.append(example) + if run_list and (not auto_mode) and any("interactive" in ex.tags for ex in run_list): + if jobs != 1: + print( + "Interactive examples detected; forcing serial execution to avoid shared stdin." + ) + safe_write_main("# jobs_adjusted: 1 reason=interactive") + jobs = 1 + run_results: dict[str, ExampleResult] = {} if run_list: with ThreadPoolExecutor(max_workers=jobs) as executor: From 46df3c3d78aff3f4642fabab1857663e16aac6bf Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:52:12 +0900 Subject: [PATCH 15/15] fix --- examples/run_examples.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 4a15ecc662..a3a8174464 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -469,12 +469,16 @@ def run_single(example: ExampleScript) -> ExampleResult: run_list.append(example) - if run_list and (not auto_mode) and any("interactive" in ex.tags for ex in run_list): + interactive_in_run_list = any("interactive" in ex.tags for ex in run_list) + interactive_requested = "interactive" in overrides + + if run_list and (not auto_mode) and (interactive_in_run_list or interactive_requested): if jobs != 1: print( "Interactive examples detected; forcing serial execution to avoid shared stdin." ) - safe_write_main("# jobs_adjusted: 1 reason=interactive") + reason = "interactive" if interactive_in_run_list else "interactive-requested" + safe_write_main(f"# jobs_adjusted: 1 reason={reason}") jobs = 1 run_results: dict[str, ExampleResult] = {}