From 0d5a6ef7e8f9433a6fb07779baac2d66b293f1b7 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:33:03 +0900 Subject: [PATCH 01/15] feat: add examples auto-run skill and refresh example scripts --- .codex/skills/examples-auto-run/SKILL.md | 66 +++ .../skills/examples-auto-run/scripts/run.sh | 200 +++++++++ .gitignore | 1 + examples/agent_patterns/agents_as_tools.py | 6 +- .../agents_as_tools_conditional.py | 8 +- examples/agent_patterns/deterministic.py | 6 +- examples/agent_patterns/input_guardrails.py | 9 +- examples/agent_patterns/llm_as_a_judge.py | 15 +- examples/agent_patterns/parallelization.py | 6 +- examples/agent_patterns/routing.py | 11 +- examples/auto_mode.py | 37 ++ examples/basic/agent_lifecycle_example.py | 3 +- examples/basic/lifecycle_example.py | 3 +- examples/basic/previous_response_id.py | 3 +- examples/customer_service/main.py | 9 +- examples/financial_research_agent/main.py | 7 +- examples/hosted_mcp/approvals.py | 5 +- examples/mcp/git_example/main.py | 6 +- examples/mcp/prompt_server/README.md | 5 +- examples/mcp/prompt_server/main.py | 26 +- examples/mcp/prompt_server/server.py | 7 +- examples/mcp/sse_example/server.py | 14 +- .../README.md | 3 +- .../main.py | 28 +- .../server.py | 6 +- examples/mcp/streamablehttp_example/README.md | 2 +- examples/mcp/streamablehttp_example/main.py | 28 +- examples/mcp/streamablehttp_example/server.py | 20 +- examples/model_providers/litellm_provider.py | 18 +- examples/reasoning_content/main.py | 2 +- examples/reasoning_content/runner_example.py | 2 +- examples/research_bot/main.py | 7 +- examples/run_examples.py | 402 +++++++++++++++--- examples/tools/apply_patch.py | 2 +- examples/tools/shell.py | 2 +- 35 files changed, 871 insertions(+), 104 deletions(-) create mode 100644 .codex/skills/examples-auto-run/SKILL.md create mode 100755 .codex/skills/examples-auto-run/scripts/run.sh create mode 100644 examples/auto_mode.py diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md new file mode 100644 index 0000000000..4e3abc7545 --- /dev/null +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -0,0 +1,66 @@ +--- +name: examples-auto-run +description: Run python examples in auto mode with logging, rerun helpers, and background control. +--- + +# examples-auto-run + +## What it does + +- Runs `uv run examples/run_examples.py` with: + - `EXAMPLES_INTERACTIVE_MODE=auto` (auto-input/auto-approve). + - Per-example logs under `.tmp/examples-start-logs/`. + - Main summary log path passed via `--main-log` (also under `.tmp/examples-start-logs/`). + - Generates a rerun list of failures at `.tmp/examples-rerun.txt` when `--write-rerun` is set. +- Provides start/stop/status/logs/tail/collect/rerun helpers via `run.sh`. +- Background option keeps the process running with a pidfile; `stop` cleans it up. 
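
For reference, a foreground `start` boils down to roughly this invocation (a sketch; the script also exports the auto-approve defaults listed below and tees stdout to a log under `.tmp/examples-start-logs/`):

```bash
EXAMPLES_INTERACTIVE_MODE=auto uv run examples/run_examples.py \
  --auto-mode --write-rerun \
  --main-log .tmp/examples-start-logs/main_<timestamp>.log \
  --logs-dir .tmp/examples-start-logs
```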
+ +## Usage + +```bash +# Start (auto mode; interactive included by default) +.codex/skills/examples-auto-run/scripts/run.sh start [extra args to run_examples.py] +# Examples: +.codex/skills/examples-auto-run/scripts/run.sh start --filter basic +.codex/skills/examples-auto-run/scripts/run.sh start --include-server --include-audio + +# Check status +.codex/skills/examples-auto-run/scripts/run.sh status + +# Stop running job +.codex/skills/examples-auto-run/scripts/run.sh stop + +# List logs +.codex/skills/examples-auto-run/scripts/run.sh logs + +# Tail latest log (or specify one) +.codex/skills/examples-auto-run/scripts/run.sh tail +.codex/skills/examples-auto-run/scripts/run.sh tail main_20260113-123000.log + +# Collect rerun list from a main log (defaults to latest main_*.log) +.codex/skills/examples-auto-run/scripts/run.sh collect + +# Rerun only failed entries from rerun file (auto mode) +.codex/skills/examples-auto-run/scripts/run.sh rerun +``` + +## Defaults (overridable via env) + +- `EXAMPLES_INTERACTIVE_MODE=auto` +- `EXAMPLES_INCLUDE_INTERACTIVE=1` +- `EXAMPLES_INCLUDE_SERVER=0` +- `EXAMPLES_INCLUDE_AUDIO=0` +- `EXAMPLES_INCLUDE_EXTERNAL=0` +- Auto-approvals in auto mode: `APPLY_PATCH_AUTO_APPROVE=1`, `SHELL_AUTO_APPROVE=1`, `AUTO_APPROVE_MCP=1` + +## Log locations + +- Main logs: `.tmp/examples-start-logs/main_*.log` +- Per-example logs (from `run_examples.py`): `.tmp/examples-start-logs/.log` +- Rerun list: `.tmp/examples-rerun.txt` + +## Notes + +- The runner delegates to `uv run examples/run_examples.py`, which already writes per-example logs and supports `--collect`, `--rerun-file`, and `--print-auto-skip`. +- `start` uses `--write-rerun` so failures are captured automatically. +- If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default. diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh new file mode 100755 index 0000000000..d36270c915 --- /dev/null +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -0,0 +1,200 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)" +PID_FILE="$ROOT/.tmp/examples-auto-run.pid" +LOG_DIR="$ROOT/.tmp/examples-start-logs" +RERUN_FILE="$ROOT/.tmp/examples-rerun.txt" + +ensure_dirs() { + mkdir -p "$LOG_DIR" "$ROOT/.tmp" +} + +is_running() { + local pid="$1" + [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1 +} + +cmd_start() { + ensure_dirs + local background=0 + if [[ "${1:-}" == "--background" ]]; then + background=1 + shift + fi + + local ts log_file + ts="$(date +%Y%m%d-%H%M%S)" + log_file="$LOG_DIR/main_${ts}.log" + + local run_cmd=( + uv run examples/run_examples.py + --auto-mode + --write-rerun + --main-log "$log_file" + --logs-dir "$LOG_DIR" + ) + + if [[ "$background" -eq 1 ]]; then + if [[ -f "$PID_FILE" ]]; then + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if is_running "$pid"; then + echo "examples/run_examples.py already running (pid=$pid)." 
+ exit 1 + fi + fi + ( + trap '' HUP + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" + export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" + export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" + export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" + cd "$ROOT" + "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" >/dev/null + ) & + local pid=$! + echo "$pid" >"$PID_FILE" + echo "Started run_examples.py (pid=$pid)" + echo "Main log: $log_file" + return 0 + fi + + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + export EXAMPLES_INCLUDE_INTERACTIVE="${EXAMPLES_INCLUDE_INTERACTIVE:-1}" + export EXAMPLES_INCLUDE_SERVER="${EXAMPLES_INCLUDE_SERVER:-0}" + export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" + export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" + cd "$ROOT" + "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" +} + +cmd_stop() { + if [[ ! -f "$PID_FILE" ]]; then + echo "No pid file; nothing to stop." + return 0 + fi + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if [[ -z "$pid" ]]; then + rm -f "$PID_FILE" + echo "Pid file empty; cleaned." + return 0 + fi + if ! is_running "$pid"; then + rm -f "$PID_FILE" + echo "Process $pid not running; cleaned pid file." + return 0 + fi + echo "Stopping pid $pid ..." + kill "$pid" 2>/dev/null || true + sleep 1 + if is_running "$pid"; then + echo "Sending SIGKILL to $pid ..." + kill -9 "$pid" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +cmd_status() { + if [[ -f "$PID_FILE" ]]; then + local pid + pid="$(cat "$PID_FILE" 2>/dev/null || true)" + if is_running "$pid"; then + echo "Running (pid=$pid)" + return 0 + fi + fi + echo "Not running." +} + +cmd_logs() { + ensure_dirs + ls -1t "$LOG_DIR" +} + +cmd_tail() { + ensure_dirs + local file="${1:-}" + if [[ -z "$file" ]]; then + file="$(ls -1t "$LOG_DIR" | head -n1)" + fi + if [[ -z "$file" ]]; then + echo "No log files yet." + exit 1 + fi + tail -f "$LOG_DIR/$file" +} + +collect_rerun() { + ensure_dirs + local log_file="${1:-}" + if [[ -z "$log_file" ]]; then + log_file="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" + fi + if [[ -z "$log_file" ]] || [[ ! -f "$log_file" ]]; then + echo "No main log file found." + exit 1 + fi + cd "$ROOT" + uv run examples/run_examples.py --collect "$log_file" --output "$RERUN_FILE" +} + +cmd_rerun() { + ensure_dirs + local file="${1:-$RERUN_FILE}" + if [[ ! -s "$file" ]]; then + echo "Rerun list is empty: $file" + exit 0 + fi + cd "$ROOT" + export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" + export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" + export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" + export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" + uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun +} + +usage() { + cat <<'EOF' +Usage: run.sh [args...] + +Commands: + start [--filter ... | other args] Run examples in auto mode (foreground). Pass --background to run detached. + stop Kill the running auto-run (if any). + status Show whether it is running. 
+ logs List log files (.tmp/examples-start-logs). + tail [logfile] Tail the latest (or specified) log. + collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. + rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. + +Environment overrides: + EXAMPLES_INTERACTIVE_MODE (default auto) + EXAMPLES_INCLUDE_SERVER/INTERACTIVE/AUDIO/EXTERNAL (defaults: 0/1/0/0) + APPLY_PATCH_AUTO_APPROVE, SHELL_AUTO_APPROVE, AUTO_APPROVE_MCP (default 1 in auto mode) +EOF +} + +default_cmd="start" +if [[ $# -eq 0 && -s "$RERUN_FILE" ]]; then + default_cmd="rerun" +fi + +case "${1:-$default_cmd}" in + start) shift || true; cmd_start "$@" ;; + stop) shift || true; cmd_stop ;; + status) shift || true; cmd_status ;; + logs) shift || true; cmd_logs ;; + tail) shift; cmd_tail "${1:-}" ;; + collect) shift || true; collect_rerun "${1:-}" ;; + rerun) shift || true; cmd_rerun "${1:-}" ;; + *) usage; exit 1 ;; +esac diff --git a/.gitignore b/.gitignore index 60782274e9..ac32a2998d 100644 --- a/.gitignore +++ b/.gitignore @@ -45,6 +45,7 @@ htmlcov/ .coverage .coverage.* .cache +.tmp/ nosetests.xml coverage.xml *.cover diff --git a/examples/agent_patterns/agents_as_tools.py b/examples/agent_patterns/agents_as_tools.py index 9fd118efb3..b670e2fe06 100644 --- a/examples/agent_patterns/agents_as_tools.py +++ b/examples/agent_patterns/agents_as_tools.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, ItemHelpers, MessageOutputItem, Runner, trace +from examples.auto_mode import input_with_fallback """ This example shows the agents-as-tools pattern. The frontline agent receives a user message and @@ -56,7 +57,10 @@ async def main(): - msg = input("Hi! What would you like translated, and to which languages? ") + msg = input_with_fallback( + "Hi! What would you like translated, and to which languages? ", + "Translate 'Hello, world!' to French and Spanish.", + ) # Run the entire orchestration in a single trace with trace("Orchestrator evaluator"): diff --git a/examples/agent_patterns/agents_as_tools_conditional.py b/examples/agent_patterns/agents_as_tools_conditional.py index e00f56d5e3..87533721d3 100644 --- a/examples/agent_patterns/agents_as_tools_conditional.py +++ b/examples/agent_patterns/agents_as_tools_conditional.py @@ -3,6 +3,7 @@ from pydantic import BaseModel from agents import Agent, AgentBase, RunContextWrapper, Runner, trace +from examples.auto_mode import input_with_fallback """ This example demonstrates the agents-as-tools pattern with conditional tool enabling. @@ -81,7 +82,7 @@ async def main(): print("2. French and Spanish (2 tools)") print("3. 
European languages (3 tools)") - choice = input("\nSelect option (1-3): ").strip() + choice = input_with_fallback("\nSelect option (1-3): ", "2").strip() preference_map = {"1": "spanish_only", "2": "french_spanish", "3": "european"} language_preference = preference_map.get(choice, "spanish_only") @@ -95,7 +96,10 @@ async def main(): print(f"The LLM will only see and can use these {len(available_tools)} tools\n") # Get user request - user_request = input("Ask a question and see responses in available languages:\n") + user_request = input_with_fallback( + "Ask a question and see responses in available languages:\n", + "How do you say good morning?", + ) # Run with LLM interaction print("\nProcessing request...") diff --git a/examples/agent_patterns/deterministic.py b/examples/agent_patterns/deterministic.py index 0c163afe9e..30bef35e25 100644 --- a/examples/agent_patterns/deterministic.py +++ b/examples/agent_patterns/deterministic.py @@ -3,6 +3,7 @@ from pydantic import BaseModel from agents import Agent, Runner, trace +from examples.auto_mode import input_with_fallback """ This example demonstrates a deterministic flow, where each step is performed by an agent. @@ -39,7 +40,10 @@ class OutlineCheckerOutput(BaseModel): async def main(): - input_prompt = input("What kind of story do you want? ") + input_prompt = input_with_fallback( + "What kind of story do you want? ", + "Write a short sci-fi story.", + ) # Ensure the entire workflow is a single trace with trace("Deterministic story flow"): diff --git a/examples/agent_patterns/input_guardrails.py b/examples/agent_patterns/input_guardrails.py index 18ab9d2a75..7e4210d6af 100644 --- a/examples/agent_patterns/input_guardrails.py +++ b/examples/agent_patterns/input_guardrails.py @@ -13,6 +13,7 @@ TResponseInputItem, input_guardrail, ) +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows how to use guardrails. @@ -68,9 +69,13 @@ async def main(): ) input_data: list[TResponseInputItem] = [] + auto_mode = is_auto_mode() while True: - user_input = input("Enter a message: ") + user_input = input_with_fallback( + "Enter a message: ", + "What's the capital of California?", + ) input_data.append( { "role": "user", @@ -93,6 +98,8 @@ async def main(): "content": message, } ) + if auto_mode: + break # Sample run: # Enter a message: What's the capital of California? diff --git a/examples/agent_patterns/llm_as_a_judge.py b/examples/agent_patterns/llm_as_a_judge.py index 39a55c4630..1ee4915e18 100644 --- a/examples/agent_patterns/llm_as_a_judge.py +++ b/examples/agent_patterns/llm_as_a_judge.py @@ -5,6 +5,7 @@ from typing import Literal from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows the LLM as a judge pattern. The first agent generates an outline for a story. @@ -39,10 +40,16 @@ class EvaluationFeedback: async def main() -> None: - msg = input("What kind of story would you like to hear? ") + msg = input_with_fallback( + "What kind of story would you like to hear? 
", + "A detective story in space.", + ) input_items: list[TResponseInputItem] = [{"content": msg, "role": "user"}] latest_outline: str | None = None + auto_mode = is_auto_mode() + max_rounds = 3 if auto_mode else None + rounds = 0 # We'll run the entire workflow in a single trace with trace("LLM as a judge"): @@ -65,6 +72,12 @@ async def main() -> None: print("Story outline is good enough, exiting.") break + if auto_mode: + rounds += 1 + if max_rounds is not None and rounds >= max_rounds: + print("Auto mode: stopping after limited rounds.") + break + print("Re-running with feedback") input_items.append({"content": f"Feedback: {result.feedback}", "role": "user"}) diff --git a/examples/agent_patterns/parallelization.py b/examples/agent_patterns/parallelization.py index fe2a8ecd0b..60dcfbe07f 100644 --- a/examples/agent_patterns/parallelization.py +++ b/examples/agent_patterns/parallelization.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, ItemHelpers, Runner, trace +from examples.auto_mode import input_with_fallback """ This example shows the parallelization pattern. We run the agent three times in parallel, and pick @@ -19,7 +20,10 @@ async def main(): - msg = input("Hi! Enter a message, and we'll translate it to Spanish.\n\n") + msg = input_with_fallback( + "Hi! Enter a message, and we'll translate it to Spanish.\n\n", + "Good morning!", + ) # Ensure the entire workflow is a single trace with trace("Parallel translation"): diff --git a/examples/agent_patterns/routing.py b/examples/agent_patterns/routing.py index 3dcaefa980..4d0a49ab74 100644 --- a/examples/agent_patterns/routing.py +++ b/examples/agent_patterns/routing.py @@ -4,6 +4,7 @@ from openai.types.responses import ResponseContentPartDoneEvent, ResponseTextDeltaEvent from agents import Agent, RawResponsesStreamEvent, Runner, TResponseInputItem, trace +from examples.auto_mode import input_with_fallback, is_auto_mode """ This example shows the handoffs/routing pattern. The triage agent receives the first message, and @@ -37,9 +38,13 @@ async def main(): # We'll create an ID for this conversation, so we can link each trace conversation_id = str(uuid.uuid4().hex[:16]) - msg = input("Hi! We speak French, Spanish and English. How can I help? ") + msg = input_with_fallback( + "Hi! We speak French, Spanish and English. How can I help? ", + "Hello, how do I say good evening in French?", + ) agent = triage_agent inputs: list[TResponseInputItem] = [{"content": msg, "role": "user"}] + auto_mode = is_auto_mode() while True: # Each conversation turn is a single trace. Normally, each input from the user would be an @@ -61,7 +66,9 @@ async def main(): inputs = result.to_input_list() print("\n") - user_msg = input("Enter a message: ") + if auto_mode: + break + user_msg = input_with_fallback("Enter a message: ", "Thanks!") inputs.append({"content": user_msg, "role": "user"}) agent = result.current_agent diff --git a/examples/auto_mode.py b/examples/auto_mode.py new file mode 100644 index 0000000000..9a7b71fe71 --- /dev/null +++ b/examples/auto_mode.py @@ -0,0 +1,37 @@ +"""Utilities for running examples in automated mode. + +When ``EXAMPLES_INTERACTIVE_MODE=auto`` is set, these helpers provide +deterministic inputs and confirmations so examples can run without manual +interaction. The helpers are intentionally lightweight to avoid adding +dependencies to example code. 
+""" + +from __future__ import annotations + +import os + + +def is_auto_mode() -> bool: + """Return True when examples should bypass interactive prompts.""" + return os.environ.get("EXAMPLES_INTERACTIVE_MODE", "").lower() == "auto" + + +def input_with_fallback(prompt: str, fallback: str) -> str: + """Return the fallback text in auto mode, otherwise defer to input().""" + if is_auto_mode(): + print(f"[auto-input] {prompt.strip()} -> {fallback}") + return fallback + return input(prompt) + + +def confirm_with_fallback(prompt: str, default: bool = True) -> bool: + """Return default in auto mode; otherwise ask the user.""" + if is_auto_mode(): + choice = "yes" if default else "no" + print(f"[auto-confirm] {prompt.strip()} -> {choice}") + return default + + answer = input(prompt).strip().lower() + if not answer: + return default + return answer in {"y", "yes"} diff --git a/examples/basic/agent_lifecycle_example.py b/examples/basic/agent_lifecycle_example.py index 96238fe2ea..d135b8f452 100644 --- a/examples/basic/agent_lifecycle_example.py +++ b/examples/basic/agent_lifecycle_example.py @@ -13,6 +13,7 @@ Tool, function_tool, ) +from examples.auto_mode import input_with_fallback class CustomAgentHooks(AgentHooks): @@ -98,7 +99,7 @@ class FinalResult(BaseModel): async def main() -> None: - user_input = input("Enter a max number: ") + user_input = input_with_fallback("Enter a max number: ", "50") try: max_number = int(user_input) await Runner.run( diff --git a/examples/basic/lifecycle_example.py b/examples/basic/lifecycle_example.py index 76529c56b1..5ecd3a6b75 100644 --- a/examples/basic/lifecycle_example.py +++ b/examples/basic/lifecycle_example.py @@ -17,6 +17,7 @@ ) from agents.items import ModelResponse, TResponseInputItem from agents.tool_context import ToolContext +from examples.auto_mode import input_with_fallback class LoggingHooks(AgentHooks[Any]): @@ -146,7 +147,7 @@ class FinalResult(BaseModel): async def main() -> None: - user_input = input("Enter a max number: ") + user_input = input_with_fallback("Enter a max number: ", "50") try: max_number = int(user_input) await Runner.run( diff --git a/examples/basic/previous_response_id.py b/examples/basic/previous_response_id.py index b00bf3aa64..21c354219d 100644 --- a/examples/basic/previous_response_id.py +++ b/examples/basic/previous_response_id.py @@ -1,6 +1,7 @@ import asyncio from agents import Agent, Runner +from examples.auto_mode import input_with_fallback """This demonstrates usage of the `previous_response_id` parameter to continue a conversation. The second run passes the previous response ID to the model, which allows it to continue the @@ -59,7 +60,7 @@ async def main_stream(): if __name__ == "__main__": - is_stream = input("Run in stream mode? (y/n): ") + is_stream = input_with_fallback("Run in stream mode? 
(y/n): ", "n") if is_stream == "y": asyncio.run(main_stream()) else: diff --git a/examples/customer_service/main.py b/examples/customer_service/main.py index 266a7e6118..65191559c3 100644 --- a/examples/customer_service/main.py +++ b/examples/customer_service/main.py @@ -21,6 +21,7 @@ trace, ) from agents.extensions.handoff_prompt import RECOMMENDED_PROMPT_PREFIX +from examples.auto_mode import input_with_fallback, is_auto_mode ### CONTEXT @@ -143,13 +144,17 @@ async def main(): current_agent: Agent[AirlineAgentContext] = triage_agent input_items: list[TResponseInputItem] = [] context = AirlineAgentContext() + auto_mode = is_auto_mode() # Normally, each input from the user would be an API request to your app, and you can wrap the request in a trace() # Here, we'll just use a random UUID for the conversation ID conversation_id = uuid.uuid4().hex[:16] while True: - user_input = input("Enter your message: ") + user_input = input_with_fallback( + "Enter your message: ", + "What are your store hours?", + ) with trace("Customer service", group_id=conversation_id): input_items.append({"content": user_input, "role": "user"}) result = await Runner.run(current_agent, input_items, context=context) @@ -170,6 +175,8 @@ async def main(): print(f"{agent_name}: Skipping item: {new_item.__class__.__name__}") input_items = result.to_input_list() current_agent = result.last_agent + if auto_mode: + break if __name__ == "__main__": diff --git a/examples/financial_research_agent/main.py b/examples/financial_research_agent/main.py index b5b6cfdfd3..23b6d71d6b 100644 --- a/examples/financial_research_agent/main.py +++ b/examples/financial_research_agent/main.py @@ -1,5 +1,7 @@ import asyncio +from examples.auto_mode import input_with_fallback + from .manager import FinancialResearchManager @@ -8,7 +10,10 @@ # financial research query, for example: # "Write up an analysis of Apple Inc.'s most recent quarter." async def main() -> None: - query = input("Enter a financial research query: ") + query = input_with_fallback( + "Enter a financial research query: ", + "Write up an analysis of Apple Inc.'s most recent quarter.", + ) mgr = FinancialResearchManager() await mgr.run(query) diff --git a/examples/hosted_mcp/approvals.py b/examples/hosted_mcp/approvals.py index c3de0db447..2aa73c1ebc 100644 --- a/examples/hosted_mcp/approvals.py +++ b/examples/hosted_mcp/approvals.py @@ -8,14 +8,15 @@ MCPToolApprovalRequest, Runner, ) +from examples.auto_mode import confirm_with_fallback """This example demonstrates how to use the hosted MCP support in the OpenAI Responses API, with approval callbacks.""" def approval_callback(request: MCPToolApprovalRequest) -> MCPToolApprovalFunctionResult: - answer = input(f"Approve running the tool `{request.data.name}`? (y/n) ") - result: MCPToolApprovalFunctionResult = {"approve": answer == "y"} + approve = confirm_with_fallback(f"Approve running the tool `{request.data.name}`? 
(y/n) ", True) + result: MCPToolApprovalFunctionResult = {"approve": approve} if not result["approve"]: result["reason"] = "User denied" return result diff --git a/examples/mcp/git_example/main.py b/examples/mcp/git_example/main.py index ab229e8550..8a62744d18 100644 --- a/examples/mcp/git_example/main.py +++ b/examples/mcp/git_example/main.py @@ -3,6 +3,7 @@ from agents import Agent, Runner, trace from agents.mcp import MCPServer, MCPServerStdio +from examples.auto_mode import input_with_fallback async def run(mcp_server: MCPServer, directory_path: str): @@ -27,7 +28,10 @@ async def run(mcp_server: MCPServer, directory_path: str): async def main(): # Ask the user for the directory path - directory_path = input("Please enter the path to the git repository: ") + directory_path = input_with_fallback( + "Please enter the path to the git repository: ", + ".", + ) async with MCPServerStdio( cache_tools_list=True, # Cache the tools list, for demonstration diff --git a/examples/mcp/prompt_server/README.md b/examples/mcp/prompt_server/README.md index c1b1c3b376..c1eaa632df 100644 --- a/examples/mcp/prompt_server/README.md +++ b/examples/mcp/prompt_server/README.md @@ -10,7 +10,8 @@ uv run python examples/mcp/prompt_server/main.py ## Details -The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The server runs in a sub-process at `http://localhost:8000/mcp` and provides user-controlled prompts that generate agent instructions. +The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The script auto-selects an open localhost port (or honors `STREAMABLE_HTTP_PORT`) and runs the server at `http://:/mcp`, providing user-controlled prompts that generate agent instructions. +If you need a specific address, set `STREAMABLE_HTTP_PORT` and `STREAMABLE_HTTP_HOST`. The server exposes prompts like `generate_code_review_instructions` that take parameters such as focus area and programming language. The agent calls these prompts to dynamically generate its system instructions based on user-provided parameters. @@ -26,4 +27,4 @@ The example demonstrates two key functions: - Runs the agent against vulnerable sample code (command injection via `os.system`) - The agent analyzes the code and provides security-focused feedback using available tools -This pattern allows users to dynamically configure agent behavior through MCP prompts rather than hardcoded instructions. \ No newline at end of file +This pattern allows users to dynamically configure agent behavior through MCP prompts rather than hardcoded instructions. 
diff --git a/examples/mcp/prompt_server/main.py b/examples/mcp/prompt_server/main.py index 4caa95d888..543ec273c4 100644 --- a/examples/mcp/prompt_server/main.py +++ b/examples/mcp/prompt_server/main.py @@ -1,6 +1,7 @@ import asyncio import os import shutil +import socket import subprocess import time from typing import Any @@ -9,6 +10,22 @@ from agents.mcp import MCPServer, MCPServerStreamableHttp from agents.model_settings import ModelSettings +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") + + +def _choose_port() -> int: + env_port = os.getenv("STREAMABLE_HTTP_PORT") + if env_port: + return int(env_port) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((STREAMABLE_HTTP_HOST, 0)) + return s.getsockname()[1] + + +STREAMABLE_HTTP_PORT = _choose_port() +os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) +STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp" + async def get_instructions_from_prompt(mcp_server: MCPServer, prompt_name: str, **kwargs) -> str: """Get agent instructions by calling MCP prompt endpoint (user-controlled)""" @@ -75,7 +92,7 @@ async def show_available_prompts(mcp_server: MCPServer): async def main(): async with MCPServerStreamableHttp( name="Simple Prompt Server", - params={"url": "http://localhost:8000/mcp"}, + params={"url": STREAMABLE_HTTP_URL}, ) as server: trace_id = gen_trace_id() with trace(workflow_name="Simple Prompt Demo", trace_id=trace_id): @@ -94,8 +111,11 @@ async def main(): this_dir = os.path.dirname(os.path.abspath(__file__)) server_file = os.path.join(this_dir, "server.py") - print("Starting Simple Prompt Server...") - process = subprocess.Popen(["uv", "run", server_file]) + print(f"Starting Simple Prompt Server at {STREAMABLE_HTTP_URL} ...") + env = os.environ.copy() + env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST) + env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) + process = subprocess.Popen(["uv", "run", server_file], env=env) time.sleep(3) print("Server started\n") except Exception as e: diff --git a/examples/mcp/prompt_server/server.py b/examples/mcp/prompt_server/server.py index 01dcbac346..7d6629acd7 100644 --- a/examples/mcp/prompt_server/server.py +++ b/examples/mcp/prompt_server/server.py @@ -1,7 +1,12 @@ +import os + from mcp.server.fastmcp import FastMCP +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") +STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080")) + # Create server -mcp = FastMCP("Prompt Server") +mcp = FastMCP("Prompt Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT) # Instruction-generating prompts (user-controlled) diff --git a/examples/mcp/sse_example/server.py b/examples/mcp/sse_example/server.py index df364aa3af..2e4fe2db86 100644 --- a/examples/mcp/sse_example/server.py +++ b/examples/mcp/sse_example/server.py @@ -23,10 +23,16 @@ def get_secret_word() -> str: @mcp.tool() def get_current_weather(city: str) -> str: print(f"[debug-server] get_current_weather({city})") - - endpoint = "https://wttr.in" - response = requests.get(f"{endpoint}/{city}") - return response.text + # Avoid slow or flaky network calls during automated runs. + try: + endpoint = "https://wttr.in" + response = requests.get(f"{endpoint}/{city}", timeout=2) + if response.ok: + return response.text + except Exception: + pass + # Fallback keeps the tool responsive even when offline. + return f"Weather data unavailable right now; assume clear skies in {city}." 
 
 
 if __name__ == "__main__":
diff --git a/examples/mcp/streamablehttp_custom_client_example/README.md b/examples/mcp/streamablehttp_custom_client_example/README.md
index 1569b3c28c..fc269a0644 100644
--- a/examples/mcp/streamablehttp_custom_client_example/README.md
+++ b/examples/mcp/streamablehttp_custom_client_example/README.md
@@ -38,7 +38,7 @@ def create_custom_http_client() -> httpx.AsyncClient:
 async with MCPServerStreamableHttp(
     name="Custom Client Server",
     params={
-        "url": "http://localhost:8000/mcp",
+        "url": "http://localhost:<port>/mcp",
         "httpx_client_factory": create_custom_http_client,
     },
 ) as server:
@@ -60,3 +60,4 @@ async with MCPServerStreamableHttp(
 - **Performance**: Optimize timeouts and connection settings for your use case
 - **Compatibility**: Work with corporate proxies and network restrictions
 
+This example will auto-pick a free localhost port unless you set `STREAMABLE_HTTP_PORT`; use `STREAMABLE_HTTP_HOST` to change the bind address.
diff --git a/examples/mcp/streamablehttp_custom_client_example/main.py b/examples/mcp/streamablehttp_custom_client_example/main.py
index 41e26ec35d..9c45812009 100644
--- a/examples/mcp/streamablehttp_custom_client_example/main.py
+++ b/examples/mcp/streamablehttp_custom_client_example/main.py
@@ -7,6 +7,7 @@
 import asyncio
 import os
 import shutil
+import socket
 import subprocess
 import time
 from typing import Any
@@ -17,6 +18,22 @@
 import httpx
 
 from agents import Agent, Runner, gen_trace_id, trace
 from agents.mcp import MCPServer, MCPServerStreamableHttp
 from agents.model_settings import ModelSettings
 
+STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1")
+
+
+def _choose_port() -> int:
+    env_port = os.getenv("STREAMABLE_HTTP_PORT")
+    if env_port:
+        return int(env_port)
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind((STREAMABLE_HTTP_HOST, 0))
+        return s.getsockname()[1]
+
+
+STREAMABLE_HTTP_PORT = _choose_port()
+os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT))
+STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp"
+
 
 def create_custom_http_client(
     headers: dict[str, str] | None = None,
@@ -73,7 +90,7 @@
     async with MCPServerStreamableHttp(
         name="Streamable HTTP with Custom Client",
         params={
-            "url": "http://localhost:8000/mcp",
+            "url": STREAMABLE_HTTP_URL,
             "httpx_client_factory": create_custom_http_client,
         },
     ) as server:
@@ -91,16 +108,19 @@
     )
 
     # We'll run the Streamable HTTP server in a subprocess. Usually this would be a remote server, but for this
-    # demo, we'll run it locally at http://localhost:8000/mcp
+    # demo, we'll run it locally at STREAMABLE_HTTP_URL
     process: subprocess.Popen[Any] | None = None
     try:
         this_dir = os.path.dirname(os.path.abspath(__file__))
         server_file = os.path.join(this_dir, "server.py")
 
-        print("Starting Streamable HTTP server at http://localhost:8000/mcp ...")
+        print(f"Starting Streamable HTTP server at {STREAMABLE_HTTP_URL} ...")
 
         # Run `uv run server.py` to start the Streamable HTTP server
-        process = subprocess.Popen(["uv", "run", server_file])
+        env = os.environ.copy()
+        env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST)
+        env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT))
+        process = subprocess.Popen(["uv", "run", server_file], env=env)
 
         # Give it 3 seconds to start
         time.sleep(3)
diff --git a/examples/mcp/streamablehttp_custom_client_example/server.py b/examples/mcp/streamablehttp_custom_client_example/server.py
index a078ee00fa..dd0d468753 100644
--- a/examples/mcp/streamablehttp_custom_client_example/server.py
+++ b/examples/mcp/streamablehttp_custom_client_example/server.py
@@ -1,9 +1,13 @@
+import os
 import random
 
 from mcp.server.fastmcp import FastMCP
 
+STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1")
+STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080"))
+
 # Create server
-mcp = FastMCP("Echo Server")
+mcp = FastMCP("Echo Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT)
diff --git a/examples/mcp/streamablehttp_example/README.md b/examples/mcp/streamablehttp_example/README.md
index a07fe19be3..83cae670b6 100644
--- a/examples/mcp/streamablehttp_example/README.md
+++ b/examples/mcp/streamablehttp_example/README.md
@@ -10,4 +10,4 @@ uv run python examples/mcp/streamablehttp_example/main.py
 
 ## Details
 
-The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The server runs in a sub-process at `https://localhost:8000/mcp`.
+The example uses the `MCPServerStreamableHttp` class from `agents.mcp`. The script picks an open localhost port automatically (or honors `STREAMABLE_HTTP_PORT` if you set it) and starts the server at `http://<host>:<port>/mcp`. Set `STREAMABLE_HTTP_HOST` if you need a different bind address.
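+
+For instance, to run this example on a fixed port (a sketch; the same variables are passed through to the spawned `server.py`):
+
+```bash
+STREAMABLE_HTTP_HOST=127.0.0.1 STREAMABLE_HTTP_PORT=9000 \
+  uv run python examples/mcp/streamablehttp_example/main.py
+```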
diff --git a/examples/mcp/streamablehttp_example/main.py b/examples/mcp/streamablehttp_example/main.py index cc95e798b6..fd1140f98e 100644 --- a/examples/mcp/streamablehttp_example/main.py +++ b/examples/mcp/streamablehttp_example/main.py @@ -1,6 +1,7 @@ import asyncio import os import shutil +import socket import subprocess import time from typing import Any @@ -9,6 +10,22 @@ from agents.mcp import MCPServer, MCPServerStreamableHttp from agents.model_settings import ModelSettings +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") + + +def _choose_port() -> int: + env_port = os.getenv("STREAMABLE_HTTP_PORT") + if env_port: + return int(env_port) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((STREAMABLE_HTTP_HOST, 0)) + return s.getsockname()[1] + + +STREAMABLE_HTTP_PORT = _choose_port() +os.environ.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) +STREAMABLE_HTTP_URL = f"http://{STREAMABLE_HTTP_HOST}:{STREAMABLE_HTTP_PORT}/mcp" + async def run(mcp_server: MCPServer): agent = Agent( @@ -41,7 +58,7 @@ async def main(): async with MCPServerStreamableHttp( name="Streamable HTTP Python Server", params={ - "url": "http://localhost:8000/mcp", + "url": STREAMABLE_HTTP_URL, }, ) as server: trace_id = gen_trace_id() @@ -58,16 +75,19 @@ async def main(): ) # We'll run the Streamable HTTP server in a subprocess. Usually this would be a remote server, but for this - # demo, we'll run it locally at http://localhost:8000/mcp + # demo, we'll run it locally at STREAMABLE_HTTP_URL process: subprocess.Popen[Any] | None = None try: this_dir = os.path.dirname(os.path.abspath(__file__)) server_file = os.path.join(this_dir, "server.py") - print("Starting Streamable HTTP server at http://localhost:8000/mcp ...") + print(f"Starting Streamable HTTP server at {STREAMABLE_HTTP_URL} ...") # Run `uv run server.py` to start the Streamable HTTP server - process = subprocess.Popen(["uv", "run", server_file]) + env = os.environ.copy() + env.setdefault("STREAMABLE_HTTP_HOST", STREAMABLE_HTTP_HOST) + env.setdefault("STREAMABLE_HTTP_PORT", str(STREAMABLE_HTTP_PORT)) + process = subprocess.Popen(["uv", "run", server_file], env=env) # Give it 3 seconds to start time.sleep(3) diff --git a/examples/mcp/streamablehttp_example/server.py b/examples/mcp/streamablehttp_example/server.py index d8f839652a..d73ab895b6 100644 --- a/examples/mcp/streamablehttp_example/server.py +++ b/examples/mcp/streamablehttp_example/server.py @@ -1,10 +1,14 @@ +import os import random import requests from mcp.server.fastmcp import FastMCP +STREAMABLE_HTTP_HOST = os.getenv("STREAMABLE_HTTP_HOST", "127.0.0.1") +STREAMABLE_HTTP_PORT = int(os.getenv("STREAMABLE_HTTP_PORT", "18080")) + # Create server -mcp = FastMCP("Echo Server") +mcp = FastMCP("Echo Server", host=STREAMABLE_HTTP_HOST, port=STREAMABLE_HTTP_PORT) @mcp.tool() @@ -23,10 +27,16 @@ def get_secret_word() -> str: @mcp.tool() def get_current_weather(city: str) -> str: print(f"[debug-server] get_current_weather({city})") - - endpoint = "https://wttr.in" - response = requests.get(f"{endpoint}/{city}") - return response.text + # Avoid slow or flaky network calls during automated runs. + try: + endpoint = "https://wttr.in" + response = requests.get(f"{endpoint}/{city}", timeout=2) + if response.ok: + return response.text + except Exception: + pass + # Fallback keeps the tool responsive even when offline. + return f"Weather data unavailable right now; assume clear skies in {city}." 
if __name__ == "__main__": diff --git a/examples/model_providers/litellm_provider.py b/examples/model_providers/litellm_provider.py index 4a1a696fcb..ea5f09ab32 100644 --- a/examples/model_providers/litellm_provider.py +++ b/examples/model_providers/litellm_provider.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import os from agents import Agent, Runner, function_tool, set_tracing_disabled from agents.extensions.models.litellm_model import LitellmModel @@ -24,6 +25,9 @@ def get_weather(city: str): async def main(model: str, api_key: str): + if api_key == "dummy": + print("Skipping run because no valid LITELLM_API_KEY was provided.") + return agent = Agent( name="Assistant", instructions="You only respond in haikus.", @@ -36,7 +40,7 @@ async def main(model: str, api_key: str): if __name__ == "__main__": - # First try to get model/api key from args + # Prefer non-interactive defaults in auto mode to avoid blocking. import argparse parser = argparse.ArgumentParser() @@ -44,12 +48,12 @@ async def main(model: str, api_key: str): parser.add_argument("--api-key", type=str, required=False) args = parser.parse_args() - model = args.model - if not model: - model = input("Enter a model name for Litellm: ") + model = args.model or os.environ.get("LITELLM_MODEL", "openai/gpt-4o-mini") + api_key = args.api_key or os.environ.get("LITELLM_API_KEY", "dummy") - api_key = args.api_key - if not api_key: - api_key = input("Enter an API key for Litellm: ") + if not args.model: + print(f"Using default model: {model}") + if not args.api_key: + print("Using LITELLM_API_KEY from environment (or dummy placeholder).") asyncio.run(main(model, api_key)) diff --git a/examples/reasoning_content/main.py b/examples/reasoning_content/main.py index 7ccbab01b8..3db5d5cee6 100644 --- a/examples/reasoning_content/main.py +++ b/examples/reasoning_content/main.py @@ -20,7 +20,7 @@ from agents.models.interface import ModelTracing from agents.models.openai_provider import OpenAIProvider -MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5" +MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5.2" async def stream_with_reasoning_content(): diff --git a/examples/reasoning_content/runner_example.py b/examples/reasoning_content/runner_example.py index 3546da3502..e3c3d22506 100644 --- a/examples/reasoning_content/runner_example.py +++ b/examples/reasoning_content/runner_example.py @@ -17,7 +17,7 @@ from agents import Agent, ModelSettings, Runner, trace from agents.items import ReasoningItem -MODEL_NAME = os.getenv("EXAMPLE_MODEL_NAME") or "gpt-5.2" +MODEL_NAME = os.getenv("REASONING_MODEL_NAME") or "gpt-5.2" async def main(): diff --git a/examples/research_bot/main.py b/examples/research_bot/main.py index a0fd43dca8..b70bc8e483 100644 --- a/examples/research_bot/main.py +++ b/examples/research_bot/main.py @@ -1,10 +1,15 @@ import asyncio +from examples.auto_mode import input_with_fallback + from .manager import ResearchManager async def main() -> None: - query = input("What would you like to research? ") + query = input_with_fallback( + "What would you like to research? ", + "Impact of electric vehicles on the grid.", + ) await ResearchManager().run(query) diff --git a/examples/run_examples.py b/examples/run_examples.py index 0d51a028f1..1b52cdec21 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -1,30 +1,26 @@ -"""Run multiple example entry points in this repository. 
- -This script locates Python files under ``examples/`` that contain a -``__main__`` guard and executes them one by one. By default it skips -interactive, server-like, audio-heavy, and external-service examples so -that automated validation does not hang waiting for input or require -hardware. Use flags to opt into those categories when you want to run -them. - -Usage examples: - - uv run examples/run_examples.py --dry-run - uv run examples/run_examples.py --filter basic - uv run examples/run_examples.py --include-interactive --include-server - -By default the script keeps running even if an example fails; use -``--fail-fast`` to stop on the first failure. +"""Run multiple example entry points with optional auto mode and logging. + +Features: +* Discovers ``__main__``-guarded example files under ``examples/``. +* Skips interactive/server/audio/external examples unless explicitly included. +* Auto mode (``EXAMPLES_INTERACTIVE_MODE=auto``) enables deterministic inputs, + auto-approvals, and turns on interactive examples by default. +* Writes per-example logs to ``.tmp/examples-start-logs`` and a main summary log. +* Generates a rerun list of failures at ``.tmp/examples-rerun.txt``. """ from __future__ import annotations import argparse +import datetime +import os import re import shlex import subprocess import sys +import threading from collections.abc import Iterable, Sequence +from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field from pathlib import Path @@ -32,6 +28,27 @@ EXAMPLES_DIR = ROOT_DIR / "examples" MAIN_PATTERN = re.compile(r"__name__\s*==\s*['\"]__main__['\"]") +LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" +RERUN_FILE_DEFAULT = ROOT_DIR / ".tmp" / "examples-rerun.txt" +DEFAULT_MAIN_LOG = LOG_DIR_DEFAULT / f"main_{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}.log" + +# Examples that are noisy, require extra credentials, or hang in auto runs. 
+DEFAULT_AUTO_SKIP = { + "examples/agent_patterns/llm_as_a_judge.py", + "examples/agent_patterns/routing.py", + "examples/customer_service/main.py", + "examples/hosted_mcp/connectors.py", + "examples/mcp/git_example/main.py", + "examples/model_providers/custom_example_agent.py", + "examples/model_providers/custom_example_global.py", + "examples/model_providers/custom_example_provider.py", + "examples/realtime/app/server.py", + "examples/realtime/cli/demo.py", + "examples/realtime/twilio/server.py", + "examples/voice/static/main.py", + "examples/voice/streamed/main.py", +} + @dataclass class ExampleScript: @@ -53,6 +70,15 @@ def command(self) -> list[str]: return ["uv", "run", "python", "-m", self.module] +@dataclass +class ExampleResult: + script: ExampleScript + status: str + reason: str = "" + log_path: Path | None = None + exit_code: int | None = None + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run example scripts sequentially.") parser.add_argument( @@ -95,6 +121,55 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Show detected tags for each example entry.", ) + parser.add_argument( + "--logs-dir", + default=str(LOG_DIR_DEFAULT), + help="Directory for per-example logs and main log.", + ) + parser.add_argument( + "--main-log", + default=str(DEFAULT_MAIN_LOG), + help="Path to write the main summary log.", + ) + parser.add_argument( + "--rerun-file", + help="Only run examples listed in this file (one relative path per line).", + ) + parser.add_argument( + "--write-rerun", + action="store_true", + help="Write failures to .tmp/examples-rerun.txt after the run.", + ) + parser.add_argument( + "--collect", + help="Parse a previous main log to emit a rerun list instead of running examples.", + ) + parser.add_argument( + "--output", + help="Output path for --collect rerun list (defaults to stdout).", + ) + parser.add_argument( + "--print-auto-skip", + action="store_true", + help="Show the current auto-skip list and exit.", + ) + parser.add_argument( + "--auto-mode", + action="store_true", + help="Force EXAMPLES_INTERACTIVE_MODE=auto for this run.", + ) + parser.add_argument( + "--jobs", + "-j", + type=int, + default=int(os.environ.get("EXAMPLES_JOBS", "4")), + help="Number of examples to run in parallel (default: 4). Use 1 to force serial execution.", + ) + parser.add_argument( + "--no-buffer-output", + action="store_true", + help="Stream each example's stdout directly (may interleave). 
By default output is buffered per example to reduce interleaving.", + ) return parser.parse_args() @@ -103,7 +178,7 @@ def detect_tags(path: Path, source: str) -> set[str]: lower_source = source.lower() lower_parts = [part.lower() for part in path.parts] - if re.search(r"\binput\s*\(", source): + if re.search(r"\binput\s*\(", source) or "input_with_fallback(" in lower_source: tags.add("interactive") if "prompt_toolkit" in lower_source or "questionary" in lower_source: tags.add("interactive") @@ -153,9 +228,17 @@ def discover_examples(filters: Iterable[str]) -> list[ExampleScript]: return sorted(examples, key=lambda item: item.relpath) -def should_skip(tags: set[str], allowed_overrides: set[str]) -> tuple[bool, set[str]]: +def should_skip( + tags: set[str], + allowed_overrides: set[str], + auto_skip_set: set[str], + relpath: str, + auto_mode: bool, +) -> tuple[bool, set[str]]: blocked = {"interactive", "server", "audio", "external"} - allowed_overrides active_blockers = tags & blocked + if auto_mode and relpath in auto_skip_set: + active_blockers = active_blockers | {"auto-skip"} return (len(active_blockers) > 0, active_blockers) @@ -163,60 +246,281 @@ def format_command(cmd: Sequence[str]) -> str: return shlex.join(cmd) +def env_flag(name: str) -> bool | None: + raw = os.environ.get(name) + if raw is None: + return None + return raw.strip().lower() in {"1", "true", "yes", "on"} + + +def load_auto_skip() -> set[str]: + env_value = os.environ.get("EXAMPLES_AUTO_SKIP", "") + if env_value.strip(): + parts = re.split(r"[\s,]+", env_value.strip()) + return {p for p in parts if p} + return set(DEFAULT_AUTO_SKIP) + + +def write_main_log_line(handle, line: str) -> None: + handle.write(line + "\n") + handle.flush() + + +def ensure_dirs(path: Path) -> None: + if path.suffix: + path.parent.mkdir(parents=True, exist_ok=True) + else: + path.mkdir(parents=True, exist_ok=True) + + +def parse_rerun_from_log(log_path: Path) -> list[str]: + if not log_path.exists(): + raise FileNotFoundError(log_path) + rerun: list[str] = [] + with log_path.open("r", encoding="utf-8") as handle: + for line in handle: + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + parts = stripped.split() + if len(parts) < 2: + continue + status, relpath = parts[0].upper(), parts[1] + if status in {"FAILED", "ERROR", "UNKNOWN"}: + rerun.append(relpath) + return rerun + + def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> int: overrides: set[str] = set() - if args.include_interactive: + if args.include_interactive or env_flag("EXAMPLES_INCLUDE_INTERACTIVE"): overrides.add("interactive") - if args.include_server: + if args.include_server or env_flag("EXAMPLES_INCLUDE_SERVER"): overrides.add("server") - if args.include_audio: + if args.include_audio or env_flag("EXAMPLES_INCLUDE_AUDIO"): overrides.add("audio") - if args.include_external: + if args.include_external or env_flag("EXAMPLES_INCLUDE_EXTERNAL"): overrides.add("external") + logs_dir = Path(args.logs_dir).resolve() + main_log_path = Path(args.main_log).resolve() + auto_mode = args.auto_mode or os.environ.get("EXAMPLES_INTERACTIVE_MODE", "").lower() == "auto" + auto_skip_set = load_auto_skip() + + if auto_mode and "interactive" not in overrides: + overrides.add("interactive") + + ensure_dirs(logs_dir) + ensure_dirs(main_log_path) + rerun_entries: list[str] = [] + if not examples: print("No example entry points found that match the filters.") return 0 + print(f"Interactive mode: {'auto' if auto_mode else 'prompt'}") 
print(f"Found {len(examples)} example entry points under examples/.") executed = 0 skipped = 0 failed = 0 + results: list[ExampleResult] = [] + + jobs = max(1, args.jobs) + if args.fail_fast and jobs > 1: + # Preserve fail-fast semantics by forcing serial execution. + jobs = 1 + + output_lock = threading.Lock() + main_log_lock = threading.Lock() + buffer_output = not args.no_buffer_output and os.environ.get( + "EXAMPLES_BUFFER_OUTPUT", "1" + ).lower() not in {"0", "false", "no", "off"} + + def safe_write_main(line: str) -> None: + with main_log_lock: + write_main_log_line(main_log, line) + + def run_single(example: ExampleScript) -> ExampleResult: + relpath = example.relpath + log_filename = f"{relpath.replace('/', '__')}.log" + log_path = logs_dir / log_filename + ensure_dirs(log_path) + + env = os.environ.copy() + if auto_mode: + env.setdefault("EXAMPLES_INTERACTIVE_MODE", "auto") + env.setdefault("APPLY_PATCH_AUTO_APPROVE", "1") + env.setdefault("SHELL_AUTO_APPROVE", "1") + env.setdefault("AUTO_APPROVE_MCP", "1") + + proc = subprocess.Popen( + example.command, + cwd=ROOT_DIR, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + env=env, + ) + assert proc.stdout is not None + buffer_lines: list[str] = [] + + with log_path.open("w", encoding="utf-8") as per_log: + for line in proc.stdout: + per_log.write(line) + if buffer_output: + buffer_lines.append(line) + else: + with output_lock: + sys.stdout.write(f"[{relpath}] {line}") + proc.wait() + exit_code = proc.returncode + + if buffer_output and buffer_lines: + with output_lock: + for line in buffer_lines: + sys.stdout.write(f"[{relpath}] {line}") + + if exit_code == 0: + safe_write_main(f"PASSED {relpath} exit=0 log={log_path.relative_to(ROOT_DIR)}") + return ExampleResult( + script=example, + status="passed", + log_path=log_path, + exit_code=exit_code, + ) + + info = f"exit={exit_code}" + with output_lock: + print(f" !! 
{relpath} exited with {exit_code}") + safe_write_main(f"FAILED {relpath} exit={exit_code} log={log_path.relative_to(ROOT_DIR)}") + return ExampleResult( + script=example, + status="failed", + reason=info, + log_path=log_path, + exit_code=exit_code, + ) + + with main_log_path.open("w", encoding="utf-8") as main_log: + safe_write_main(f"# run started {datetime.datetime.now().isoformat()}") + safe_write_main(f"# filters: {args.filter or '-'}") + safe_write_main(f"# include: {sorted(overrides)}") + safe_write_main(f"# auto_mode: {auto_mode}") + safe_write_main(f"# logs_dir: {logs_dir}") + safe_write_main(f"# jobs: {jobs}") + safe_write_main(f"# buffer_output: {buffer_output}") + + run_list: list[ExampleScript] = [] + + for example in examples: + relpath = example.relpath + skip, reasons = should_skip(example.tags, overrides, auto_skip_set, relpath, auto_mode) + tag_label = f" [tags: {', '.join(sorted(example.tags))}]" if args.verbose else "" + + if skip: + reason_label = f" (skipped: {', '.join(sorted(reasons))})" if reasons else "" + print(f"- SKIP {relpath}{tag_label}{reason_label}") + safe_write_main(f"SKIPPED {relpath} reasons={','.join(sorted(reasons))}") + skipped += 1 + results.append( + ExampleResult(script=example, status="skipped", reason=",".join(reasons)) + ) + continue + + print(f"- RUN {relpath}{tag_label}") + print(f" cmd: {format_command(example.command)}") + + if args.dry_run: + safe_write_main(f"DRYRUN {relpath}") + results.append(ExampleResult(script=example, status="dry-run")) + continue + + run_list.append(example) + + run_results: dict[str, ExampleResult] = {} + if run_list: + with ThreadPoolExecutor(max_workers=jobs) as executor: + future_map = {executor.submit(run_single, ex): ex for ex in run_list} + for future in as_completed(future_map): + result = future.result() + run_results[result.script.relpath] = result + + for ex in run_list: + result = run_results[ex.relpath] + results.append(result) + if result.status == "passed": + executed += 1 + elif result.status == "failed": + failed += 1 + rerun_entries.append(ex.relpath) + if args.fail_fast: + safe_write_main("# fail-fast stop") + break + + safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") + + if args.write_rerun and rerun_entries: + ensure_dirs(RERUN_FILE_DEFAULT) + RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") + print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") + + print(f"Main log: {main_log_path}") + print(f"Done. Ran {executed} example(s), skipped {skipped}, failed {failed}.") - for example in examples: - skip, reasons = should_skip(example.tags, overrides) - tag_label = f" [tags: {', '.join(sorted(example.tags))}]" if args.verbose else "" - - if skip: - reason_label = f" (skipped: {', '.join(sorted(reasons))})" if reasons else "" - print(f"- SKIP {example.relpath}{tag_label}{reason_label}") - skipped += 1 - continue - - print(f"- RUN {example.relpath}{tag_label}") - print(f" cmd: {format_command(example.command)}") - - if args.dry_run: - continue - - result = subprocess.run(example.command, cwd=ROOT_DIR) - if result.returncode != 0: - print(f" !! 
{example.relpath} exited with {result.returncode}") - failed += 1 - if args.fail_fast: - return result.returncode - continue - - executed += 1 + # Summary table + status_w = 9 + name_w = 44 + info_w = 32 + print("\nResults:") + print(f"{'status'.ljust(status_w)} {'example'.ljust(name_w)} {'info'.ljust(info_w)} log") + print(f"{'-' * status_w} {'-' * name_w} {'-' * info_w} ---") + for result in results: + info = result.reason or ("exit 0" if result.status == "passed" else "") + log_disp = ( + str(result.log_path.relative_to(ROOT_DIR)) + if result.log_path and result.log_path.exists() + else "-" + ) + print( + f"{result.status.ljust(status_w)} {result.script.relpath.ljust(name_w)} {info.ljust(info_w)} {log_disp}" + ) - print(f"Done. Ran {executed} example(s), skipped {skipped}, failed {failed}.") return 0 if failed == 0 else 1 def main() -> int: args = parse_args() + if args.print_auto_skip: + for entry in sorted(load_auto_skip()): + print(entry) + return 0 + + if args.collect: + paths = parse_rerun_from_log(Path(args.collect)) + if args.output: + out = Path(args.output) + ensure_dirs(out) + out.write_text("\n".join(paths) + "\n", encoding="utf-8") + print(f"Wrote {len(paths)} entries to {out}") + else: + for p in paths: + print(p) + return 0 + examples = discover_examples(args.filter) + if args.rerun_file: + rerun_set = { + line.strip() + for line in Path(args.rerun_file).read_text(encoding="utf-8").splitlines() + if line.strip() + } + examples = [ex for ex in examples if ex.relpath in rerun_set] + if not examples: + print("Rerun list is empty; nothing to do.") + return 0 + print(f"Rerun mode: {len(examples)} example(s) from {args.rerun_file}") + return run_examples(examples, args) diff --git a/examples/tools/apply_patch.py b/examples/tools/apply_patch.py index 19d0cfb7dc..d93c4f3fb8 100644 --- a/examples/tools/apply_patch.py +++ b/examples/tools/apply_patch.py @@ -162,7 +162,7 @@ async def main(auto_approve: bool, model: str) -> None: ) parser.add_argument( "--model", - default="gpt-5.1", + default="gpt-5.2", help="Model ID to use for the agent.", ) args = parser.parse_args() diff --git a/examples/tools/shell.py b/examples/tools/shell.py index 7dcb133095..37e815178a 100644 --- a/examples/tools/shell.py +++ b/examples/tools/shell.py @@ -108,7 +108,7 @@ async def main(prompt: str, model: str) -> None: ) parser.add_argument( "--model", - default="gpt-5.1", + default="gpt-5.2", ) args = parser.parse_args() asyncio.run(main(args.prompt, args.model)) From 368fdbd1b9a287957ba1c64e09f71124cc777a64 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:39:02 +0900 Subject: [PATCH 02/15] fix --- examples/mcp/prompt_server/main.py | 5 +++-- examples/mcp/streamablehttp_custom_client_example/main.py | 5 +++-- examples/mcp/streamablehttp_example/main.py | 5 +++-- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/mcp/prompt_server/main.py b/examples/mcp/prompt_server/main.py index 543ec273c4..3cd045e63b 100644 --- a/examples/mcp/prompt_server/main.py +++ b/examples/mcp/prompt_server/main.py @@ -4,7 +4,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast from agents import Agent, Runner, gen_trace_id, trace from agents.mcp import MCPServer, MCPServerStreamableHttp @@ -19,7 +19,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] 
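+
+# NOTE: _choose_port binds to port 0 so the OS picks a free ephemeral port;
+# getsockname() then reports which port was actually assigned.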
STREAMABLE_HTTP_PORT = _choose_port() diff --git a/examples/mcp/streamablehttp_custom_client_example/main.py b/examples/mcp/streamablehttp_custom_client_example/main.py index 9c45812009..20cbef1cdc 100644 --- a/examples/mcp/streamablehttp_custom_client_example/main.py +++ b/examples/mcp/streamablehttp_custom_client_example/main.py @@ -10,7 +10,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast import httpx @@ -27,7 +27,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] STREAMABLE_HTTP_PORT = _choose_port() diff --git a/examples/mcp/streamablehttp_example/main.py b/examples/mcp/streamablehttp_example/main.py index fd1140f98e..564a7bf98f 100644 --- a/examples/mcp/streamablehttp_example/main.py +++ b/examples/mcp/streamablehttp_example/main.py @@ -4,7 +4,7 @@ import socket import subprocess import time -from typing import Any +from typing import Any, cast from agents import Agent, Runner, gen_trace_id, trace from agents.mcp import MCPServer, MCPServerStreamableHttp @@ -19,7 +19,8 @@ def _choose_port() -> int: return int(env_port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind((STREAMABLE_HTTP_HOST, 0)) - return s.getsockname()[1] + address = cast(tuple[str, int], s.getsockname()) + return address[1] STREAMABLE_HTTP_PORT = _choose_port() From 18f24a28cd3174338499bd6f7841b120cb81d5fb Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:41:27 +0900 Subject: [PATCH 03/15] fix --- .codex/skills/examples-auto-run/scripts/run.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index d36270c915..28d502d60c 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -23,15 +23,16 @@ cmd_start() { shift fi - local ts log_file + local ts main_log stdout_log ts="$(date +%Y%m%d-%H%M%S)" - log_file="$LOG_DIR/main_${ts}.log" + main_log="$LOG_DIR/main_${ts}.log" + stdout_log="$LOG_DIR/stdout_${ts}.log" local run_cmd=( uv run examples/run_examples.py --auto-mode --write-rerun - --main-log "$log_file" + --main-log "$main_log" --logs-dir "$LOG_DIR" ) @@ -55,12 +56,13 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" - "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file" >/dev/null + "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" >/dev/null ) & local pid=$! 
     echo "$pid" >"$PID_FILE"
     echo "Started run_examples.py (pid=$pid)"
-    echo "Main log: $log_file"
+    echo "Main log: $main_log"
+    echo "Stdout log: $stdout_log"
     return 0
   fi
@@ -73,7 +75,7 @@ cmd_start() {
   export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}"
   export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}"
   cd "$ROOT"
-  "${run_cmd[@]}" "$@" 2>&1 | tee "$log_file"
+  "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log"
 }
 
 cmd_stop() {

From 7a4fa62677ac6fb8ddcee09364b91a051b828b51 Mon Sep 17 00:00:00 2001
From: Kazuhiro Sera
Date: Tue, 13 Jan 2026 21:52:19 +0900
Subject: [PATCH 04/15] fix

---
 .codex/skills/examples-auto-run/SKILL.md      |  13 +
 .../skills/examples-auto-run/scripts/run.sh   |  48 ++-
 examples/behavioral_validation.py             | 285 ++++++++++++++++++
 examples/run_examples.py                      |  12 -
 4 files changed, 345 insertions(+), 13 deletions(-)
 create mode 100644 examples/behavioral_validation.py

diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md
index 4e3abc7545..9d554b426c 100644
--- a/.codex/skills/examples-auto-run/SKILL.md
+++ b/.codex/skills/examples-auto-run/SKILL.md
@@ -58,9 +58,22 @@ description: Run python examples in auto mode with logging, rerun helpers, and b
 - Main logs: `.tmp/examples-start-logs/main_*.log`
 - Per-example logs (from `run_examples.py`): `.tmp/examples-start-logs/<relpath>.log`
 - Rerun list: `.tmp/examples-rerun.txt`
+- Stdout logs: `.tmp/examples-start-logs/stdout_*.log`
 
 ## Notes
 
 - The runner delegates to `uv run examples/run_examples.py`, which already writes per-example logs and supports `--collect`, `--rerun-file`, and `--print-auto-skip`.
 - `start` uses `--write-rerun` so failures are captured automatically.
 - If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default.
+
+## Behavioral validation
+
+- After every foreground `start` or `rerun`, the script automatically runs `uv run examples/behavioral_validation.py` against the generated main log.
+- The validator:
+  1. Reads the example source to derive expected messages (print strings and prompt/message assignments).
+  2. Reads each passed example’s log and checks that those messages appeared.
+  3. Reports per-example status with the full matching log lines; missing expectations are flagged.
+- Background runs do not validate automatically; after they finish, run:
+  ```bash
+  .codex/skills/examples-auto-run/scripts/run.sh validate
+  ```
diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh
index 28d502d60c..4963c42ac5 100755
--- a/.codex/skills/examples-auto-run/scripts/run.sh
+++ b/.codex/skills/examples-auto-run/scripts/run.sh
@@ -15,6 +15,19 @@ is_running() {
   [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1
 }
 
+run_validation() {
+  local main_log="$1"
+  if [[ -z "$main_log" ]]; then
+    echo "Validation skipped: main log path is empty."
+    return 0
+  fi
+  if [[ ! -f "$main_log" ]]; then
+    echo "Validation skipped: main log not found: $main_log"
+    return 0
+  fi
+  uv run examples/behavioral_validation.py --main-log "$main_log" --logs-dir "$LOG_DIR" || true
+}
+
 cmd_start() {
   ensure_dirs
   local background=0
@@ -63,6 +76,7 @@ cmd_start() {
     echo "Started run_examples.py (pid=$pid)"
     echo "Main log: $main_log"
     echo "Stdout log: $stdout_log"
+    echo "Run '.codex/skills/examples-auto-run/scripts/run.sh validate \"$main_log\"' after it finishes."
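+    # Auto-validation is skipped for background runs; the validate
+    # subcommand below replays run_validation once the job finishes.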
return 0 fi @@ -75,7 +89,12 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" + set +e "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" + local run_status=${PIPESTATUS[0]} + set -e + run_validation "$main_log" + return "$run_status" } cmd_stop() { @@ -157,12 +176,37 @@ cmd_rerun() { echo "Rerun list is empty: $file" exit 0 fi + local ts main_log stdout_log + ts="$(date +%Y%m%d-%H%M%S)" + main_log="$LOG_DIR/main_${ts}.log" + stdout_log="$LOG_DIR/stdout_${ts}.log" cd "$ROOT" export EXAMPLES_INTERACTIVE_MODE="${EXAMPLES_INTERACTIVE_MODE:-auto}" export APPLY_PATCH_AUTO_APPROVE="${APPLY_PATCH_AUTO_APPROVE:-1}" export SHELL_AUTO_APPROVE="${SHELL_AUTO_APPROVE:-1}" export AUTO_APPROVE_MCP="${AUTO_APPROVE_MCP:-1}" - uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun + set +e + uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun --main-log "$main_log" --logs-dir "$LOG_DIR" 2>&1 | tee "$stdout_log" + local run_status=${PIPESTATUS[0]} + set -e + run_validation "$main_log" + return "$run_status" +} + +cmd_validate() { + ensure_dirs + local main_log="${1:-}" + if [[ -z "$main_log" ]]; then + main_log="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" + fi + if [[ -z "$main_log" ]]; then + echo "No main log found." + exit 1 + fi + if [[ "$main_log" != /* && -f "$LOG_DIR/$main_log" ]]; then + main_log="$LOG_DIR/$main_log" + fi + run_validation "$main_log" } usage() { @@ -177,6 +221,7 @@ Commands: tail [logfile] Tail the latest (or specified) log. collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. + validate [main_log] Run behavioral validation against the latest (or given) main log. Environment overrides: EXAMPLES_INTERACTIVE_MODE (default auto) @@ -198,5 +243,6 @@ case "${1:-$default_cmd}" in tail) shift; cmd_tail "${1:-}" ;; collect) shift || true; collect_rerun "${1:-}" ;; rerun) shift || true; cmd_rerun "${1:-}" ;; + validate) shift || true; cmd_validate "${1:-}" ;; *) usage; exit 1 ;; esac diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py new file mode 100644 index 0000000000..db94150cdb --- /dev/null +++ b/examples/behavioral_validation.py @@ -0,0 +1,285 @@ +"""Lightweight behavioral validation for example runs. + +Reads a main log emitted by `examples/run_examples.py`, inspects the source +files for each passed example to derive expected messages, and checks that the +per-example logs contain those messages. The goal is to provide quick evidence +that the observed behavior matches the intended flow without re-running code. 
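+
+Typical invocation (the log path is illustrative):
+
+    uv run examples/behavioral_validation.py --main-log \
+        .tmp/examples-start-logs/main_20260113-123000.log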
+""" + +from __future__ import annotations + +import argparse +import ast +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Iterable, Sequence + +ROOT_DIR = Path(__file__).resolve().parent.parent +LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" + +ENTRY_RE = re.compile(r"^(PASSED|FAILED|SKIPPED|DRYRUN)\s+(\S+)(?:.*log=([^\s]+))?") + + +@dataclass +class MainEntry: + status: str + relpath: str + log_path: Path | None + + +@dataclass +class ValidationHit: + expectation: str + lines: list[str] + + +@dataclass +class ValidationResult: + relpath: str + log_path: Path | None + status: str # ok, warn, fail + hits: list[ValidationHit] + missing: list[str] + notes: list[str] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Validate example behavior from logs.") + parser.add_argument( + "--main-log", + help="Path to the main log (defaults to latest main_*.log in logs dir).", + ) + parser.add_argument( + "--logs-dir", + default=str(LOG_DIR_DEFAULT), + help="Directory containing main and per-example logs.", + ) + parser.add_argument( + "--limit", + type=int, + default=5, + help="Maximum expectations to check per example (to keep output readable).", + ) + return parser.parse_args() + + +def find_latest_main_log(log_dir: Path) -> Path | None: + candidates = sorted(log_dir.glob("main_*.log"), key=lambda p: p.stat().st_mtime, reverse=True) + return candidates[0] if candidates else None + + +def parse_main_log(path: Path) -> list[MainEntry]: + entries: list[MainEntry] = [] + for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + line = raw_line.strip() + match = ENTRY_RE.match(line) + if not match: + continue + status, relpath, log_rel = match.groups() + log_path = ROOT_DIR / log_rel if log_rel else None + entries.append(MainEntry(status=status, relpath=relpath, log_path=log_path)) + return entries + + +def clean_text(value: str) -> str: + return " ".join(value.split()) + + +def _extract_from_print_calls(tree: ast.AST) -> list[str]: + texts: list[str] = [] + for node in ast.walk(tree): + if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": + for arg in node.args: + if isinstance(arg, ast.Constant) and isinstance(arg.value, str): + texts.append(arg.value) + return texts + + +def _extract_from_assignments(tree: ast.AST) -> list[str]: + texts: list[str] = [] + target_keywords = {"message", "prompt", "question", "instruction", "text"} + for node in ast.walk(tree): + if not isinstance(node, ast.Assign): + continue + if not isinstance(node.value, ast.Constant) or not isinstance(node.value.value, str): + continue + for target in node.targets: + if isinstance(target, ast.Name) and any( + key in target.id.lower() for key in target_keywords + ): + texts.append(node.value.value) + return texts + + +def derive_expectations(source: str, limit: int) -> list[str]: + try: + tree = ast.parse(source) + except SyntaxError: + return [] + + texts: list[str] = [] + texts.extend(_extract_from_print_calls(tree)) + texts.extend(_extract_from_assignments(tree)) + + cleaned: list[str] = [] + for text in texts: + normalized = clean_text(text) + if 8 <= len(normalized) <= 200: + cleaned.append(normalized) + + # Preserve order while removing duplicates. 
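+    # list(dict.fromkeys(cleaned)) would produce the same ordered,
+    # de-duplicated result; the explicit loop just keeps the intent obvious.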
+ seen = set() + ordered = [] + for item in cleaned: + if item not in seen: + seen.add(item) + ordered.append(item) + + return ordered[:limit] + + +def find_lines_with_snippet(lines: Sequence[str], snippet: str) -> list[str]: + hits: list[str] = [] + for line in lines: + if snippet in line: + hits.append(line.rstrip("\n")) + return hits + + +def validate_example(entry: MainEntry, limit: int) -> ValidationResult: + log_path = entry.log_path + notes: list[str] = [] + if log_path is None or not log_path.exists(): + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=["Log file not found."], + ) + + source_path = ROOT_DIR / entry.relpath + if not source_path.exists(): + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=["Source file not found."], + ) + + try: + source_text = source_path.read_text(encoding="utf-8") + except OSError as exc: + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=[f"Could not read source: {exc}"], + ) + + expectations = derive_expectations(source_text, limit=limit) + + try: + log_lines = log_path.read_text(encoding="utf-8", errors="replace").splitlines() + except OSError as exc: + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="fail", + hits=[], + missing=[], + notes=[f"Could not read log: {exc}"], + ) + + if not expectations: + notes.append("No expectations derived from source (skip validation heuristics).") + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status="warn", + hits=[], + missing=[], + notes=notes, + ) + + hits: list[ValidationHit] = [] + missing: list[str] = [] + + for expectation in expectations: + lines = find_lines_with_snippet(log_lines, expectation) + if lines: + hits.append(ValidationHit(expectation=expectation, lines=lines)) + else: + missing.append(expectation) + + if hits: + status = "ok" if not missing else "warn" + else: + status = "warn" + notes.append("No expected messages observed in log.") + + return ValidationResult( + relpath=entry.relpath, + log_path=log_path, + status=status, + hits=hits, + missing=missing, + notes=notes, + ) + + +def format_result(result: ValidationResult) -> list[str]: + lines: list[str] = [] + header = f"{result.status.upper():<4} {result.relpath}" + lines.append(header) + if result.log_path: + lines.append(f" log: {result.log_path}") + for hit in result.hits: + for line in hit.lines: + lines.append(f" hit: {line}") + for miss in result.missing: + lines.append(f" missing: {miss}") + for note in result.notes: + lines.append(f" note: {note}") + return lines + + +def main() -> int: + args = parse_args() + log_dir = Path(args.logs_dir) + main_log = Path(args.main_log) if args.main_log else find_latest_main_log(log_dir) + + if main_log is None: + print(f"No main log found under {log_dir}") + return 1 + if not main_log.exists(): + print(f"Main log does not exist: {main_log}") + return 1 + + entries = parse_main_log(main_log) + passed = [e for e in entries if e.status == "PASSED"] + + print(f"Behavioral validation for {main_log} ({len(passed)} passed entries)") + + if not passed: + print("No passed entries to validate.") + return 0 + + results = [validate_example(entry, limit=args.limit) for entry in passed] + + for result in results: + for line in format_result(result): + print(line) + + failures = sum(1 for r in results if r.status == "fail") + return 1 
if failures else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/run_examples.py b/examples/run_examples.py index 1b52cdec21..a3a3498185 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -111,11 +111,6 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Include examples that rely on extra services like Redis, Dapr, Twilio, or Playwright.", ) - parser.add_argument( - "--fail-fast", - action="store_true", - help="Stop after the first failing example.", - ) parser.add_argument( "--verbose", action="store_true", @@ -327,9 +322,6 @@ def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> results: list[ExampleResult] = [] jobs = max(1, args.jobs) - if args.fail_fast and jobs > 1: - # Preserve fail-fast semantics by forcing serial execution. - jobs = 1 output_lock = threading.Lock() main_log_lock = threading.Lock() @@ -454,10 +446,6 @@ def run_single(example: ExampleScript) -> ExampleResult: elif result.status == "failed": failed += 1 rerun_entries.append(ex.relpath) - if args.fail_fast: - safe_write_main("# fail-fast stop") - break - safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") if args.write_rerun and rerun_entries: From ef1af0fa7909b8ca7ce4cde393633ed579c5aa5a Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 21:53:51 +0900 Subject: [PATCH 05/15] fix --- .codex/skills/examples-auto-run/scripts/run.sh | 2 +- examples/behavioral_validation.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index 4963c42ac5..c861b55424 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -69,7 +69,7 @@ cmd_start() { export EXAMPLES_INCLUDE_AUDIO="${EXAMPLES_INCLUDE_AUDIO:-0}" export EXAMPLES_INCLUDE_EXTERNAL="${EXAMPLES_INCLUDE_EXTERNAL:-0}" cd "$ROOT" - "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" >/dev/null + exec "${run_cmd[@]}" "$@" > >(tee "$stdout_log") 2>&1 ) & local pid=$! 
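+    # exec replaces the subshell, so the pid recorded below refers to the
+    # runner itself while tee still mirrors its output into the stdout log.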
echo "$pid" >"$PID_FILE" diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py index db94150cdb..154f87d9aa 100644 --- a/examples/behavioral_validation.py +++ b/examples/behavioral_validation.py @@ -11,9 +11,9 @@ import argparse import ast import re +from collections.abc import Sequence from dataclasses import dataclass from pathlib import Path -from typing import Iterable, Sequence ROOT_DIR = Path(__file__).resolve().parent.parent LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" @@ -89,7 +89,11 @@ def clean_text(value: str) -> str: def _extract_from_print_calls(tree: ast.AST) -> list[str]: texts: list[str] = [] for node in ast.walk(tree): - if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id == "print": + if ( + isinstance(node, ast.Call) + and isinstance(node.func, ast.Name) + and node.func.id == "print" + ): for arg in node.args: if isinstance(arg, ast.Constant) and isinstance(arg.value, str): texts.append(arg.value) From ba42259da9775c805a8c889ff5d36b54d470fd13 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:00:26 +0900 Subject: [PATCH 06/15] fix --- examples/run_examples.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index a3a3498185..fcc13f8e65 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -241,6 +241,13 @@ def format_command(cmd: Sequence[str]) -> str: return shlex.join(cmd) +def display_path(path: Path) -> str: + try: + return str(path.relative_to(ROOT_DIR)) + except ValueError: + return str(path) + + def env_flag(name: str) -> bool | None: raw = os.environ.get(name) if raw is None: @@ -374,7 +381,7 @@ def run_single(example: ExampleScript) -> ExampleResult: sys.stdout.write(f"[{relpath}] {line}") if exit_code == 0: - safe_write_main(f"PASSED {relpath} exit=0 log={log_path.relative_to(ROOT_DIR)}") + safe_write_main(f"PASSED {relpath} exit=0 log={display_path(log_path)}") return ExampleResult( script=example, status="passed", @@ -385,7 +392,7 @@ def run_single(example: ExampleScript) -> ExampleResult: info = f"exit={exit_code}" with output_lock: print(f" !! 
{relpath} exited with {exit_code}") - safe_write_main(f"FAILED {relpath} exit={exit_code} log={log_path.relative_to(ROOT_DIR)}") + safe_write_main(f"FAILED {relpath} exit={exit_code} log={display_path(log_path)}") return ExampleResult( script=example, status="failed", @@ -466,9 +473,7 @@ def run_single(example: ExampleScript) -> ExampleResult: for result in results: info = result.reason or ("exit 0" if result.status == "passed" else "") log_disp = ( - str(result.log_path.relative_to(ROOT_DIR)) - if result.log_path and result.log_path.exists() - else "-" + display_path(result.log_path) if result.log_path and result.log_path.exists() else "-" ) print( f"{result.status.ljust(status_w)} {result.script.relpath.ljust(name_w)} {info.ljust(info_w)} {log_disp}" From 285f391ac8e4baed9aa5537239c0c881be8131b9 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:09:39 +0900 Subject: [PATCH 07/15] fix --- .codex/skills/examples-auto-run/SKILL.md | 25 +- .../skills/examples-auto-run/scripts/run.sh | 33 -- examples/behavioral_validation.py | 289 ------------------ 3 files changed, 14 insertions(+), 333 deletions(-) delete mode 100644 examples/behavioral_validation.py diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md index 9d554b426c..9fdab22469 100644 --- a/.codex/skills/examples-auto-run/SKILL.md +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -66,14 +66,17 @@ description: Run python examples in auto mode with logging, rerun helpers, and b - `start` uses `--write-rerun` so failures are captured automatically. - If `.tmp/examples-rerun.txt` exists and is non-empty, invoking the skill with no args runs `rerun` by default. -## Behavioral validation - -- After every foreground `start` or `rerun`, the script automatically runs `uv run examples/behavioral_validation.py` against the generated main log. -- The validator: - 1. Reads the example source to derive expected messages (print strings and prompt/message assignments). - 2. Reads each passed example’s log and checks those messages appeared. - 3. Reports per-example status with the full matching log lines; missing expectations are flagged. -- Background runs do not validate automatically; after they finish, run: - ```bash - .codex/skills/examples-auto-run/scripts/run.sh validate - ``` +## Behavioral validation (Codex/LLM responsibility) + +The runner no longer auto-runs `examples/behavioral_validation.py`. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: + +1. Read the example source (and comments) to infer intended flow, tools used, and expected key outputs. +2. Open the matching per-example log under `.tmp/examples-start-logs/`. +3. Confirm the intended actions/results occurred; flag omissions or divergences. +4. Do this for **all passed examples**, not just a sample. +5. Report immediately after the run with concise citations to the exact log lines that justify the validation. 
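+
+For example, a quick spot check for one passed entry might look like this
+(the log name and the expected string are illustrative):
+
+```bash
+# Hypothetical check: confirm the routing example's expected output reached its log.
+grep -n "Handing off" .tmp/examples-start-logs/examples__agent_patterns__routing.py.log
+```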
+ +If you still want the heuristic validator, you can run it manually: +```bash +.codex/skills/examples-auto-run/scripts/run.sh validate +``` diff --git a/.codex/skills/examples-auto-run/scripts/run.sh b/.codex/skills/examples-auto-run/scripts/run.sh index c861b55424..74258f8cac 100755 --- a/.codex/skills/examples-auto-run/scripts/run.sh +++ b/.codex/skills/examples-auto-run/scripts/run.sh @@ -15,19 +15,6 @@ is_running() { [[ -n "$pid" ]] && ps -p "$pid" >/dev/null 2>&1 } -run_validation() { - local main_log="$1" - if [[ -z "$main_log" ]]; then - echo "Validation skipped: main log path is empty." - return 0 - fi - if [[ ! -f "$main_log" ]]; then - echo "Validation skipped: main log not found: $main_log" - return 0 - fi - uv run examples/behavioral_validation.py --main-log "$main_log" --logs-dir "$LOG_DIR" || true -} - cmd_start() { ensure_dirs local background=0 @@ -93,7 +80,6 @@ cmd_start() { "${run_cmd[@]}" "$@" 2>&1 | tee "$stdout_log" local run_status=${PIPESTATUS[0]} set -e - run_validation "$main_log" return "$run_status" } @@ -189,26 +175,9 @@ cmd_rerun() { uv run examples/run_examples.py --auto-mode --rerun-file "$file" --write-rerun --main-log "$main_log" --logs-dir "$LOG_DIR" 2>&1 | tee "$stdout_log" local run_status=${PIPESTATUS[0]} set -e - run_validation "$main_log" return "$run_status" } -cmd_validate() { - ensure_dirs - local main_log="${1:-}" - if [[ -z "$main_log" ]]; then - main_log="$(ls -1t "$LOG_DIR"/main_*.log 2>/dev/null | head -n1)" - fi - if [[ -z "$main_log" ]]; then - echo "No main log found." - exit 1 - fi - if [[ "$main_log" != /* && -f "$LOG_DIR/$main_log" ]]; then - main_log="$LOG_DIR/$main_log" - fi - run_validation "$main_log" -} - usage() { cat <<'EOF' Usage: run.sh [args...] @@ -221,7 +190,6 @@ Commands: tail [logfile] Tail the latest (or specified) log. collect [main_log] Parse a main log and write failed examples to .tmp/examples-rerun.txt. rerun [rerun_file] Run only the examples listed in .tmp/examples-rerun.txt. - validate [main_log] Run behavioral validation against the latest (or given) main log. Environment overrides: EXAMPLES_INTERACTIVE_MODE (default auto) @@ -243,6 +211,5 @@ case "${1:-$default_cmd}" in tail) shift; cmd_tail "${1:-}" ;; collect) shift || true; collect_rerun "${1:-}" ;; rerun) shift || true; cmd_rerun "${1:-}" ;; - validate) shift || true; cmd_validate "${1:-}" ;; *) usage; exit 1 ;; esac diff --git a/examples/behavioral_validation.py b/examples/behavioral_validation.py deleted file mode 100644 index 154f87d9aa..0000000000 --- a/examples/behavioral_validation.py +++ /dev/null @@ -1,289 +0,0 @@ -"""Lightweight behavioral validation for example runs. - -Reads a main log emitted by `examples/run_examples.py`, inspects the source -files for each passed example to derive expected messages, and checks that the -per-example logs contain those messages. The goal is to provide quick evidence -that the observed behavior matches the intended flow without re-running code. 
-""" - -from __future__ import annotations - -import argparse -import ast -import re -from collections.abc import Sequence -from dataclasses import dataclass -from pathlib import Path - -ROOT_DIR = Path(__file__).resolve().parent.parent -LOG_DIR_DEFAULT = ROOT_DIR / ".tmp" / "examples-start-logs" - -ENTRY_RE = re.compile(r"^(PASSED|FAILED|SKIPPED|DRYRUN)\s+(\S+)(?:.*log=([^\s]+))?") - - -@dataclass -class MainEntry: - status: str - relpath: str - log_path: Path | None - - -@dataclass -class ValidationHit: - expectation: str - lines: list[str] - - -@dataclass -class ValidationResult: - relpath: str - log_path: Path | None - status: str # ok, warn, fail - hits: list[ValidationHit] - missing: list[str] - notes: list[str] - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Validate example behavior from logs.") - parser.add_argument( - "--main-log", - help="Path to the main log (defaults to latest main_*.log in logs dir).", - ) - parser.add_argument( - "--logs-dir", - default=str(LOG_DIR_DEFAULT), - help="Directory containing main and per-example logs.", - ) - parser.add_argument( - "--limit", - type=int, - default=5, - help="Maximum expectations to check per example (to keep output readable).", - ) - return parser.parse_args() - - -def find_latest_main_log(log_dir: Path) -> Path | None: - candidates = sorted(log_dir.glob("main_*.log"), key=lambda p: p.stat().st_mtime, reverse=True) - return candidates[0] if candidates else None - - -def parse_main_log(path: Path) -> list[MainEntry]: - entries: list[MainEntry] = [] - for raw_line in path.read_text(encoding="utf-8", errors="replace").splitlines(): - line = raw_line.strip() - match = ENTRY_RE.match(line) - if not match: - continue - status, relpath, log_rel = match.groups() - log_path = ROOT_DIR / log_rel if log_rel else None - entries.append(MainEntry(status=status, relpath=relpath, log_path=log_path)) - return entries - - -def clean_text(value: str) -> str: - return " ".join(value.split()) - - -def _extract_from_print_calls(tree: ast.AST) -> list[str]: - texts: list[str] = [] - for node in ast.walk(tree): - if ( - isinstance(node, ast.Call) - and isinstance(node.func, ast.Name) - and node.func.id == "print" - ): - for arg in node.args: - if isinstance(arg, ast.Constant) and isinstance(arg.value, str): - texts.append(arg.value) - return texts - - -def _extract_from_assignments(tree: ast.AST) -> list[str]: - texts: list[str] = [] - target_keywords = {"message", "prompt", "question", "instruction", "text"} - for node in ast.walk(tree): - if not isinstance(node, ast.Assign): - continue - if not isinstance(node.value, ast.Constant) or not isinstance(node.value.value, str): - continue - for target in node.targets: - if isinstance(target, ast.Name) and any( - key in target.id.lower() for key in target_keywords - ): - texts.append(node.value.value) - return texts - - -def derive_expectations(source: str, limit: int) -> list[str]: - try: - tree = ast.parse(source) - except SyntaxError: - return [] - - texts: list[str] = [] - texts.extend(_extract_from_print_calls(tree)) - texts.extend(_extract_from_assignments(tree)) - - cleaned: list[str] = [] - for text in texts: - normalized = clean_text(text) - if 8 <= len(normalized) <= 200: - cleaned.append(normalized) - - # Preserve order while removing duplicates. 
- seen = set() - ordered = [] - for item in cleaned: - if item not in seen: - seen.add(item) - ordered.append(item) - - return ordered[:limit] - - -def find_lines_with_snippet(lines: Sequence[str], snippet: str) -> list[str]: - hits: list[str] = [] - for line in lines: - if snippet in line: - hits.append(line.rstrip("\n")) - return hits - - -def validate_example(entry: MainEntry, limit: int) -> ValidationResult: - log_path = entry.log_path - notes: list[str] = [] - if log_path is None or not log_path.exists(): - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=["Log file not found."], - ) - - source_path = ROOT_DIR / entry.relpath - if not source_path.exists(): - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=["Source file not found."], - ) - - try: - source_text = source_path.read_text(encoding="utf-8") - except OSError as exc: - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=[f"Could not read source: {exc}"], - ) - - expectations = derive_expectations(source_text, limit=limit) - - try: - log_lines = log_path.read_text(encoding="utf-8", errors="replace").splitlines() - except OSError as exc: - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="fail", - hits=[], - missing=[], - notes=[f"Could not read log: {exc}"], - ) - - if not expectations: - notes.append("No expectations derived from source (skip validation heuristics).") - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status="warn", - hits=[], - missing=[], - notes=notes, - ) - - hits: list[ValidationHit] = [] - missing: list[str] = [] - - for expectation in expectations: - lines = find_lines_with_snippet(log_lines, expectation) - if lines: - hits.append(ValidationHit(expectation=expectation, lines=lines)) - else: - missing.append(expectation) - - if hits: - status = "ok" if not missing else "warn" - else: - status = "warn" - notes.append("No expected messages observed in log.") - - return ValidationResult( - relpath=entry.relpath, - log_path=log_path, - status=status, - hits=hits, - missing=missing, - notes=notes, - ) - - -def format_result(result: ValidationResult) -> list[str]: - lines: list[str] = [] - header = f"{result.status.upper():<4} {result.relpath}" - lines.append(header) - if result.log_path: - lines.append(f" log: {result.log_path}") - for hit in result.hits: - for line in hit.lines: - lines.append(f" hit: {line}") - for miss in result.missing: - lines.append(f" missing: {miss}") - for note in result.notes: - lines.append(f" note: {note}") - return lines - - -def main() -> int: - args = parse_args() - log_dir = Path(args.logs_dir) - main_log = Path(args.main_log) if args.main_log else find_latest_main_log(log_dir) - - if main_log is None: - print(f"No main log found under {log_dir}") - return 1 - if not main_log.exists(): - print(f"Main log does not exist: {main_log}") - return 1 - - entries = parse_main_log(main_log) - passed = [e for e in entries if e.status == "PASSED"] - - print(f"Behavioral validation for {main_log} ({len(passed)} passed entries)") - - if not passed: - print("No passed entries to validate.") - return 0 - - results = [validate_example(entry, limit=args.limit) for entry in passed] - - for result in results: - for line in format_result(result): - print(line) - - failures = sum(1 for r in results if r.status == "fail") - return 1 
if failures else 0 - - -if __name__ == "__main__": - raise SystemExit(main()) From bd9dd6fcc332315f24e9a5344be9195174bc60a6 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:15:41 +0900 Subject: [PATCH 08/15] fix --- .codex/skills/examples-auto-run/SKILL.md | 7 +------ examples/run_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/.codex/skills/examples-auto-run/SKILL.md b/.codex/skills/examples-auto-run/SKILL.md index 9fdab22469..cca492b6a8 100644 --- a/.codex/skills/examples-auto-run/SKILL.md +++ b/.codex/skills/examples-auto-run/SKILL.md @@ -68,15 +68,10 @@ description: Run python examples in auto mode with logging, rerun helpers, and b ## Behavioral validation (Codex/LLM responsibility) -The runner no longer auto-runs `examples/behavioral_validation.py`. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: +The runner does not perform any automated behavioral validation. After every foreground `start` or `rerun`, **Codex must manually validate** all exit-0 entries: 1. Read the example source (and comments) to infer intended flow, tools used, and expected key outputs. 2. Open the matching per-example log under `.tmp/examples-start-logs/`. 3. Confirm the intended actions/results occurred; flag omissions or divergences. 4. Do this for **all passed examples**, not just a sample. 5. Report immediately after the run with concise citations to the exact log lines that justify the validation. - -If you still want the heuristic validator, you can run it manually: -```bash -.codex/skills/examples-auto-run/scripts/run.sh validate -``` diff --git a/examples/run_examples.py b/examples/run_examples.py index fcc13f8e65..e828aacdd2 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -348,8 +348,8 @@ def run_single(example: ExampleScript) -> ExampleResult: env = os.environ.copy() if auto_mode: - env.setdefault("EXAMPLES_INTERACTIVE_MODE", "auto") - env.setdefault("APPLY_PATCH_AUTO_APPROVE", "1") + env["EXAMPLES_INTERACTIVE_MODE"] = "auto" + env["APPLY_PATCH_AUTO_APPROVE"] = "1" env.setdefault("SHELL_AUTO_APPROVE", "1") env.setdefault("AUTO_APPROVE_MCP", "1") From 2297757e4c5a69315ffbcd65cdb6b9c662e638b7 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:22:50 +0900 Subject: [PATCH 09/15] fix --- examples/run_examples.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index e828aacdd2..a15c9f32f7 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -268,11 +268,18 @@ def write_main_log_line(handle, line: str) -> None: handle.flush() -def ensure_dirs(path: Path) -> None: - if path.suffix: - path.parent.mkdir(parents=True, exist_ok=True) - else: - path.mkdir(parents=True, exist_ok=True) +def ensure_dirs(path: Path, is_file: bool | None = None) -> None: + """Create directories for a file or directory path. + + If `is_file` is True, always create the parent directory. If False, create the + directory itself. When None, treat paths with a suffix as files and others as + directories, but suffix-less file names should pass is_file=True to avoid + accidental directory creation. 
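+
+    For example, a suffix-less target such as Path(".tmp/rerun") would be
+    treated as a directory here unless the caller passes is_file=True.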
+ """ + if is_file is None: + is_file = bool(path.suffix) + target = path.parent if is_file else path + target.mkdir(parents=True, exist_ok=True) def parse_rerun_from_log(log_path: Path) -> list[str]: @@ -312,8 +319,8 @@ def run_examples(examples: Sequence[ExampleScript], args: argparse.Namespace) -> if auto_mode and "interactive" not in overrides: overrides.add("interactive") - ensure_dirs(logs_dir) - ensure_dirs(main_log_path) + ensure_dirs(logs_dir, is_file=False) + ensure_dirs(main_log_path, is_file=True) rerun_entries: list[str] = [] if not examples: @@ -344,7 +351,7 @@ def run_single(example: ExampleScript) -> ExampleResult: relpath = example.relpath log_filename = f"{relpath.replace('/', '__')}.log" log_path = logs_dir / log_filename - ensure_dirs(log_path) + ensure_dirs(log_path, is_file=True) env = os.environ.copy() if auto_mode: @@ -456,7 +463,7 @@ def run_single(example: ExampleScript) -> ExampleResult: safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") if args.write_rerun and rerun_entries: - ensure_dirs(RERUN_FILE_DEFAULT) + ensure_dirs(RERUN_FILE_DEFAULT, is_file=True) RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") @@ -493,7 +500,7 @@ def main() -> int: paths = parse_rerun_from_log(Path(args.collect)) if args.output: out = Path(args.output) - ensure_dirs(out) + ensure_dirs(out, is_file=True) out.write_text("\n".join(paths) + "\n", encoding="utf-8") print(f"Wrote {len(paths)} entries to {out}") else: From 2d8b516962c4a12223999f293921dc5b5d21447b Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:29:20 +0900 Subject: [PATCH 10/15] fix --- examples/run_examples.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index a15c9f32f7..168b8245a3 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -462,9 +462,13 @@ def run_single(example: ExampleScript) -> ExampleResult: rerun_entries.append(ex.relpath) safe_write_main(f"# summary executed={executed} skipped={skipped} failed={failed}") - if args.write_rerun and rerun_entries: + if args.write_rerun: ensure_dirs(RERUN_FILE_DEFAULT, is_file=True) - RERUN_FILE_DEFAULT.write_text("\n".join(rerun_entries) + "\n", encoding="utf-8") + if rerun_entries: + contents = "\n".join(rerun_entries) + "\n" + else: + contents = "" + RERUN_FILE_DEFAULT.write_text(contents, encoding="utf-8") print(f"Wrote rerun list to {RERUN_FILE_DEFAULT}") print(f"Main log: {main_log_path}") From 57b405e9c84c70fc03c985231027008ae6bcb9b3 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:31:59 +0900 Subject: [PATCH 11/15] fix --- examples/run_examples.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 168b8245a3..5b4285d598 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -369,20 +369,36 @@ def run_single(example: ExampleScript) -> ExampleResult: env=env, ) assert proc.stdout is not None + force_prompt_stream = (not auto_mode) and ("interactive" in example.tags) + buffer_output_local = buffer_output and not force_prompt_stream buffer_lines: list[str] = [] with log_path.open("w", encoding="utf-8") as per_log: - for line in proc.stdout: - per_log.write(line) - if buffer_output: - buffer_lines.append(line) - else: + if force_prompt_stream: + at_line_start = True + while True: + char = 
proc.stdout.read(1) + if char == "": + break + per_log.write(char) with output_lock: - sys.stdout.write(f"[{relpath}] {line}") + if at_line_start: + sys.stdout.write(f"[{relpath}] ") + sys.stdout.write(char) + sys.stdout.flush() + at_line_start = char == "\n" + else: + for line in proc.stdout: + per_log.write(line) + if buffer_output_local: + buffer_lines.append(line) + else: + with output_lock: + sys.stdout.write(f"[{relpath}] {line}") proc.wait() exit_code = proc.returncode - if buffer_output and buffer_lines: + if buffer_output_local and buffer_lines: with output_lock: for line in buffer_lines: sys.stdout.write(f"[{relpath}] {line}") From 4cf694873132dd70882c8591b93108aa3c646d6a Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:34:52 +0900 Subject: [PATCH 12/15] fix --- examples/run_examples.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 5b4285d598..bd0088a0c7 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -173,7 +173,11 @@ def detect_tags(path: Path, source: str) -> set[str]: lower_source = source.lower() lower_parts = [part.lower() for part in path.parts] - if re.search(r"\binput\s*\(", source) or "input_with_fallback(" in lower_source: + if ( + re.search(r"\binput\s*\(", source) + or "input_with_fallback(" in lower_source + or "confirm_with_fallback(" in lower_source + ): tags.add("interactive") if "prompt_toolkit" in lower_source or "questionary" in lower_source: tags.add("interactive") From 1dd7df3f654e36a6e7e03e6440aed1245d1c3f59 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:41:03 +0900 Subject: [PATCH 13/15] fix --- examples/run_examples.py | 15 ++++++++++----- examples/tools/apply_patch.py | 5 +++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index bd0088a0c7..edbf545bbf 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -22,7 +22,7 @@ from collections.abc import Iterable, Sequence from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass, field -from pathlib import Path +from pathlib import Path, PurePosixPath ROOT_DIR = Path(__file__).resolve().parent.parent EXAMPLES_DIR = ROOT_DIR / "examples" @@ -57,7 +57,7 @@ class ExampleScript: @property def relpath(self) -> str: - return str(self.path.relative_to(ROOT_DIR)) + return normalize_relpath(str(self.path.relative_to(ROOT_DIR))) @property def module(self) -> str: @@ -79,6 +79,11 @@ class ExampleResult: exit_code: int | None = None +def normalize_relpath(relpath: str) -> str: + normalized = relpath.replace("\\", "/") + return str(PurePosixPath(normalized)) + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Run example scripts sequentially.") parser.add_argument( @@ -263,8 +268,8 @@ def load_auto_skip() -> set[str]: env_value = os.environ.get("EXAMPLES_AUTO_SKIP", "") if env_value.strip(): parts = re.split(r"[\s,]+", env_value.strip()) - return {p for p in parts if p} - return set(DEFAULT_AUTO_SKIP) + return {normalize_relpath(p) for p in parts if p} + return {normalize_relpath(p) for p in DEFAULT_AUTO_SKIP} def write_main_log_line(handle, line: str) -> None: @@ -300,7 +305,7 @@ def parse_rerun_from_log(log_path: Path) -> list[str]: continue status, relpath = parts[0].upper(), parts[1] if status in {"FAILED", "ERROR", "UNKNOWN"}: - rerun.append(relpath) + rerun.append(normalize_relpath(relpath)) return rerun 
diff --git a/examples/tools/apply_patch.py b/examples/tools/apply_patch.py index d93c4f3fb8..57a49755c6 100644 --- a/examples/tools/apply_patch.py +++ b/examples/tools/apply_patch.py @@ -7,6 +7,7 @@ from agents import Agent, ApplyPatchTool, ModelSettings, Runner, apply_diff, trace from agents.editor import ApplyPatchOperation, ApplyPatchResult +from examples.auto_mode import confirm_with_fallback, is_auto_mode class ApprovalTracker: @@ -89,8 +90,8 @@ def _require_approval(self, operation: ApplyPatchOperation, display_path: str) - if operation.diff: preview = operation.diff if len(operation.diff) < 400 else f"{operation.diff[:400]}…" print("- diff preview:\n", preview) - answer = input("Proceed? [y/N] ").strip().lower() - if answer not in {"y", "yes"}: + approved = confirm_with_fallback("Proceed? [y/N] ", default=is_auto_mode()) + if not approved: raise RuntimeError("Apply patch operation rejected by user.") self._approvals.remember(fingerprint) From 25e552d65d0224a718046e51b542eb2010c82605 Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:42:25 +0900 Subject: [PATCH 14/15] fix --- examples/run_examples.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/run_examples.py b/examples/run_examples.py index edbf545bbf..4a15ecc662 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -469,6 +469,14 @@ def run_single(example: ExampleScript) -> ExampleResult: run_list.append(example) + if run_list and (not auto_mode) and any("interactive" in ex.tags for ex in run_list): + if jobs != 1: + print( + "Interactive examples detected; forcing serial execution to avoid shared stdin." + ) + safe_write_main("# jobs_adjusted: 1 reason=interactive") + jobs = 1 + run_results: dict[str, ExampleResult] = {} if run_list: with ThreadPoolExecutor(max_workers=jobs) as executor: From 46df3c3d78aff3f4642fabab1857663e16aac6bf Mon Sep 17 00:00:00 2001 From: Kazuhiro Sera Date: Tue, 13 Jan 2026 22:52:12 +0900 Subject: [PATCH 15/15] fix --- examples/run_examples.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/run_examples.py b/examples/run_examples.py index 4a15ecc662..a3a8174464 100644 --- a/examples/run_examples.py +++ b/examples/run_examples.py @@ -469,12 +469,16 @@ def run_single(example: ExampleScript) -> ExampleResult: run_list.append(example) - if run_list and (not auto_mode) and any("interactive" in ex.tags for ex in run_list): + interactive_in_run_list = any("interactive" in ex.tags for ex in run_list) + interactive_requested = "interactive" in overrides + + if run_list and (not auto_mode) and (interactive_in_run_list or interactive_requested): if jobs != 1: print( "Interactive examples detected; forcing serial execution to avoid shared stdin." ) - safe_write_main("# jobs_adjusted: 1 reason=interactive") + reason = "interactive" if interactive_in_run_list else "interactive-requested" + safe_write_main(f"# jobs_adjusted: 1 reason={reason}") jobs = 1 run_results: dict[str, ExampleResult] = {}