From 824faeb98aad9b089c8bfb796ecb3604242c8581 Mon Sep 17 00:00:00 2001
From: Ankan Misra <misra13arko@gmail.com>
Date: Sat, 3 Jan 2026 00:38:10 +0530
Subject: [PATCH 1/3] fix(handoffs): filter duplicate items from model input
 when nest_handoff_history is enabled (#2171)

---
 src/agents/_run_impl.py                   |  30 ++-
 src/agents/handoffs/__init__.py           |   7 +
 src/agents/handoffs/history.py            |  58 ++++-
 src/agents/run.py                         |  11 +-
 tests/test_agent_runner.py                |  11 +-
 tests/test_agent_runner_streamed.py       |  20 +-
 tests/test_handoff_history_duplication.py | 276 ++++++++++++++++++++++
 tests/test_soft_cancel.py                 |   2 +-
 8 files changed, 377 insertions(+), 38 deletions(-)
 create mode 100644 tests/test_handoff_history_duplication.py

diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
index 54fceef57f..6e0f9cf3eb 100644
--- a/src/agents/_run_impl.py
+++ b/src/agents/_run_impl.py
@@ -243,7 +243,8 @@ class SingleStepResult:
     """Items generated before the current step."""
 
     new_step_items: list[RunItem]
-    """Items generated during this current step."""
+    """Items generated during this current step. May be filtered during handoffs to avoid
+    duplication in model input."""
 
     next_step: NextStepHandoff | NextStepFinalOutput | NextStepRunAgain
     """The next step to take."""
@@ -254,11 +255,18 @@ class SingleStepResult:
     tool_output_guardrail_results: list[ToolOutputGuardrailResult]
     """Tool output guardrail results from this step."""
 
+    session_step_items: list[RunItem] | None = None
+    """Full unfiltered items for session history. When set, these are used instead of
+    new_step_items for session saving and generated_items property."""
+
     @property
     def generated_items(self) -> list[RunItem]:
         """Items generated during the agent run (i.e. everything generated after
         `original_input`)."""
-        return self.pre_step_items + self.new_step_items
+        items = (
+            self.session_step_items if self.session_step_items is not None else self.new_step_items
+        )
+        return self.pre_step_items + items
 
 
 def get_model_tracing_impl(
@@ -1285,6 +1293,12 @@ async def execute_handoffs(
                 )
                 pre_step_items = list(filtered.pre_handoff_items)
                 new_step_items = list(filtered.new_items)
+                # For custom input filters, use input_items if available, otherwise new_items
+                if filtered.input_items is not None:
+                    session_step_items = list(filtered.new_items)
+                    new_step_items = list(filtered.input_items)
+                else:
+                    session_step_items = None
             elif should_nest_history and handoff_input_data is not None:
                 nested = nest_handoff_history(
                     handoff_input_data,
@@ -1296,7 +1310,16 @@ async def execute_handoffs(
                     else list(nested.input_history)
                 )
                 pre_step_items = list(nested.pre_handoff_items)
-                new_step_items = list(nested.new_items)
+                # Keep full new_items for session history.
+                session_step_items = list(nested.new_items)
+                # Use input_items (filtered) for model input if available.
+                if nested.input_items is not None:
+                    new_step_items = list(nested.input_items)
+                else:
+                    new_step_items = session_step_items
+            else:
+                # No filtering or nesting - session_step_items not needed
+                session_step_items = None
 
         return SingleStepResult(
             original_input=original_input,
@@ -1306,6 +1329,7 @@ async def execute_handoffs(
             next_step=NextStepHandoff(new_agent),
             tool_input_guardrail_results=[],
             tool_output_guardrail_results=[],
+            session_step_items=session_step_items,
         )
 
     @classmethod
diff --git a/src/agents/handoffs/__init__.py b/src/agents/handoffs/__init__.py
index 0876bfa581..11372dde0e 100644
--- a/src/agents/handoffs/__init__.py
+++ b/src/agents/handoffs/__init__.py
@@ -62,6 +62,13 @@ class HandoffInputData:
     later on, it is optional for backwards compatibility.
     """
 
+    input_items: tuple[RunItem, ...] | None = None
+    """
+    Items to include in the next agent's input. When set, these items are used instead of
+    new_items for building the input to the next agent. This allows filtering duplicates
+    from agent input while preserving all items in new_items for session history.
+    """
+
     def clone(self, **kwargs: Any) -> HandoffInputData:
         """
         Make a copy of the handoff input data, with the given arguments changed. For example, you
diff --git a/src/agents/handoffs/history.py b/src/agents/handoffs/history.py
index dc59547fbf..caac4b5cbf 100644
--- a/src/agents/handoffs/history.py
+++ b/src/agents/handoffs/history.py
@@ -26,6 +26,13 @@
 _conversation_history_start = _DEFAULT_CONVERSATION_HISTORY_START
 _conversation_history_end = _DEFAULT_CONVERSATION_HISTORY_END
 
+# Input item types whose content is already captured in the nested conversation summary.
+# They are not forwarded verbatim to the next agent, which would duplicate them.
+_SUMMARY_ONLY_INPUT_TYPES = {
+    "function_call",
+    "function_call_output",
+}
+
 
 def set_conversation_history_wrappers(
     *,
@@ -67,23 +74,34 @@ def nest_handoff_history(
 
     normalized_history = _normalize_input_history(handoff_input_data.input_history)
     flattened_history = _flatten_nested_history_messages(normalized_history)
-    pre_items_as_inputs = [
-        _run_item_to_plain_input(item) for item in handoff_input_data.pre_handoff_items
-    ]
-    new_items_as_inputs = [_run_item_to_plain_input(item) for item in handoff_input_data.new_items]
+
+    # Convert items to plain inputs for the transcript summary.
+    pre_items_as_inputs: list[TResponseInputItem] = []
+    filtered_pre_items: list[RunItem] = []
+    for run_item in handoff_input_data.pre_handoff_items:
+        plain_input = _run_item_to_plain_input(run_item)
+        pre_items_as_inputs.append(plain_input)
+        if _should_forward_pre_item(plain_input):
+            filtered_pre_items.append(run_item)
+
+    new_items_as_inputs: list[TResponseInputItem] = []
+    filtered_input_items: list[RunItem] = []
+    for run_item in handoff_input_data.new_items:
+        plain_input = _run_item_to_plain_input(run_item)
+        new_items_as_inputs.append(plain_input)
+        if _should_forward_new_item(plain_input):
+            filtered_input_items.append(run_item)
+
     transcript = flattened_history + pre_items_as_inputs + new_items_as_inputs
 
     mapper = history_mapper or default_handoff_history_mapper
     history_items = mapper(transcript)
-    filtered_pre_items = tuple(
-        item
-        for item in handoff_input_data.pre_handoff_items
-        if _get_run_item_role(item) != "assistant"
-    )
 
     return handoff_input_data.clone(
         input_history=tuple(deepcopy(item) for item in history_items),
-        pre_handoff_items=filtered_pre_items,
+        pre_handoff_items=tuple(filtered_pre_items),
+        # new_items stays unchanged for session history.
+        input_items=tuple(filtered_input_items),
     )
 
 
@@ -231,6 +249,20 @@ def _split_role_and_name(role_text: str) -> tuple[str, str | None]:
     return (role_text or "developer", None)
 
 
-def _get_run_item_role(run_item: RunItem) -> str | None:
-    role_candidate = run_item.to_input_item().get("role")
-    return role_candidate if isinstance(role_candidate, str) else None
+def _should_forward_pre_item(input_item: TResponseInputItem) -> bool:
+    """Return True unless the item is an assistant message or a summarized tool item."""
+    role = input_item.get("role")
+    is_assistant = isinstance(role, str) and role == "assistant"
+    item_type = input_item.get("type")
+    is_summarized = isinstance(item_type, str) and item_type in _SUMMARY_ONLY_INPUT_TYPES
+    return not (is_assistant or is_summarized)
+
+
+def _should_forward_new_item(input_item: TResponseInputItem) -> bool:
+    """Return True for role-bearing items and for items the summary does not cover."""
+    role = input_item.get("role")
+    # Any item carrying a non-empty string role is always forwarded.
+    has_role = isinstance(role, str) and bool(role)
+    item_type = input_item.get("type")
+    is_summarized = isinstance(item_type, str) and item_type in _SUMMARY_ONLY_INPUT_TYPES
+    return has_role or not is_summarized
diff --git a/src/agents/run.py b/src/agents/run.py
index 5b5e6fdfae..da373ea824 100644
--- a/src/agents/run.py
+++ b/src/agents/run.py
@@ -1740,10 +1740,15 @@ async def _get_single_step_result_from_streamed_response(
             context_wrapper=context_wrapper,
             run_config=run_config,
         )
+        # Use session_step_items (unfiltered) if available for streaming observability,
+        # otherwise fall back to new_step_items.
+        streaming_items = (
+            single_step_result.session_step_items
+            if single_step_result.session_step_items is not None
+            else single_step_result.new_step_items
+        )
         new_step_items = [
-            item
-            for item in single_step_result.new_step_items
-            if item not in new_items_processed_response
+            item for item in streaming_items if item not in new_items_processed_response
         ]
         RunImpl.stream_step_items_to_queue(new_step_items, event_queue)
 
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
index 6dcfc06afe..4f0885add8 100644
--- a/tests/test_agent_runner.py
+++ b/tests/test_agent_runner.py
@@ -172,9 +172,9 @@ async def test_handoffs():
 
     assert result.final_output == "done"
     assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 7, (
-        "should have 7 inputs: summary message, tool call, tool result, message, handoff, "
-        "handoff result, and done message"
+    assert len(result.to_input_list()) == 5, (
+        "should have 5 session items: summary message (contains pre_handoff tool call/result), "
+        "message, handoff call, handoff output, and done message"
     )
     assert result.last_agent == agent_1, "should have handed off to agent_1"
 
@@ -226,10 +226,7 @@ async def test_structured_output():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 10, (
-        "should have input: conversation summary, function call, function call result, message, "
-        "handoff, handoff output, preamble message, tool call, tool call result, final output"
-    )
+    assert len(result.to_input_list()) == 8, "should have 8 inputs including summary"
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
diff --git a/tests/test_agent_runner_streamed.py b/tests/test_agent_runner_streamed.py
index 222afda78c..d1ab1635de 100644
--- a/tests/test_agent_runner_streamed.py
+++ b/tests/test_agent_runner_streamed.py
@@ -175,9 +175,9 @@ async def test_handoffs():
 
     assert result.final_output == "done"
     assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 7, (
-        "should have 7 inputs: summary message, tool call, tool result, message, handoff, "
-        "handoff result, and done message"
+    assert len(result.to_input_list()) == 5, (
+        "should have 5 session items: summary message (contains pre_handoff tool call/result), "
+        "message, handoff call, handoff output, and done message"
     )
     assert result.last_agent == agent_1, "should have handed off to agent_1"
 
@@ -231,10 +231,7 @@ async def test_structured_output():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 10, (
-        "should have input: conversation summary, function call, function call result, message, "
-        "handoff, handoff output, preamble message, tool call, tool call result, final output"
-    )
+    assert len(result.to_input_list()) == 8, "should have 8 inputs including summary"
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
@@ -717,9 +714,9 @@ async def test_streaming_events():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 9, (
-        "should have input: conversation summary, function call, function call result, message, "
-        "handoff, handoff output, tool call, tool call result, final output"
+    assert len(result.to_input_list()) == 7, (
+        "should have input: conversation summary (contains pre-handoff tool calls), "
+        "message, handoff, handoff output, tool call, tool call result, final output"
     )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
@@ -737,7 +734,8 @@ async def test_streaming_events():
         "tool_call_output": 2,
         "message": 2,  # get_text_message("a_message") + get_final_output_message(...)
         "handoff": 1,  # get_handoff_tool_call(agent_1)
-        "handoff_output": 1,  # handoff_output_item
+        # handoff_output is summarized in conversation history, not duplicated as raw item
+        "handoff_output": 0,
     }
 
     total_expected_item_count = sum(expected_item_type_map.values())
diff --git a/tests/test_handoff_history_duplication.py b/tests/test_handoff_history_duplication.py
new file mode 100644
index 0000000000..617c7ef710
--- /dev/null
+++ b/tests/test_handoff_history_duplication.py
@@ -0,0 +1,276 @@
+"""Tests for handoff history duplication fix (Issue #2171).
+
+These tests verify that when nest_handoff_history is enabled,
+function_call and function_call_output items are NOT duplicated
+in the input sent to the next agent.
+"""
+
+from typing import Any, cast
+
+from openai.types.responses import (
+    ResponseFunctionToolCall,
+    ResponseOutputMessage,
+    ResponseOutputText,
+)
+
+from agents import Agent
+from agents.handoffs import HandoffInputData, nest_handoff_history
+from agents.items import (
+    HandoffCallItem,
+    HandoffOutputItem,
+    MessageOutputItem,
+    ToolCallItem,
+    ToolCallOutputItem,
+)
+
+
+def _create_mock_agent() -> Agent:
+    """Return a minimal Agent named "test_agent" for use in these tests."""
+    return Agent(name="test_agent")
+
+
+def _create_tool_call_item(agent: Agent) -> ToolCallItem:
+    """Build a ToolCallItem wrapping a ``get_weather`` function call for ``agent``."""
+    weather_call = ResponseFunctionToolCall(
+        type="function_call",
+        name="get_weather",
+        arguments='{"city": "London"}',
+        call_id="call_tool_123",
+        id="call_tool_123",
+    )
+    return ToolCallItem(type="tool_call_item", raw_item=weather_call, agent=agent)
+
+
+def _create_tool_output_item(agent: Agent) -> ToolCallOutputItem:
+    """Build the ToolCallOutputItem that answers the ``call_tool_123`` function call."""
+    output_payload = {
+        "type": "function_call_output",
+        "call_id": "call_tool_123",
+        "output": "Sunny, 22°C",
+    }
+    return ToolCallOutputItem(
+        type="tool_call_output_item",
+        output="Sunny, 22°C",
+        raw_item=output_payload,
+        agent=agent,
+    )
+
+
+def _create_handoff_call_item(agent: Agent) -> HandoffCallItem:
+    """Build a HandoffCallItem wrapping a ``transfer_to_agent_b`` function call."""
+    transfer_call = ResponseFunctionToolCall(
+        type="function_call",
+        name="transfer_to_agent_b",
+        arguments="{}",
+        call_id="call_handoff_456",
+        id="call_handoff_456",
+    )
+    return HandoffCallItem(type="handoff_call_item", raw_item=transfer_call, agent=agent)
+
+
+def _create_handoff_output_item(agent: Agent[Any]) -> HandoffOutputItem:
+    """Build the HandoffOutputItem for ``call_handoff_456``; ``agent`` is source and target."""
+    handoff_payload: dict[str, str] = {
+        "type": "function_call_output",
+        "call_id": "call_handoff_456",
+        "output": '{"assistant": "agent_b"}',
+    }
+    return HandoffOutputItem(
+        type="handoff_output_item",
+        raw_item=cast(Any, handoff_payload),
+        agent=agent,
+        source_agent=agent,
+        target_agent=agent,
+    )
+
+
+def _create_message_item(agent: Agent) -> MessageOutputItem:
+    """Build a MessageOutputItem holding a completed assistant "Hello!" message."""
+    assistant_message = ResponseOutputMessage(
+        type="message",
+        role="assistant",
+        status="completed",
+        content=[ResponseOutputText(type="output_text", text="Hello!", annotations=[])],
+        id="msg_123",
+    )
+    return MessageOutputItem(type="message_output_item", raw_item=assistant_message, agent=agent)
+
+
+class TestHandoffHistoryDuplicationFix:
+    """Tests for Issue #2171: nest_handoff_history duplication fix."""
+
+    def test_pre_handoff_tool_items_are_filtered(self):
+        """Verify ToolCallItem and ToolCallOutputItem in pre_handoff_items are filtered.
+
+        These items should NOT appear in the filtered output because they are
+        already included in the summary message.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(
+                _create_tool_call_item(agent),
+                _create_tool_output_item(agent),
+            ),
+            new_items=(),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        # pre_handoff_items should be empty (tool items filtered)
+        assert len(nested.pre_handoff_items) == 0, (
+            "ToolCallItem and ToolCallOutputItem should be filtered from pre_handoff_items"
+        )
+
+        # Summary should contain the conversation
+        assert len(nested.input_history) == 1
+        first_item = nested.input_history[0]
+        assert isinstance(first_item, dict)
+        assert "<CONVERSATION HISTORY>" in str(first_item.get("content", ""))
+
+    def test_new_items_handoff_output_is_filtered_for_input(self):
+        """Verify HandoffOutputItem in new_items is filtered from input_items.
+
+        The HandoffOutputItem is a function_call_output which would be duplicated.
+        It should be filtered from input_items but preserved in new_items.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(),
+            new_items=(
+                _create_handoff_call_item(agent),
+                _create_handoff_output_item(agent),
+            ),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        # new_items should still have both items (for session history)
+        assert len(nested.new_items) == 2, "new_items should preserve all items for session history"
+
+        # input_items should be populated and filtered
+        assert nested.input_items is not None, "input_items should be populated"
+
+        # input_items should NOT contain HandoffOutputItem (it's function_call_output)
+        has_handoff_output = any(isinstance(item, HandoffOutputItem) for item in nested.input_items)
+        assert not has_handoff_output, "HandoffOutputItem should be filtered from input_items"
+
+    def test_message_items_are_preserved_in_new_items(self):
+        """Verify MessageOutputItem in new_items is preserved.
+
+        Message items have a 'role' and should NOT be filtered from input_items.
+        Note: pre_handoff_items are converted to summary text regardless of type.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(),  # pre_handoff items go into summary
+            new_items=(_create_message_item(agent),),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        # Message items should be preserved in new_items
+        assert len(nested.new_items) == 1, "MessageOutputItem should be preserved in new_items"
+        # And in input_items (since it has a role)
+        assert nested.input_items is not None
+        assert len(nested.input_items) == 1, "MessageOutputItem should be preserved in input_items"
+        assert isinstance(nested.input_items[0], MessageOutputItem)
+
+    def test_summary_contains_filtered_items_as_text(self):
+        """Verify the summary message contains the filtered tool items as text.
+
+        This ensures observability - the items are not lost, just converted to text.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(
+                _create_tool_call_item(agent),
+                _create_tool_output_item(agent),
+            ),
+            new_items=(),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        first_item = nested.input_history[0]
+        assert isinstance(first_item, dict)
+        summary = str(first_item.get("content", ""))
+
+        # Summary should contain function_call reference
+        assert "function_call" in summary or "get_weather" in summary, (
+            "Summary should contain the tool call that was filtered"
+        )
+
+    def test_input_items_field_exists_after_nesting(self):
+        """Verify input_items is set (not None) after nest_handoff_history.
+
+        This is the key field that separates model input from session history.
+        """
+        agent = _create_mock_agent()
+
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "Hello"},),
+            pre_handoff_items=(),
+            new_items=(_create_handoff_call_item(agent),),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        assert nested.input_items is not None, (
+            "input_items should be populated after nest_handoff_history"
+        )
+
+    def test_full_handoff_scenario_no_duplication(self):
+        """Combined scenario from Issue #2171, driven through nest_handoff_history.
+
+        Simulates: User -> Agent does tool call -> Agent hands off to next agent
+        Verifies: no raw tool call/output items are forwarded alongside the summary.
+        """
+        agent = _create_mock_agent()
+
+        # Full scenario: tool call in pre_handoff, handoff in new_items
+        handoff_data = HandoffInputData(
+            input_history=({"role": "user", "content": "What's the weather?"},),
+            pre_handoff_items=(
+                _create_tool_call_item(agent),  # function_call
+                _create_tool_output_item(agent),  # function_call_output
+            ),
+            new_items=(
+                _create_message_item(agent),  # assistant message
+                _create_handoff_call_item(agent),  # function_call (handoff)
+                _create_handoff_output_item(agent),  # function_call_output (handoff)
+            ),
+        )
+
+        nested = nest_handoff_history(handoff_data)
+
+        # Count what would be sent to the model
+        total_model_items = (
+            len(nested.input_history)  # Summary
+            + len(nested.pre_handoff_items)  # Filtered pre-handoff
+            + len(nested.input_items or [])  # Filtered new items
+        )
+
+        # Before fix: would have 6+ items (summary + raw tool items)
+        # After fix: should have ~2 items (summary + assistant message); 3 is an upper bound
+        assert total_model_items <= 3, (
+            f"Model should receive at most 3 items (summary + messages), got {total_model_items}"
+        )
+
+        # Verify no raw function_call_output items in model input
+        all_input_items = list(nested.pre_handoff_items) + list(nested.input_items or [])
+        function_call_outputs = [
+            item
+            for item in all_input_items
+            if isinstance(item, (ToolCallOutputItem, HandoffOutputItem))
+        ]
+        assert len(function_call_outputs) == 0, (
+            "No function_call_output items should be in model input"
+        )
diff --git a/tests/test_soft_cancel.py b/tests/test_soft_cancel.py
index 395f2fb6f3..ddb51f8f17 100644
--- a/tests/test_soft_cancel.py
+++ b/tests/test_soft_cancel.py
@@ -421,7 +421,7 @@ async def on_invoke_handoff(context, data):
 
     handoff_seen = False
     async for event in result.stream_events():
-        if event.type == "run_item_stream_event" and event.name == "handoff_occured":
+        if event.type == "run_item_stream_event" and event.name == "handoff_requested":
             handoff_seen = True
             # Cancel right after handoff
             result.cancel(mode="after_turn")

From 562d30b87dc20c410a792ad0dc0fb9bb9e3ade03 Mon Sep 17 00:00:00 2001
From: Ankan Misra <misra13arko@gmail.com>
Date: Sat, 3 Jan 2026 18:50:25 +0530
Subject: [PATCH 2/3] fix: session save drops unfiltered handoff items

---
 src/agents/run.py | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/src/agents/run.py b/src/agents/run.py
index da373ea824..6ddea2dc65 100644
--- a/src/agents/run.py
+++ b/src/agents/run.py
@@ -736,7 +736,11 @@ async def run(
                                 for guardrail_result in input_guardrail_results
                             ):
                                 await self._save_result_to_session(
-                                    session, [], turn_result.new_step_items
+                                    session,
+                                    [],
+                                    turn_result.session_step_items
+                                    if turn_result.session_step_items is not None
+                                    else turn_result.new_step_items,
                                 )
 
                             return result
@@ -748,7 +752,11 @@ async def run(
                                     for guardrail_result in input_guardrail_results
                                 ):
                                     await self._save_result_to_session(
-                                        session, [], turn_result.new_step_items
+                                        session,
+                                        [],
+                                        turn_result.session_step_items
+                                        if turn_result.session_step_items is not None
+                                        else turn_result.new_step_items,
                                     )
                             current_agent = cast(Agent[TContext], turn_result.next_step.new_agent)
                             current_span.finish(reset_current=True)
@@ -760,7 +768,11 @@ async def run(
                                 for guardrail_result in input_guardrail_results
                             ):
                                 await self._save_result_to_session(
-                                    session, [], turn_result.new_step_items
+                                    session,
+                                    [],
+                                    turn_result.session_step_items
+                                    if turn_result.session_step_items is not None
+                                    else turn_result.new_step_items,
                                 )
                         else:
                             raise AgentsException(
@@ -1229,7 +1241,11 @@ async def _start_streaming(
                             )
                             if should_skip_session_save is False:
                                 await AgentRunner._save_result_to_session(
-                                    session, [], turn_result.new_step_items
+                                    session,
+                                    [],
+                                    turn_result.session_step_items
+                                    if turn_result.session_step_items is not None
+                                    else turn_result.new_step_items,
                                 )
 
                         current_agent = turn_result.next_step.new_agent
@@ -1275,7 +1291,11 @@ async def _start_streaming(
                             )
                             if should_skip_session_save is False:
                                 await AgentRunner._save_result_to_session(
-                                    session, [], turn_result.new_step_items
+                                    session,
+                                    [],
+                                    turn_result.session_step_items
+                                    if turn_result.session_step_items is not None
+                                    else turn_result.new_step_items,
                                 )
 
                         streamed_result._event_queue.put_nowait(QueueCompleteSentinel())
@@ -1288,7 +1308,11 @@ async def _start_streaming(
                             )
                             if should_skip_session_save is False:
                                 await AgentRunner._save_result_to_session(
-                                    session, [], turn_result.new_step_items
+                                    session,
+                                    [],
+                                    turn_result.session_step_items
+                                    if turn_result.session_step_items is not None
+                                    else turn_result.new_step_items,
                                 )
 
                         # Check for soft cancel after turn completion

From d568a0efeeedf96b2a789b6ca6b1c20aeb5f4d28 Mon Sep 17 00:00:00 2001
From: Ankan Misra <misra13arko@gmail.com>
Date: Sat, 3 Jan 2026 19:06:57 +0530
Subject: [PATCH 3/3] fix(handoffs): preserve unfiltered items for session
 history and observability

---
 src/agents/_run_impl.py             |  2 +-
 src/agents/run.py                   | 25 ++++++++++++++++++++-----
 tests/test_agent_runner.py          | 11 +++++++----
 tests/test_agent_runner_streamed.py | 17 ++++++++++-------
 4 files changed, 38 insertions(+), 17 deletions(-)

diff --git a/src/agents/_run_impl.py b/src/agents/_run_impl.py
index 6e0f9cf3eb..bd5c3e9a27 100644
--- a/src/agents/_run_impl.py
+++ b/src/agents/_run_impl.py
@@ -262,7 +262,7 @@ class SingleStepResult:
     @property
     def generated_items(self) -> list[RunItem]:
         """Items generated during the agent run (i.e. everything generated after
-        `original_input`)."""
+        `original_input`). Uses session_step_items when available for full observability."""
         items = (
             self.session_step_items if self.session_step_items is not None else self.new_step_items
         )
diff --git a/src/agents/run.py b/src/agents/run.py
index 6ddea2dc65..8c63eec5b0 100644
--- a/src/agents/run.py
+++ b/src/agents/run.py
@@ -579,7 +579,8 @@ async def run(
         ):
             current_turn = 0
             original_input: str | list[TResponseInputItem] = _copy_str_or_list(prepared_input)
-            generated_items: list[RunItem] = []
+            generated_items: list[RunItem] = []  # For model input (may be filtered on handoffs)
+            session_items: list[RunItem] = []  # For observability (always unfiltered)
             model_responses: list[ModelResponse] = []
 
             context_wrapper: RunContextWrapper[TContext] = RunContextWrapper(
@@ -701,7 +702,15 @@ async def run(
 
                     model_responses.append(turn_result.model_response)
                     original_input = turn_result.original_input
-                    generated_items = turn_result.generated_items
+                    # Model input: pre_step_items + new_step_items (latter filtered on handoffs)
+                    generated_items = turn_result.pre_step_items + turn_result.new_step_items
+                    # Accumulate unfiltered items for observability
+                    session_items_for_turn = (
+                        turn_result.session_step_items
+                        if turn_result.session_step_items is not None
+                        else turn_result.new_step_items
+                    )
+                    session_items.extend(session_items_for_turn)
 
                     if server_conversation_tracker is not None:
                         server_conversation_tracker.track_server_items(turn_result.model_response)
@@ -721,7 +730,7 @@ async def run(
                             )
                             result = RunResult(
                                 input=original_input,
-                                new_items=generated_items,
+                                new_items=session_items,  # Use unfiltered items for observability
                                 raw_responses=model_responses,
                                 final_output=turn_result.next_step.output,
                                 _last_agent=current_agent,
@@ -788,7 +797,7 @@ async def run(
             except AgentsException as exc:
                 exc.run_data = RunErrorDetails(
                     input=original_input,
-                    new_items=generated_items,
+                    new_items=session_items,  # Use unfiltered items for observability
                     raw_responses=model_responses,
                     last_agent=current_agent,
                     context_wrapper=context_wrapper,
@@ -1225,7 +1234,13 @@ async def _start_streaming(
                         turn_result.model_response
                     ]
                     streamed_result.input = turn_result.original_input
-                    streamed_result.new_items = turn_result.generated_items
+                    # Accumulate unfiltered items for observability
+                    session_items_for_turn = (
+                        turn_result.session_step_items
+                        if turn_result.session_step_items is not None
+                        else turn_result.new_step_items
+                    )
+                    streamed_result.new_items.extend(session_items_for_turn)
 
                     if server_conversation_tracker is not None:
                         server_conversation_tracker.track_server_items(turn_result.model_response)
diff --git a/tests/test_agent_runner.py b/tests/test_agent_runner.py
index 4f0885add8..6dcfc06afe 100644
--- a/tests/test_agent_runner.py
+++ b/tests/test_agent_runner.py
@@ -172,9 +172,9 @@ async def test_handoffs():
 
     assert result.final_output == "done"
     assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 5, (
-        "should have 5 session items: summary message (contains pre_handoff tool call/result), "
-        "message, handoff call, handoff output, and done message"
+    assert len(result.to_input_list()) == 7, (
+        "should have 7 inputs: summary message, tool call, tool result, message, handoff, "
+        "handoff result, and done message"
     )
     assert result.last_agent == agent_1, "should have handed off to agent_1"
 
@@ -226,7 +226,10 @@ async def test_structured_output():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 8, "should have 8 inputs including summary"
+    assert len(result.to_input_list()) == 10, (
+        "should have input: conversation summary, function call, function call result, message, "
+        "handoff, handoff output, preamble message, tool call, tool call result, final output"
+    )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
diff --git a/tests/test_agent_runner_streamed.py b/tests/test_agent_runner_streamed.py
index d1ab1635de..a520140a7c 100644
--- a/tests/test_agent_runner_streamed.py
+++ b/tests/test_agent_runner_streamed.py
@@ -175,9 +175,9 @@ async def test_handoffs():
 
     assert result.final_output == "done"
     assert len(result.raw_responses) == 3, "should have three model responses"
-    assert len(result.to_input_list()) == 5, (
-        "should have 5 session items: summary message (contains pre_handoff tool call/result), "
-        "message, handoff call, handoff output, and done message"
+    assert len(result.to_input_list()) == 7, (
+        "should have 7 inputs: summary message, tool call, tool result, message, handoff, "
+        "handoff result, and done message"
     )
     assert result.last_agent == agent_1, "should have handed off to agent_1"
 
@@ -231,7 +231,10 @@ async def test_structured_output():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 8, "should have 8 inputs including summary"
+    assert len(result.to_input_list()) == 10, (
+        "should have input: conversation summary, function call, function call result, message, "
+        "handoff, handoff output, preamble message, tool call, tool call result, final output"
+    )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"
     assert result.final_output == Foo(bar="baz"), "should have structured output"
@@ -714,9 +717,9 @@ async def test_streaming_events():
 
     assert result.final_output == Foo(bar="baz")
     assert len(result.raw_responses) == 4, "should have four model responses"
-    assert len(result.to_input_list()) == 7, (
-        "should have input: conversation summary (contains pre-handoff tool calls), "
-        "message, handoff, handoff output, tool call, tool call result, final output"
+    assert len(result.to_input_list()) == 9, (
+        "should have input: conversation summary, function call, function call result, message, "
+        "handoff, handoff output, tool call, tool call result, final output"
     )
 
     assert result.last_agent == agent_1, "should have handed off to agent_1"