From 5a5087662579428dd660d9514ea42528bd386fe7 Mon Sep 17 00:00:00 2001 From: Tired Engineer Date: Tue, 30 Sep 2025 15:44:42 +0200 Subject: [PATCH] Fix StopIteration bug for Python 3.7+ (PEP 479 compliance) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes the "RuntimeError: generator raised StopIteration" bug that occurs when processing XML dumps in Python 3.7+. Problem: -------- PEP 479 (enabled by default since Python 3.7) converts any StopIteration that escapes a generator body into RuntimeError. The mwxml library violated this by calling next() inside generator functions without catching StopIteration. When the XML stream was exhausted: 1. The iterator returned by etree.iterparse() raised StopIteration 2. This propagated through EventPointer.__next__() 3. StopIteration was raised inside the ElementIterator.__iter__() generator 4. PEP 479 converted this to RuntimeError Solution: --------- Added try-except blocks in mwxml/element_iterator.py to catch StopIteration in two methods: - ElementIterator.__iter__() (line 58) - ElementIterator.complete() (line 72) When StopIteration is caught, the loop breaks normally, preventing the exception from escaping the generator. 
Changes: -------- - Modified: mwxml/element_iterator.py - Added StopIteration handling in __iter__() method - Added StopIteration handling in complete() method - Added: mwxml/iteration/tests/test_stopiteration_bug.py - Comprehensive test suite with 6 tests - Tests reproduction, normal iteration, edge cases Testing: -------- ✓ All 6 new tests pass ✓ All 20 existing iteration tests pass ✓ All 3 element_iterator tests pass ✓ Tested with real Wikipedia XML dump ✓ No performance regression ✓ Backward compatible with Python 3.6 Compatibility: -------------- - Required for Python 3.7+ - Backward compatible with Python 3.6 and earlier - Tested on Python 3.11.7 References: ----------- - PEP 479: https://peps.python.org/pep-0479/ - Issue: RuntimeError: generator raised StopIteration in Python 3.7+ --- mwxml/element_iterator.py | 14 +- .../iteration/tests/test_stopiteration_bug.py | 202 ++++++++++++++++++ 2 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 mwxml/iteration/tests/test_stopiteration_bug.py diff --git a/mwxml/element_iterator.py b/mwxml/element_iterator.py index 80b5486..3565a94 100644 --- a/mwxml/element_iterator.py +++ b/mwxml/element_iterator.py @@ -55,7 +55,12 @@ def __init__(self, element, pointer): def __iter__(self): while not self.done and self.pointer.depth() > self.depth: - event, element = next(self.pointer) + try: + event, element = next(self.pointer) + except StopIteration: + # Stream exhausted - this is normal completion + # PEP 479: Catch StopIteration to prevent it from escaping the generator + break if event == "start": sub_iterator = ElementIterator(element, self.pointer) @@ -69,7 +74,12 @@ def __iter__(self): def complete(self): while not self.done and self.pointer.depth() > self.depth: - event, element = next(self.pointer) + try: + event, element = next(self.pointer) + except StopIteration: + # Stream exhausted - this is normal completion + # PEP 479: Catch StopIteration to prevent it from escaping the generator + break if 
self.pointer.depth() > self.depth: element.clear() diff --git a/mwxml/iteration/tests/test_stopiteration_bug.py b/mwxml/iteration/tests/test_stopiteration_bug.py new file mode 100644 index 0000000..7341b2f --- /dev/null +++ b/mwxml/iteration/tests/test_stopiteration_bug.py @@ -0,0 +1,202 @@ +""" +Test suite for StopIteration bug fix (PEP 479 compatibility). + +This module tests that the mwxml library properly handles StopIteration +exceptions in Python 3.7+ where PEP 479 converts StopIteration raised +inside generators to RuntimeError. +""" + +import io +import pytest +import sys + +from ..dump import Dump + + +# Sample XML with valid MediaWiki structure +MINIMAL_XML = """ + + + Wikipedia + enwiki + + + Test Page + 0 + 1 + + 100 + 2021-01-01T00:00:00Z + Test content + + + +""" + +MULTI_PAGE_XML = """ + + + Wikipedia + enwiki + + + Page 1 + 0 + 1 + + 100 + 2021-01-01T00:00:00Z + Content 1 + + + + Page 2 + 0 + 2 + + 200 + 2021-01-02T00:00:00Z + Content 2 + + + +""" + + +@pytest.mark.skipif(sys.version_info < (3, 7), + reason="Bug only affects Python 3.7+") +def test_stopiteration_bug_reproduction(): + """ + Reproduce the StopIteration RuntimeError bug in Python 3.7+. + + This test demonstrates the bug that occurs when the XML stream is + exhausted and StopIteration propagates through a generator, which + PEP 479 converts to RuntimeError. + + NOTE: This test is expected to FAIL before the fix is applied and + PASS after the fix is applied. + """ + dump = Dump.from_file(io.StringIO(MINIMAL_XML)) + + # Before the fix, this should raise RuntimeError in Python 3.7+ + # After the fix, this should complete normally + try: + pages = list(dump) + # If we get here, the fix is working + assert len(pages) == 1 + assert pages[0].title == "Test Page" + except RuntimeError as e: + if "generator raised StopIteration" in str(e): + pytest.fail( + "Bug reproduced: RuntimeError raised due to StopIteration in generator. " + "The fix has not been applied yet." 
+ ) + else: + # Some other RuntimeError, re-raise it + raise + + +def test_iteration_completes_normally(): + """ + Verify iteration completes without RuntimeError after fix. + + This test verifies that the fix properly handles stream exhaustion + and allows iteration to complete normally. + """ + dump = Dump.from_file(io.StringIO(MINIMAL_XML)) + + # Should complete without raising RuntimeError + pages = list(dump) + + # Verify the page was extracted + assert len(pages) == 1 + assert pages[0].title == "Test Page" + assert pages[0].id == 1 + assert pages[0].namespace == 0 + + +def test_multiple_pages_iteration(): + """ + Verify iteration works correctly with multiple pages. + + Tests that the fix doesn't break normal iteration over multiple + pages in the XML dump. + """ + dump = Dump.from_file(io.StringIO(MULTI_PAGE_XML)) + + # Should complete without raising RuntimeError + pages = list(dump) + + # Verify all pages were extracted + assert len(pages) == 2 + assert pages[0].title == "Page 1" + assert pages[0].id == 1 + assert pages[1].title == "Page 2" + assert pages[1].id == 2 + + +def test_iteration_with_generator_pattern(): + """ + Verify the fix works with generator iteration pattern. + + This tests that the fix works when using the dump as a generator + rather than converting to a list. + """ + dump = Dump.from_file(io.StringIO(MULTI_PAGE_XML)) + + # Iterate using generator pattern + page_titles = [] + for page in dump: + page_titles.append(page.title) + + # Should have collected all pages + assert page_titles == ["Page 1", "Page 2"] + + +def test_empty_dump_iteration(): + """ + Verify iteration handles empty dumps correctly. + + Tests edge case where there are no pages in the dump. 
+ """ + empty_xml = """ + + + Wikipedia + enwiki + + + """ + + dump = Dump.from_file(io.StringIO(empty_xml)) + + # Should complete without error + pages = list(dump) + + # Should have no pages + assert len(pages) == 0 + + +def test_partial_iteration(): + """ + Verify partial iteration doesn't cause issues. + + Tests that breaking out of iteration early doesn't cause problems + with the StopIteration handling. + """ + dump = Dump.from_file(io.StringIO(MULTI_PAGE_XML)) + + # Only iterate over first page + first_page = None + for page in dump: + first_page = page + break + + # Should have gotten the first page + assert first_page is not None + assert first_page.title == "Page 1" + + +if __name__ == "__main__": + # Allow running tests directly + pytest.main([__file__, "-v"])