From 3fd0accf66d1d2db73a607720944efff07d0368d Mon Sep 17 00:00:00 2001 From: Fahad Date: Wed, 26 Nov 2025 19:22:03 +0300 Subject: [PATCH 1/9] FIX: added on_bad_lines support for dtype conversion failures #63168 --- pandas/_libs/parsers.pyx | 96 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index c2767dc47b5e4..e3d063a964227 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -937,6 +937,11 @@ cdef class TextReader: int64_t num_cols dict results bint is_default_dict_dtype + set bad_rows + dict failed_columns_dtypes + + bad_rows = set() + failed_columns_dtypes = {} start = self.parser_start @@ -1009,6 +1014,26 @@ cdef class TextReader: col_res, na_count = self._convert_tokens( i, start, end, name, na_filter, na_hashset, na_fset, col_dtype) + except (ValueError, TypeError, OverflowError) as e: + # GH#63168: Handle dtype conversion failures based on on_bad_lines + if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN: + # Fall back to string conversion + col_res, na_count = self._string_convert( + i, start, end, na_filter, na_hashset) + + # Track this column's intended dtype for later bad row detection + if col_dtype is not None: + failed_columns_dtypes[i] = col_dtype + + if self.parser.on_bad_lines == WARN: + warnings.warn( + f"Could not convert column {name} to dtype {col_dtype}: " + f"{e}. Rows with unconvertible values will be skipped.", + ParserWarning, + stacklevel=find_stack_level() + ) + else: + raise finally: # gh-21353 # @@ -1034,6 +1059,32 @@ cdef class TextReader: results[i] = col_res + # GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN + if failed_columns_dtypes: + # Identify bad rows from columns that failed dtype conversion + for col_idx, target_dtype in failed_columns_dtypes.items(): + col_values = results[col_idx] + bad_row_indices = _identify_bad_rows(col_values, target_dtype) + bad_rows.update(bad_row_indices) + + if bad_rows: + num_rows = end - start + good_mask = np.ones(num_rows, dtype=np.bool_) + for bad_idx in bad_rows: + good_mask[bad_idx] = False + + # Filter all columns to keep only good rows + for col_idx in results: + results[col_idx] = results[col_idx][good_mask] + + if self.parser.on_bad_lines == WARN: + warnings.warn( + f"Skipped {len(bad_rows)} line(s) due to dtype " + f"conversion errors.", + ParserWarning, + stacklevel=find_stack_level() + ) + self.parser_start += end - start return results @@ -1404,6 +1455,51 @@ STR_NA_VALUES = { _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) +def _identify_bad_rows(values, dtype): + """ + Identify row indices where values cannot be converted to the target dtype. + + GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'. + + Parameters + ---------- + values : ndarray + Array of values (typically strings/objects) to check. + dtype : numpy dtype + Target dtype to check conversion against. + + Returns + ------- + set + Set of row indices (0-based) that cannot be converted. + """ + bad_indices = set() + + for idx in range(len(values)): + val = values[idx] + + # Skip None/NaN values - they're handled separately + if val is None: + continue + if isinstance(val, float) and np.isnan(val): + continue + if isinstance(val, str) and val.strip() == "": + continue + + try: + if dtype.kind in "iu": # integer types + int(val) + elif dtype.kind == "f": # float types + float(val) + elif dtype.kind == "b": # boolean + # Boolean conversion is more complex, skip for now + pass + except (ValueError, TypeError): + bad_indices.add(idx) + + return bad_indices + + def _maybe_upcast( arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy" ): From c30677de2327c2d03f416e157016dbc9ed46ca36 Mon Sep 17 00:00:00 2001 From: Nasser M <123868652+nejail@users.noreply.github.com> Date: Wed, 26 Nov 2025 20:05:37 +0300 Subject: [PATCH 2/9] DOC: Update comment for handling bool dtype converstion --- pandas/_libs/parsers.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e3d063a964227..63fb1ba82ab6c 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1015,13 +1015,13 @@ cdef class TextReader: i, start, end, name, na_filter, na_hashset, na_fset, col_dtype) except (ValueError, TypeError, OverflowError) as e: - # GH#63168: Handle dtype conversion failures based on on_bad_lines + # Handle dtype conversion failure based on on_bad_lines if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN: # Fall back to string conversion col_res, na_count = self._string_convert( i, start, end, na_filter, na_hashset) - # Track this column's intended dtype for later bad row detection + # Track the columns intended dtype for bad row detection lateron if col_dtype is not None: failed_columns_dtypes[i] = col_dtype @@ -1059,7 +1059,7 @@ cdef class TextReader: results[i] = col_res - # GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN + # Filters out the bad rows if on_bad_lines is skipped or warned if failed_columns_dtypes: # Identify bad rows from columns that failed dtype conversion for col_idx, target_dtype in failed_columns_dtypes.items(): @@ -1457,16 +1457,16 @@ _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES)) def _identify_bad_rows(values, dtype): """ - Identify row indices where values cannot be converted to the target dtype. + Identify the row indices when values cannot be converted to the intended target - GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'. + This can be used to find rows that should be skipped when on_bad_lines='skip' Parameters ---------- values : ndarray - Array of values (typically strings/objects) to check. + Array of values to check dtype : numpy dtype - Target dtype to check conversion against. + Target dtype to check conversion against Returns ------- @@ -1492,7 +1492,7 @@ def _identify_bad_rows(values, dtype): elif dtype.kind == "f": # float types float(val) elif dtype.kind == "b": # boolean - # Boolean conversion is more complex, skip for now + # Complex pass it until we fix again pass except (ValueError, TypeError): bad_indices.add(idx) From 09d10d2feb04170b0d4bfb06a9a7cfc6e552b864 Mon Sep 17 00:00:00 2001 From: Anurag Aryal Date: Thu, 4 Dec 2025 12:46:12 -0500 Subject: [PATCH 3/9] FIX: read_csv: Implement string-based 'on_bad_lines' options for C engine --- pandas/_libs/parsers.pyx | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 63fb1ba82ab6c..3abb69d34dfb8 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -461,7 +461,21 @@ cdef class TextReader: raise ValueError("Only length-1 comment characters supported") self.parser.commentchar = ord(comment) - self.parser.on_bad_lines = on_bad_lines + if isinstance(on_bad_lines, str): + if on_bad_lines == 'error': + c_on_bad_lines = ERROR + elif on_bad_lines == 'warn': + c_on_bad_lines = WARN + elif on_bad_lines == 'skip': + c_on_bad_lines = SKIP + # Note: can add 'skip_with_log' here later when we work on logging + else: + raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}") + else: + # If it's not a string, assume it's already an integer/enum constant (like ERROR) + c_on_bad_lines = on_bad_lines + + self.parser.on_bad_lines = c_on_bad_lines self.skiprows = skiprows if skiprows is not None: From 75db71cd48248b20af983e5228037940530f827f Mon Sep 17 00:00:00 2001 From: Hazem Elsayed Date: Mon, 8 Dec 2025 17:37:50 +0300 Subject: [PATCH 4/9] FIX: read_csv: Re-convert to target dtype after filtering bad lines --- pandas/_libs/parsers.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 3abb69d34dfb8..a11353464674d 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1099,6 +1099,16 @@ cdef class TextReader: stacklevel=find_stack_level() ) + # Re-convert failed columns to their target dtype now that bad rows + # have been removed. All remaining values should be convertible. + for col_idx, target_dtype in failed_columns_dtypes.items(): + try: + results[col_idx] = np.array(results[col_idx]).astype(target_dtype) + except (ValueError, TypeError, OverflowError): + # If conversion still fails, keep as string (shouldn't happen + # if _identify_bad_rows worked correctly, but be defensive) + pass + self.parser_start += end - start return results From 2b00ad2979b94587b9d585028834a3388aa72fec Mon Sep 17 00:00:00 2001 From: Hazem Elsayed Date: Mon, 8 Dec 2025 17:37:55 +0300 Subject: [PATCH 5/9] TST: Add tests for on_bad_lines with dtype conversion failures #63168 --- .../io/parser/common/test_read_errors.py | 108 ++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index b8cf435ef0443..36e1ecef6590e 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -313,3 +313,111 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers): ): result = parser.read_csv(StringIO(data), on_bad_lines="warn") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "on_bad_lines,should_warn", + [ + ("skip", False), + ("warn", True), + ], +) +def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn): + # GH#63168 - on_bad_lines should handle dtype conversion failures + parser = c_parser_only + data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6" + + if should_warn: + with tm.assert_produces_warning( + ParserWarning, + match="Could not convert column|Skipped .* line", + check_stacklevel=False, + ): + result = parser.read_csv( + StringIO(data), + dtype={"col1": int, "col2": int, "col3": int}, + on_bad_lines=on_bad_lines, + ) + else: + result = parser.read_csv( + StringIO(data), + dtype={"col1": int, "col2": int, "col3": int}, + on_bad_lines=on_bad_lines, + ) + + # Row with 'a' cannot convert to int, should be skipped + expected = DataFrame({"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_dtype_conversion_error(c_parser_only): + # GH#63168 - on_bad_lines='error' should raise on dtype conversion failure + parser = c_parser_only + data = "col1,col2\n1,2\na,3" + + with pytest.raises(ValueError, match="invalid literal for int"): + parser.read_csv( + StringIO(data), + dtype={"col1": int, "col2": int}, + on_bad_lines="error", + ) + + +def test_on_bad_lines_dtype_float_conversion(c_parser_only): + # GH#63168 - Float dtype with non-numeric values + parser = c_parser_only + data = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5" + + result = parser.read_csv( + StringIO(data), + dtype={"a": float, "b": float}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_dtype_partial_columns(c_parser_only): + # GH#63168 - Only some columns have dtype specified + parser = c_parser_only + data = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9" + + result = parser.read_csv( + StringIO(data), + dtype={"a": int, "c": int}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [1, 4], "b": ["hello", "test"], "c": [3, 9]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_dtype_mixed_errors(c_parser_only): + # GH#63168 - Mix of structural errors (wrong field count) and dtype errors + parser = c_parser_only + data = "a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8" + + result = parser.read_csv( + StringIO(data), + dtype={"a": int, "b": int, "c": int}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_dtype_all_bad_rows(c_parser_only): + # GH#63168 - All data rows fail conversion + parser = c_parser_only + data = "a,b\nfoo,bar\nbaz,qux" + + result = parser.read_csv( + StringIO(data), + dtype={"a": int, "b": int}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [], "b": []}).astype(int) + tm.assert_frame_equal(result, expected) From 37f45927b315cbcad6d396e714ba896655993e11 Mon Sep 17 00:00:00 2001 From: Nasser M <123868652+nejail@users.noreply.github.com> Date: Tue, 9 Dec 2025 20:18:11 +0300 Subject: [PATCH 6/9] DOC: add whatsnew entry for on_bad_lines dtype fix #63168 --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bebd928924214..1d00d5842ae92 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1417,6 +1417,7 @@ Other - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`) - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`) - Bug when calling :py:func:`copy.copy` on a :class:`DataFrame` or :class:`Series` which would return a deep copy instead of a shallow copy (:issue:`62971`) +- Fixed bug in :func:`read_csv` where ``on_bad_lines="skip"`` would not skip rows that failed dtype conversion (:issue:`63168`) - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`) - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`) - Accessing the underlying NumPy array of a DataFrame or Series will return a read-only From 1afd32b67a6588cc30a8aa84ca6df3a60a5ba2b0 Mon Sep 17 00:00:00 2001 From: Nasser M <123868652+nejail@users.noreply.github.com> Date: Tue, 9 Dec 2025 20:30:00 +0300 Subject: [PATCH 7/9] Removing unnecessary comments --- pandas/_libs/parsers.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a11353464674d..cefe289b51a86 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -472,7 +472,6 @@ cdef class TextReader: else: raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}") else: - # If it's not a string, assume it's already an integer/enum constant (like ERROR) c_on_bad_lines = on_bad_lines self.parser.on_bad_lines = c_on_bad_lines From d1b2eb4d094132a8c8e9c809d2f0a0a5c65d40a3 Mon Sep 17 00:00:00 2001 From: Nasser M <123868652+nejail@users.noreply.github.com> Date: Tue, 9 Dec 2025 20:40:54 +0300 Subject: [PATCH 8/9] Fix string quotes in parsers.pyx for pre-commit.ci --- pandas/_libs/parsers.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index cefe289b51a86..38d333fa6c395 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -462,11 +462,11 @@ cdef class TextReader: self.parser.commentchar = ord(comment) if isinstance(on_bad_lines, str): - if on_bad_lines == 'error': + if on_bad_lines == "error": c_on_bad_lines = ERROR - elif on_bad_lines == 'warn': + elif on_bad_lines == "warn": c_on_bad_lines = WARN - elif on_bad_lines == 'skip': + elif on_bad_lines == "skip": c_on_bad_lines = SKIP # Note: can add 'skip_with_log' here later when we work on logging else: @@ -1482,7 +1482,7 @@ def _identify_bad_rows(values, dtype): """ Identify the row indices when values cannot be converted to the intended target - This can be used to find rows that should be skipped when on_bad_lines='skip' + This can be used to find rows that should be skipped when on_bad_lines="skip" Parameters ---------- From 1cf090fd8bdf5cc4bfaadff7a50f8c6e963fdfd7 Mon Sep 17 00:00:00 2001 From: Hazem Elsayed Date: Wed, 10 Dec 2025 09:58:40 +0300 Subject: [PATCH 9/9] FIX: add explicit int sizes to tests --- .../io/parser/common/test_read_errors.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py index 36e1ecef6590e..00ddf084bbefb 100644 --- a/pandas/tests/io/parser/common/test_read_errors.py +++ b/pandas/tests/io/parser/common/test_read_errors.py @@ -335,18 +335,20 @@ def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_ ): result = parser.read_csv( StringIO(data), - dtype={"col1": int, "col2": int, "col3": int}, + dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64}, on_bad_lines=on_bad_lines, ) else: result = parser.read_csv( StringIO(data), - dtype={"col1": int, "col2": int, "col3": int}, + dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64}, on_bad_lines=on_bad_lines, ) # Row with 'a' cannot convert to int, should be skipped - expected = DataFrame({"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}) + expected = DataFrame( + {"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}, dtype=np.int64 + ) tm.assert_frame_equal(result, expected) @@ -358,7 +360,7 @@ def test_on_bad_lines_dtype_conversion_error(c_parser_only): with pytest.raises(ValueError, match="invalid literal for int"): parser.read_csv( StringIO(data), - dtype={"col1": int, "col2": int}, + dtype={"col1": np.int64, "col2": np.int64}, on_bad_lines="error", ) @@ -385,11 +387,17 @@ def test_on_bad_lines_dtype_partial_columns(c_parser_only): result = parser.read_csv( StringIO(data), - dtype={"a": int, "c": int}, + dtype={"a": np.int64, "c": np.int64}, on_bad_lines="skip", ) - expected = DataFrame({"a": [1, 4], "b": ["hello", "test"], "c": [3, 9]}) + expected = DataFrame( + { + "a": np.array([1, 4], dtype=np.int64), + "b": ["hello", "test"], + "c": np.array([3, 9], dtype=np.int64), + } + ) tm.assert_frame_equal(result, expected) @@ -400,11 +408,11 @@ def test_on_bad_lines_dtype_mixed_errors(c_parser_only): result = parser.read_csv( StringIO(data), - dtype={"a": int, "b": int, "c": int}, + dtype={"a": np.int64, "b": np.int64, "c": np.int64}, on_bad_lines="skip", ) - expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}) + expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}, dtype=np.int64) tm.assert_frame_equal(result, expected) @@ -415,9 +423,9 @@ def test_on_bad_lines_dtype_all_bad_rows(c_parser_only): result = parser.read_csv( StringIO(data), - dtype={"a": int, "b": int}, + dtype={"a": np.int64, "b": np.int64}, on_bad_lines="skip", ) - expected = DataFrame({"a": [], "b": []}).astype(int) + expected = DataFrame({"a": [], "b": []}).astype(np.int64) tm.assert_frame_equal(result, expected)