diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index bebd928924214..1d00d5842ae92 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -1417,6 +1417,7 @@ Other
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
 - Bug when calling :py:func:`copy.copy` on a :class:`DataFrame` or :class:`Series` which would return a deep copy instead of a shallow copy (:issue:`62971`)
+- Fixed bug in :func:`read_csv` where ``on_bad_lines="skip"`` would not skip rows that failed dtype conversion (:issue:`63168`)
 - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`)
 - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
 - Accessing the underlying NumPy array of a DataFrame or Series will return a read-only
diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index c2767dc47b5e4..38d333fa6c395 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -461,7 +461,20 @@ cdef class TextReader:
                 raise ValueError("Only length-1 comment characters supported")
             self.parser.commentchar = ord(comment)
 
-        self.parser.on_bad_lines = on_bad_lines
+        if isinstance(on_bad_lines, str):
+            if on_bad_lines == "error":
+                c_on_bad_lines = ERROR
+            elif on_bad_lines == "warn":
+                c_on_bad_lines = WARN
+            elif on_bad_lines == "skip":
+                c_on_bad_lines = SKIP
+            # Note: "skip_with_log" can be added here once logging support lands
+            else:
+                raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}")
+        else:
+            c_on_bad_lines = on_bad_lines
+
+        self.parser.on_bad_lines = c_on_bad_lines
 
         self.skiprows = skiprows
         if skiprows is not None:
@@ -937,6 +950,11 @@ cdef class TextReader:
             int64_t num_cols
             dict results
             bint is_default_dict_dtype
+            set bad_rows
+            dict failed_columns_dtypes
+
+        bad_rows = set()
+        failed_columns_dtypes = {}
 
         start = self.parser_start
 
@@ -1009,6 +1027,26 @@ cdef class TextReader:
                 col_res, na_count = self._convert_tokens(
                     i, start, end, name, na_filter, na_hashset,
                     na_fset, col_dtype)
+            except (ValueError, TypeError, OverflowError) as e:
+                # Handle dtype conversion failures according to on_bad_lines
+                if self.parser.on_bad_lines in (SKIP, WARN):
+                    # Fall back to string conversion
+                    col_res, na_count = self._string_convert(
+                        i, start, end, na_filter, na_hashset)
+
+                    # Track the column's intended dtype for bad-row detection later on
+                    if col_dtype is not None:
+                        failed_columns_dtypes[i] = col_dtype
+
+                    if self.parser.on_bad_lines == WARN:
+                        warnings.warn(
+                            f"Could not convert column {name} to dtype {col_dtype}: "
+                            f"{e}. Rows with unconvertible values will be skipped.",
+                            ParserWarning,
+                            stacklevel=find_stack_level()
+                        )
+                else:
+                    raise
             finally:
                 # gh-21353
                 #
@@ -1034,6 +1072,42 @@ cdef class TextReader:
 
             results[i] = col_res
 
+        # Filter out the bad rows when on_bad_lines is "skip" or "warn"
+        if failed_columns_dtypes:
+            # Identify bad rows from columns that failed dtype conversion
+            for col_idx, target_dtype in failed_columns_dtypes.items():
+                col_values = results[col_idx]
+                bad_row_indices = _identify_bad_rows(col_values, target_dtype)
+                bad_rows.update(bad_row_indices)
+
+            if bad_rows:
+                num_rows = end - start
+                good_mask = np.ones(num_rows, dtype=np.bool_)
+                for bad_idx in bad_rows:
+                    good_mask[bad_idx] = False
+
+                # Filter all columns to keep only the good rows
+                for col_idx in results:
+                    results[col_idx] = results[col_idx][good_mask]
+
+                if self.parser.on_bad_lines == WARN:
+                    warnings.warn(
+                        f"Skipped {len(bad_rows)} line(s) due to dtype "
+                        f"conversion errors.",
+                        ParserWarning,
+                        stacklevel=find_stack_level()
+                    )
+
+            # Re-convert failed columns to their target dtype now that bad rows
+            # have been removed. All remaining values should be convertible.
+            for col_idx, target_dtype in failed_columns_dtypes.items():
+                try:
+                    results[col_idx] = np.array(results[col_idx]).astype(target_dtype)
+                except (ValueError, TypeError, OverflowError):
+                    # If conversion still fails, keep as string (shouldn't happen
+                    # if _identify_bad_rows worked correctly, but be defensive)
+                    pass
+
         self.parser_start += end - start
 
         return results
@@ -1404,6 +1478,51 @@ STR_NA_VALUES = {
 
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
+
+def _identify_bad_rows(values, dtype):
+    """
+    Identify row indices whose values cannot be converted to the target dtype.
+
+    This can be used to find rows that should be skipped when ``on_bad_lines="skip"``.
+
+    Parameters
+    ----------
+    values : ndarray
+        Array of values to check.
+    dtype : numpy dtype
+        Target dtype to check conversion against.
+
+    Returns
+    -------
+    set
+        Set of row indices (0-based) that cannot be converted.
+    """
+    bad_indices = set()
+
+    for idx in range(len(values)):
+        val = values[idx]
+
+        # Skip None/NaN/empty values; missing data is handled separately
+        if val is None:
+            continue
+        if isinstance(val, float) and np.isnan(val):
+            continue
+        if isinstance(val, str) and val.strip() == "":
+            continue
+
+        try:
+            if dtype.kind in "iu":  # integer types
+                int(val)
+            elif dtype.kind == "f":  # float types
+                float(val)
+            elif dtype.kind == "b":  # boolean
+                # bool conversion is permissive; let values pass for now
+                pass
+        except (ValueError, TypeError):
+            bad_indices.add(idx)
+
+    return bad_indices
+
+
 def _maybe_upcast(
     arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
 ):
diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index b8cf435ef0443..00ddf084bbefb 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -313,3 +313,119 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
     ):
         result = parser.read_csv(StringIO(data), on_bad_lines="warn")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "on_bad_lines,should_warn",
+    [
+        ("skip", False),
+        ("warn", True),
+    ],
+)
+def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn):
+    # GH#63168 - on_bad_lines should handle dtype conversion failures
+    parser = c_parser_only
+    data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6"
+
+    if should_warn:
+        with tm.assert_produces_warning(
+            ParserWarning,
+            match="Could not convert column|Skipped .* line",
+            check_stacklevel=False,
+        ):
+            result = parser.read_csv(
+                StringIO(data),
+                dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64},
+                on_bad_lines=on_bad_lines,
+            )
+    else:
+        result = parser.read_csv(
+            StringIO(data),
+            dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64},
+            on_bad_lines=on_bad_lines,
+        )
+
+    # The row with 'a' cannot be converted to int, so it should be skipped
+    expected = DataFrame(
+        {"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}, dtype=np.int64
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_conversion_error(c_parser_only):
+    # GH#63168 - on_bad_lines='error' should raise on dtype conversion failure
+    parser = c_parser_only
+    data = "col1,col2\n1,2\na,3"
+
+    with pytest.raises(ValueError, match="invalid literal for int"):
+        parser.read_csv(
+            StringIO(data),
+            dtype={"col1": np.int64, "col2": np.int64},
+            on_bad_lines="error",
+        )
+
+
+def test_on_bad_lines_dtype_float_conversion(c_parser_only):
+    # GH#63168 - Float dtype with non-numeric values
+    parser = c_parser_only
+    data = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": float, "b": float},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_partial_columns(c_parser_only):
+    # GH#63168 - Only some columns have a dtype specified
+    parser = c_parser_only
+    data = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": np.int64, "c": np.int64},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame(
+        {
+            "a": np.array([1, 4], dtype=np.int64),
+            "b": ["hello", "test"],
+            "c": np.array([3, 9], dtype=np.int64),
+        }
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
+    # GH#63168 - Mix of structural errors (wrong field count) and dtype errors
+    parser = c_parser_only
+    data = "a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8"
"a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8" + + result = parser.read_csv( + StringIO(data), + dtype={"a": np.int64, "b": np.int64, "c": np.int64}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}, dtype=np.int64) + tm.assert_frame_equal(result, expected) + + +def test_on_bad_lines_dtype_all_bad_rows(c_parser_only): + # GH#63168 - All data rows fail conversion + parser = c_parser_only + data = "a,b\nfoo,bar\nbaz,qux" + + result = parser.read_csv( + StringIO(data), + dtype={"a": np.int64, "b": np.int64}, + on_bad_lines="skip", + ) + + expected = DataFrame({"a": [], "b": []}).astype(np.int64) + tm.assert_frame_equal(result, expected)