Skip to content
Open
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1417,6 +1417,7 @@ Other
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
- Bug when calling :py:func:`copy.copy` on a :class:`DataFrame` or :class:`Series` which would return a deep copy instead of a shallow copy (:issue:`62971`)
- Fixed bug in :func:`read_csv` where ``on_bad_lines="skip"`` would not skip rows that failed dtype conversion (:issue:`63168`)
- Fixed bug in :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`)
- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
- Accessing the underlying NumPy array of a DataFrame or Series will return a read-only
Expand Down
121 changes: 120 additions & 1 deletion pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,20 @@ cdef class TextReader:
raise ValueError("Only length-1 comment characters supported")
self.parser.commentchar = <char>ord(comment)

self.parser.on_bad_lines = on_bad_lines
if isinstance(on_bad_lines, str):
if on_bad_lines == "error":
c_on_bad_lines = ERROR
elif on_bad_lines == "warn":
c_on_bad_lines = WARN
elif on_bad_lines == "skip":
c_on_bad_lines = SKIP
# Note: can add 'skip_with_log' here later when we work on logging
else:
raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}")
else:
c_on_bad_lines = on_bad_lines

self.parser.on_bad_lines = c_on_bad_lines

self.skiprows = skiprows
if skiprows is not None:
Expand Down Expand Up @@ -937,6 +950,11 @@ cdef class TextReader:
int64_t num_cols
dict results
bint is_default_dict_dtype
set bad_rows
dict failed_columns_dtypes

bad_rows = set()
failed_columns_dtypes = {}

start = self.parser_start

Expand Down Expand Up @@ -1009,6 +1027,26 @@ cdef class TextReader:
col_res, na_count = self._convert_tokens(
i, start, end, name, na_filter, na_hashset,
na_fset, col_dtype)
except (ValueError, TypeError, OverflowError) as e:
# Handle dtype conversion failure based on on_bad_lines
if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN:
# Fall back to string conversion
col_res, na_count = self._string_convert(
i, start, end, na_filter, na_hashset)

# Track the columns intended dtype for bad row detection lateron
if col_dtype is not None:
failed_columns_dtypes[i] = col_dtype

if self.parser.on_bad_lines == WARN:
warnings.warn(
f"Could not convert column {name} to dtype {col_dtype}: "
f"{e}. Rows with unconvertible values will be skipped.",
ParserWarning,
stacklevel=find_stack_level()
)
else:
raise
finally:
# gh-21353
#
Expand All @@ -1034,6 +1072,42 @@ cdef class TextReader:

results[i] = col_res

# Filters out the bad rows if on_bad_lines is skipped or warned
if failed_columns_dtypes:
# Identify bad rows from columns that failed dtype conversion
for col_idx, target_dtype in failed_columns_dtypes.items():
col_values = results[col_idx]
bad_row_indices = _identify_bad_rows(col_values, target_dtype)
bad_rows.update(bad_row_indices)

if bad_rows:
num_rows = end - start
good_mask = np.ones(num_rows, dtype=np.bool_)
for bad_idx in bad_rows:
good_mask[bad_idx] = False

# Filter all columns to keep only good rows
for col_idx in results:
results[col_idx] = results[col_idx][good_mask]

if self.parser.on_bad_lines == WARN:
warnings.warn(
f"Skipped {len(bad_rows)} line(s) due to dtype "
f"conversion errors.",
ParserWarning,
stacklevel=find_stack_level()
)

# Re-convert failed columns to their target dtype now that bad rows
# have been removed. All remaining values should be convertible.
for col_idx, target_dtype in failed_columns_dtypes.items():
try:
results[col_idx] = np.array(results[col_idx]).astype(target_dtype)
except (ValueError, TypeError, OverflowError):
# If conversion still fails, keep as string (shouldn't happen
# if _identify_bad_rows worked correctly, but be defensive)
pass

self.parser_start += end - start

return results
Expand Down Expand Up @@ -1404,6 +1478,51 @@ STR_NA_VALUES = {
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _identify_bad_rows(values, dtype):
"""
Identify the row indices when values cannot be converted to the intended target

This can be used to find rows that should be skipped when on_bad_lines="skip"

Parameters
----------
values : ndarray
Array of values to check
dtype : numpy dtype
Target dtype to check conversion against

Returns
-------
set
Set of row indices (0-based) that cannot be converted.
"""
bad_indices = set()

for idx in range(len(values)):
val = values[idx]

# Skip None/NaN values - they're handled separately
if val is None:
continue
if isinstance(val, float) and np.isnan(val):
continue
if isinstance(val, str) and val.strip() == "":
continue

try:
if dtype.kind in "iu": # integer types
int(val)
elif dtype.kind == "f": # float types
float(val)
elif dtype.kind == "b": # boolean
# Complex pass it until we fix again
pass
except (ValueError, TypeError):
bad_indices.add(idx)

return bad_indices


def _maybe_upcast(
arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
):
Expand Down
116 changes: 116 additions & 0 deletions pandas/tests/io/parser/common/test_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,119 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "on_bad_lines,should_warn",
    [
        ("skip", False),
        ("warn", True),
    ],
)
def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn):
    # GH#63168 - on_bad_lines should handle dtype conversion failures
    parser = c_parser_only
    data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6"
    kwargs = {
        "dtype": {"col1": np.int64, "col2": np.int64, "col3": np.int64},
        "on_bad_lines": on_bad_lines,
    }

    if should_warn:
        with tm.assert_produces_warning(
            ParserWarning,
            match="Could not convert column|Skipped .* line",
            check_stacklevel=False,
        ):
            result = parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)

    # The row containing 'a' fails int conversion and is dropped.
    expected = DataFrame(
        {"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}, dtype=np.int64
    )
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_conversion_error(c_parser_only):
    # GH#63168 - on_bad_lines='error' should raise on dtype conversion failure
    parser = c_parser_only
    csv_text = "col1,col2\n1,2\na,3"
    msg = "invalid literal for int"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            StringIO(csv_text),
            dtype={"col1": np.int64, "col2": np.int64},
            on_bad_lines="error",
        )


def test_on_bad_lines_dtype_float_conversion(c_parser_only):
    # GH#63168 - Float dtype with non-numeric values
    parser = c_parser_only
    csv_text = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5"
    dtypes = {"a": float, "b": float}

    result = parser.read_csv(StringIO(csv_text), dtype=dtypes, on_bad_lines="skip")

    # The 'foo' row cannot be parsed as float and is dropped.
    expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_partial_columns(c_parser_only):
    # GH#63168 - Only some columns have dtype specified
    parser = c_parser_only
    csv_text = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9"

    result = parser.read_csv(
        StringIO(csv_text),
        dtype={"a": np.int64, "c": np.int64},
        on_bad_lines="skip",
    )

    # Only the row where column 'a' fails int conversion is dropped;
    # column 'b' has no dtype constraint.
    expected = DataFrame(
        {
            "a": np.array([1, 4], dtype=np.int64),
            "b": ["hello", "test"],
            "c": np.array([3, 9], dtype=np.int64),
        }
    )
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
    # GH#63168 - Mix of structural errors (wrong field count) and dtype errors
    parser = c_parser_only
    csv_text = "a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8"
    dtypes = {"a": np.int64, "b": np.int64, "c": np.int64}

    result = parser.read_csv(StringIO(csv_text), dtype=dtypes, on_bad_lines="skip")

    # Both the short row and the dtype-failing row are skipped.
    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}, dtype=np.int64)
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_all_bad_rows(c_parser_only):
    # GH#63168 - All data rows fail conversion
    parser = c_parser_only
    csv_text = "a,b\nfoo,bar\nbaz,qux"

    result = parser.read_csv(
        StringIO(csv_text),
        dtype={"a": np.int64, "b": np.int64},
        on_bad_lines="skip",
    )

    # Every row is unconvertible, leaving an empty frame with the target dtype.
    expected = DataFrame({"a": [], "b": []}).astype(np.int64)
    tm.assert_frame_equal(result, expected)
Loading