@@ -937,6 +937,11 @@ cdef class TextReader:
937937 int64_t num_cols
938938 dict results
939939 bint is_default_dict_dtype
940+ set bad_rows
941+ dict failed_columns_dtypes
942+
943+ bad_rows = set ()
944+ failed_columns_dtypes = {}
940945
941946 start = self .parser_start
942947
@@ -1009,6 +1014,26 @@ cdef class TextReader:
10091014 col_res, na_count = self ._convert_tokens(
10101015 i, start, end, name, na_filter, na_hashset,
10111016 na_fset, col_dtype)
1017+ except (ValueError , TypeError , OverflowError ) as e:
1018+ # GH#63168: Handle dtype conversion failures based on on_bad_lines
1019+ if self .parser.on_bad_lines == SKIP or self .parser.on_bad_lines == WARN:
1020+ # Fall back to string conversion
1021+ col_res, na_count = self ._string_convert(
1022+ i, start, end, na_filter, na_hashset)
1023+
1024+ # Track this column's intended dtype for later bad row detection
1025+ if col_dtype is not None :
1026+ failed_columns_dtypes[i] = col_dtype
1027+
1028+ if self .parser.on_bad_lines == WARN:
1029+ warnings.warn(
1030+ f" Could not convert column {name} to dtype {col_dtype}: "
1031+ f" {e}. Rows with unconvertible values will be skipped." ,
1032+ ParserWarning,
1033+ stacklevel = find_stack_level()
1034+ )
1035+ else :
1036+ raise
10121037 finally :
10131038 # gh-21353
10141039 #
@@ -1034,6 +1059,32 @@ cdef class TextReader:
10341059
10351060 results[i] = col_res
10361061
1062+ # GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN
1063+ if failed_columns_dtypes:
1064+ # Identify bad rows from columns that failed dtype conversion
1065+ for col_idx, target_dtype in failed_columns_dtypes.items():
1066+ col_values = results[col_idx]
1067+ bad_row_indices = _identify_bad_rows(col_values, target_dtype)
1068+ bad_rows.update(bad_row_indices)
1069+
1070+ if bad_rows:
1071+ num_rows = end - start
1072+ good_mask = np.ones(num_rows, dtype = np.bool_)
1073+ for bad_idx in bad_rows:
1074+ good_mask[bad_idx] = False
1075+
1076+ # Filter all columns to keep only good rows
1077+ for col_idx in results:
1078+ results[col_idx] = results[col_idx][good_mask]
1079+
1080+ if self .parser.on_bad_lines == WARN:
1081+ warnings.warn(
1082+ f" Skipped {len(bad_rows)} line(s) due to dtype "
1083+ f" conversion errors." ,
1084+ ParserWarning,
1085+ stacklevel = find_stack_level()
1086+ )
1087+
10371088 self .parser_start += end - start
10381089
10391090 return results
@@ -1404,6 +1455,51 @@ STR_NA_VALUES = {
14041455_NA_VALUES = _ensure_encoded(list (STR_NA_VALUES))
14051456
14061457
def _identify_bad_rows(values, dtype):
    """
    Identify row indices where values cannot be converted to the target dtype.

    GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'.

    Parameters
    ----------
    values : ndarray
        Array of values (typically strings/objects) to check.
    dtype : numpy dtype
        Target dtype to check conversion against. Only ``dtype.kind`` is
        inspected, plus the ``np.iinfo`` bounds for integer kinds.

    Returns
    -------
    set
        Set of row indices (0-based) that cannot be converted.

    Notes
    -----
    Only integer (``"i"``/``"u"``) and float (``"f"``) targets are checked.
    Every other kind, including boolean, yields an empty set because no
    per-value check is implemented for it here.

    ``None``, float NaN and blank/whitespace-only strings are treated as
    missing values and are never reported as bad.
    """
    kind = dtype.kind

    # Resolve the per-value check once instead of re-dispatching on every row.
    if kind in ("i", "u"):
        caster = int
        try:
            info = np.iinfo(dtype)
        except (TypeError, ValueError):
            # Not a dtype np.iinfo understands: fall back to a
            # parse-only check without range validation.
            bounds = None
        else:
            bounds = (int(info.min), int(info.max))
    elif kind == "f":
        caster = float
        bounds = None
    else:
        # Boolean conversion is more complex and other kinds have no
        # check implemented, so nothing is flagged for them.
        return set()

    bad_indices = set()

    for idx, val in enumerate(values):
        # Skip None/NaN/blank values - they're handled separately
        if val is None:
            continue
        if isinstance(val, float) and np.isnan(val):
            continue
        if isinstance(val, str):
            if not val.strip():
                continue
            if "_" in val:
                # int()/float() accept PEP 515 digit-group underscores
                # ("1_000"), which are Python literal syntax rather than
                # numeric field text, so treat them as unconvertible.
                bad_indices.add(idx)
                continue

        try:
            converted = caster(val)
        except (ValueError, TypeError, OverflowError):
            # OverflowError covers e.g. int(float("inf")).
            bad_indices.add(idx)
            continue

        # A value that parses but does not fit the target integer width
        # (e.g. "300" for int8, "-1" for uint8) is still unconvertible.
        if bounds is not None and not bounds[0] <= converted <= bounds[1]:
            bad_indices.add(idx)

    return bad_indices
1502+
14071503def _maybe_upcast (
14081504 arr , use_dtype_backend: bool = False , dtype_backend: str = " numpy"
14091505):
0 commit comments