Skip to content
Open
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1417,6 +1417,7 @@ Other
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
- Bug when calling :py:func:`copy.copy` on a :class:`DataFrame` or :class:`Series` which would return a deep copy instead of a shallow copy (:issue:`62971`)
- Fixed bug in :func:`read_csv` where ``on_bad_lines="skip"`` would not skip rows that failed dtype conversion (:issue:`63168`)
- Fixed bug in :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`)
- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
- Accessing the underlying NumPy array of a DataFrame or Series will return a read-only
Expand Down
121 changes: 120 additions & 1 deletion pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,20 @@ cdef class TextReader:
raise ValueError("Only length-1 comment characters supported")
self.parser.commentchar = <char>ord(comment)

self.parser.on_bad_lines = on_bad_lines
if isinstance(on_bad_lines, str):
if on_bad_lines == "error":
c_on_bad_lines = ERROR
elif on_bad_lines == "warn":
c_on_bad_lines = WARN
elif on_bad_lines == "skip":
c_on_bad_lines = SKIP
# Note: can add 'skip_with_log' here later when we work on logging
else:
raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}")
else:
c_on_bad_lines = on_bad_lines

self.parser.on_bad_lines = c_on_bad_lines

self.skiprows = skiprows
if skiprows is not None:
Expand Down Expand Up @@ -937,6 +950,11 @@ cdef class TextReader:
int64_t num_cols
dict results
bint is_default_dict_dtype
set bad_rows
dict failed_columns_dtypes

bad_rows = set()
failed_columns_dtypes = {}

start = self.parser_start

Expand Down Expand Up @@ -1009,6 +1027,26 @@ cdef class TextReader:
col_res, na_count = self._convert_tokens(
i, start, end, name, na_filter, na_hashset,
na_fset, col_dtype)
except (ValueError, TypeError, OverflowError) as e:
# Handle dtype conversion failure based on on_bad_lines
if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN:
# Fall back to string conversion
col_res, na_count = self._string_convert(
i, start, end, na_filter, na_hashset)

# Track the columns intended dtype for bad row detection lateron
if col_dtype is not None:
failed_columns_dtypes[i] = col_dtype

if self.parser.on_bad_lines == WARN:
warnings.warn(
f"Could not convert column {name} to dtype {col_dtype}: "
f"{e}. Rows with unconvertible values will be skipped.",
ParserWarning,
stacklevel=find_stack_level()
)
else:
raise
finally:
# gh-21353
#
Expand All @@ -1034,6 +1072,42 @@ cdef class TextReader:

results[i] = col_res

# Filters out the bad rows if on_bad_lines is skipped or warned
if failed_columns_dtypes:
# Identify bad rows from columns that failed dtype conversion
for col_idx, target_dtype in failed_columns_dtypes.items():
col_values = results[col_idx]
bad_row_indices = _identify_bad_rows(col_values, target_dtype)
bad_rows.update(bad_row_indices)

if bad_rows:
num_rows = end - start
good_mask = np.ones(num_rows, dtype=np.bool_)
for bad_idx in bad_rows:
good_mask[bad_idx] = False

# Filter all columns to keep only good rows
for col_idx in results:
results[col_idx] = results[col_idx][good_mask]

if self.parser.on_bad_lines == WARN:
warnings.warn(
f"Skipped {len(bad_rows)} line(s) due to dtype "
f"conversion errors.",
ParserWarning,
stacklevel=find_stack_level()
)

# Re-convert failed columns to their target dtype now that bad rows
# have been removed. All remaining values should be convertible.
for col_idx, target_dtype in failed_columns_dtypes.items():
try:
results[col_idx] = np.array(results[col_idx]).astype(target_dtype)
except (ValueError, TypeError, OverflowError):
# If conversion still fails, keep as string (shouldn't happen
# if _identify_bad_rows worked correctly, but be defensive)
pass

self.parser_start += end - start

return results
Expand Down Expand Up @@ -1404,6 +1478,51 @@ STR_NA_VALUES = {
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _identify_bad_rows(values, dtype):
"""
Identify the row indices when values cannot be converted to the intended target

This can be used to find rows that should be skipped when on_bad_lines="skip"

Parameters
----------
values : ndarray
Array of values to check
dtype : numpy dtype
Target dtype to check conversion against

Returns
-------
set
Set of row indices (0-based) that cannot be converted.
"""
bad_indices = set()

for idx in range(len(values)):
val = values[idx]

# Skip None/NaN values - they're handled separately
if val is None:
continue
if isinstance(val, float) and np.isnan(val):
continue
if isinstance(val, str) and val.strip() == "":
continue

try:
if dtype.kind in "iu": # integer types
int(val)
elif dtype.kind == "f": # float types
float(val)
elif dtype.kind == "b": # boolean
# Complex pass it until we fix again
pass
except (ValueError, TypeError):
bad_indices.add(idx)

return bad_indices


def _maybe_upcast(
arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
):
Expand Down
116 changes: 116 additions & 0 deletions pandas/tests/io/parser/common/test_read_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,119 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
):
result = parser.read_csv(StringIO(data), on_bad_lines="warn")
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "on_bad_lines,should_warn",
    [
        ("skip", False),
        ("warn", True),
    ],
)
def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn):
    # GH#63168 - on_bad_lines should handle dtype conversion failures
    parser = c_parser_only
    data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6"
    kwargs = {
        "dtype": {"col1": np.int64, "col2": np.int64, "col3": np.int64},
        "on_bad_lines": on_bad_lines,
    }

    if should_warn:
        with tm.assert_produces_warning(
            ParserWarning,
            match="Could not convert column|Skipped .* line",
            check_stacklevel=False,
        ):
            result = parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)

    # The row containing 'a' fails int conversion and is dropped.
    expected = DataFrame(
        {"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}, dtype=np.int64
    )
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_conversion_error(c_parser_only):
    # GH#63168 - on_bad_lines='error' should raise on dtype conversion failure
    parser = c_parser_only
    csv_text = "col1,col2\n1,2\na,3"
    msg = "invalid literal for int"

    with pytest.raises(ValueError, match=msg):
        parser.read_csv(
            StringIO(csv_text),
            dtype={"col1": np.int64, "col2": np.int64},
            on_bad_lines="error",
        )


def test_on_bad_lines_dtype_float_conversion(c_parser_only):
    # GH#63168 - Float dtype with non-numeric values
    parser = c_parser_only
    csv_text = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5"
    dtypes = {"a": float, "b": float}

    result = parser.read_csv(StringIO(csv_text), dtype=dtypes, on_bad_lines="skip")

    # The 'foo' row cannot be parsed as float and is dropped.
    expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]})
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_partial_columns(c_parser_only):
    # GH#63168 - Only some columns have dtype specified
    parser = c_parser_only
    csv_text = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9"

    result = parser.read_csv(
        StringIO(csv_text),
        dtype={"a": np.int64, "c": np.int64},
        on_bad_lines="skip",
    )

    # Only the row where column 'a' fails int conversion is dropped;
    # column 'b' has no dtype constraint.
    expected = DataFrame(
        {
            "a": np.array([1, 4], dtype=np.int64),
            "b": ["hello", "test"],
            "c": np.array([3, 9], dtype=np.int64),
        }
    )
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
    # GH#63168 - Mix of structural errors (wrong field count) and dtype errors
    parser = c_parser_only
    csv_text = "a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8"
    dtypes = {"a": np.int64, "b": np.int64, "c": np.int64}

    result = parser.read_csv(StringIO(csv_text), dtype=dtypes, on_bad_lines="skip")

    # Both the short row and the dtype-failing row are skipped.
    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}, dtype=np.int64)
    tm.assert_frame_equal(result, expected)


def test_on_bad_lines_dtype_all_bad_rows(c_parser_only):
    # GH#63168 - All data rows fail conversion
    parser = c_parser_only
    csv_text = "a,b\nfoo,bar\nbaz,qux"

    result = parser.read_csv(
        StringIO(csv_text),
        dtype={"a": np.int64, "b": np.int64},
        on_bad_lines="skip",
    )

    # Every row is unconvertible, leaving an empty frame with the target dtype.
    expected = DataFrame({"a": [], "b": []}).astype(np.int64)
    tm.assert_frame_equal(result, expected)
Loading