From 3fd0accf66d1d2db73a607720944efff07d0368d Mon Sep 17 00:00:00 2001
From: Fahad <fahadabd@andrew.cmu.edu>
Date: Wed, 26 Nov 2025 19:22:03 +0300
Subject: [PATCH 1/9] FIX: added on_bad_lines support for dtype conversion
 failures #63168

---
 pandas/_libs/parsers.pyx | 96 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index c2767dc47b5e4..e3d063a964227 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -937,6 +937,11 @@ cdef class TextReader:
             int64_t num_cols
             dict results
             bint is_default_dict_dtype
+            set bad_rows
+            dict failed_columns_dtypes
+
+        bad_rows = set()
+        failed_columns_dtypes = {}
 
         start = self.parser_start
 
@@ -1009,6 +1014,26 @@ cdef class TextReader:
                 col_res, na_count = self._convert_tokens(
                     i, start, end, name, na_filter, na_hashset,
                     na_fset, col_dtype)
+            except (ValueError, TypeError, OverflowError) as e:
+                # GH#63168: Handle dtype conversion failures based on on_bad_lines
+                if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN:
+                    # Fall back to string conversion
+                    col_res, na_count = self._string_convert(
+                        i, start, end, na_filter, na_hashset)
+
+                    # Track this column's intended dtype for later bad row detection
+                    if col_dtype is not None:
+                        failed_columns_dtypes[i] = col_dtype
+
+                    if self.parser.on_bad_lines == WARN:
+                        warnings.warn(
+                            f"Could not convert column {name} to dtype {col_dtype}: "
+                            f"{e}. Rows with unconvertible values will be skipped.",
+                            ParserWarning,
+                            stacklevel=find_stack_level()
+                        )
+                else:
+                    raise
             finally:
                 # gh-21353
                 #
@@ -1034,6 +1059,32 @@ cdef class TextReader:
 
             results[i] = col_res
 
+        # GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN
+        if failed_columns_dtypes:
+            # Identify bad rows from columns that failed dtype conversion
+            for col_idx, target_dtype in failed_columns_dtypes.items():
+                col_values = results[col_idx]
+                bad_row_indices = _identify_bad_rows(col_values, target_dtype)
+                bad_rows.update(bad_row_indices)
+
+            if bad_rows:
+                num_rows = end - start
+                good_mask = np.ones(num_rows, dtype=np.bool_)
+                for bad_idx in bad_rows:
+                    good_mask[bad_idx] = False
+
+                # Filter all columns to keep only good rows
+                for col_idx in results:
+                    results[col_idx] = results[col_idx][good_mask]
+
+                if self.parser.on_bad_lines == WARN:
+                    warnings.warn(
+                        f"Skipped {len(bad_rows)} line(s) due to dtype "
+                        f"conversion errors.",
+                        ParserWarning,
+                        stacklevel=find_stack_level()
+                    )
+
         self.parser_start += end - start
 
         return results
@@ -1404,6 +1455,51 @@ STR_NA_VALUES = {
 _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 
+def _identify_bad_rows(values, dtype):
+    """
+    Identify row indices where values cannot be converted to the target dtype.
+
+    GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'.
+
+    Parameters
+    ----------
+    values : ndarray
+        Array of values (typically strings/objects) to check.
+    dtype : numpy dtype
+        Target dtype to check conversion against.
+
+    Returns
+    -------
+    set
+        Set of row indices (0-based) that cannot be converted.
+    """
+    bad_indices = set()
+
+    for idx in range(len(values)):
+        val = values[idx]
+
+        # Skip None/NaN values - they're handled separately
+        if val is None:
+            continue
+        if isinstance(val, float) and np.isnan(val):
+            continue
+        if isinstance(val, str) and val.strip() == "":
+            continue
+
+        try:
+            if dtype.kind in "iu":  # integer types
+                int(val)
+            elif dtype.kind == "f":  # float types
+                float(val)
+            elif dtype.kind == "b":  # boolean
+                # Boolean conversion is more complex, skip for now
+                pass
+        except (ValueError, TypeError):
+            bad_indices.add(idx)
+
+    return bad_indices
+
+
 def _maybe_upcast(
     arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
 ):

From c30677de2327c2d03f416e157016dbc9ed46ca36 Mon Sep 17 00:00:00 2001
From: Nasser M <123868652+nejail@users.noreply.github.com>
Date: Wed, 26 Nov 2025 20:05:37 +0300
Subject: [PATCH 2/9] DOC: Update comment for handling bool dtype conversion

---
 pandas/_libs/parsers.pyx | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index e3d063a964227..63fb1ba82ab6c 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1015,13 +1015,13 @@ cdef class TextReader:
                     i, start, end, name, na_filter, na_hashset,
                     na_fset, col_dtype)
             except (ValueError, TypeError, OverflowError) as e:
-                # GH#63168: Handle dtype conversion failures based on on_bad_lines
+                # Handle dtype conversion failure based on on_bad_lines
                 if self.parser.on_bad_lines == SKIP or self.parser.on_bad_lines == WARN:
                     # Fall back to string conversion
                     col_res, na_count = self._string_convert(
                         i, start, end, na_filter, na_hashset)
 
-                    # Track this column's intended dtype for later bad row detection
+                    # Track the column's intended dtype for bad row detection later on
                     if col_dtype is not None:
                         failed_columns_dtypes[i] = col_dtype
 
@@ -1059,7 +1059,7 @@ cdef class TextReader:
 
             results[i] = col_res
 
-        # GH#63168: Filter out bad rows if on_bad_lines is SKIP or WARN
+        # Filter out the bad rows when on_bad_lines is "skip" or "warn"
         if failed_columns_dtypes:
             # Identify bad rows from columns that failed dtype conversion
             for col_idx, target_dtype in failed_columns_dtypes.items():
@@ -1457,16 +1457,16 @@ _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
 
 def _identify_bad_rows(values, dtype):
     """
-    Identify row indices where values cannot be converted to the target dtype.
+    Identify the row indices when values cannot be converted to the intended target
 
-    GH#63168: Used to find rows that should be skipped when on_bad_lines='skip'.
+    This can be used to find rows that should be skipped when on_bad_lines='skip'
 
     Parameters
     ----------
     values : ndarray
-        Array of values (typically strings/objects) to check.
+        Array of values to check
     dtype : numpy dtype
-        Target dtype to check conversion against.
+        Target dtype to check conversion against
 
     Returns
     -------
@@ -1492,7 +1492,7 @@ def _identify_bad_rows(values, dtype):
             elif dtype.kind == "f":  # float types
                 float(val)
             elif dtype.kind == "b":  # boolean
-                # Boolean conversion is more complex, skip for now
+                # Boolean parsing is more involved; no check is performed yet
                 pass
         except (ValueError, TypeError):
             bad_indices.add(idx)

From 09d10d2feb04170b0d4bfb06a9a7cfc6e552b864 Mon Sep 17 00:00:00 2001
From: Anurag Aryal <aryalanurag20602003@gmail.com>
Date: Thu, 4 Dec 2025 12:46:12 -0500
Subject: [PATCH 3/9] FIX: read_csv: Implement string-based 'on_bad_lines'
 options for C engine

---
 pandas/_libs/parsers.pyx | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 63fb1ba82ab6c..3abb69d34dfb8 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -461,7 +461,21 @@ cdef class TextReader:
                 raise ValueError("Only length-1 comment characters supported")
             self.parser.commentchar = <char>ord(comment)
 
-        self.parser.on_bad_lines = on_bad_lines
+        if isinstance(on_bad_lines, str):
+            if on_bad_lines == 'error':
+                c_on_bad_lines = ERROR
+            elif on_bad_lines == 'warn':
+                c_on_bad_lines = WARN
+            elif on_bad_lines == 'skip':
+                c_on_bad_lines = SKIP
+            # Note: can add 'skip_with_log' here later when we work on logging
+            else:
+                raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}")
+        else:
+            # If it's not a string, assume it's already an integer/enum constant (like ERROR)
+            c_on_bad_lines = on_bad_lines
+
+        self.parser.on_bad_lines = c_on_bad_lines
 
         self.skiprows = skiprows
         if skiprows is not None:

From 75db71cd48248b20af983e5228037940530f827f Mon Sep 17 00:00:00 2001
From: Hazem Elsayed <helsayed@avey.ai>
Date: Mon, 8 Dec 2025 17:37:50 +0300
Subject: [PATCH 4/9] FIX: read_csv: Re-convert to target dtype after filtering
 bad lines

---
 pandas/_libs/parsers.pyx | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index 3abb69d34dfb8..a11353464674d 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -1099,6 +1099,16 @@ cdef class TextReader:
                         stacklevel=find_stack_level()
                     )
 
+            # Re-convert failed columns to their target dtype now that bad rows
+            # have been removed. All remaining values should be convertible.
+            for col_idx, target_dtype in failed_columns_dtypes.items():
+                try:
+                    results[col_idx] = np.array(results[col_idx]).astype(target_dtype)
+                except (ValueError, TypeError, OverflowError):
+                    # If conversion still fails, keep as string (shouldn't happen
+                    # if _identify_bad_rows worked correctly, but be defensive)
+                    pass
+
         self.parser_start += end - start
 
         return results

From 2b00ad2979b94587b9d585028834a3388aa72fec Mon Sep 17 00:00:00 2001
From: Hazem Elsayed <helsayed@avey.ai>
Date: Mon, 8 Dec 2025 17:37:55 +0300
Subject: [PATCH 5/9] TST: Add tests for on_bad_lines with dtype conversion
 failures #63168

---
 .../io/parser/common/test_read_errors.py      | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index b8cf435ef0443..36e1ecef6590e 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -313,3 +313,111 @@ def test_on_bad_lines_warn_correct_formatting(all_parsers):
     ):
         result = parser.read_csv(StringIO(data), on_bad_lines="warn")
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "on_bad_lines,should_warn",
+    [
+        ("skip", False),
+        ("warn", True),
+    ],
+)
+def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_warn):
+    # GH#63168 - on_bad_lines should handle dtype conversion failures
+    parser = c_parser_only
+    data = "col1,col2,col3\n1,2,3\na,4,5\n4,5,6"
+
+    if should_warn:
+        with tm.assert_produces_warning(
+            ParserWarning,
+            match="Could not convert column|Skipped .* line",
+            check_stacklevel=False,
+        ):
+            result = parser.read_csv(
+                StringIO(data),
+                dtype={"col1": int, "col2": int, "col3": int},
+                on_bad_lines=on_bad_lines,
+            )
+    else:
+        result = parser.read_csv(
+            StringIO(data),
+            dtype={"col1": int, "col2": int, "col3": int},
+            on_bad_lines=on_bad_lines,
+        )
+
+    # Row with 'a' cannot convert to int, should be skipped
+    expected = DataFrame({"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_conversion_error(c_parser_only):
+    # GH#63168 - on_bad_lines='error' should raise on dtype conversion failure
+    parser = c_parser_only
+    data = "col1,col2\n1,2\na,3"
+
+    with pytest.raises(ValueError, match="invalid literal for int"):
+        parser.read_csv(
+            StringIO(data),
+            dtype={"col1": int, "col2": int},
+            on_bad_lines="error",
+        )
+
+
+def test_on_bad_lines_dtype_float_conversion(c_parser_only):
+    # GH#63168 - Float dtype with non-numeric values
+    parser = c_parser_only
+    data = "a,b\n1.5,2.5\nfoo,3.5\n4.5,5.5"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": float, "b": float},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame({"a": [1.5, 4.5], "b": [2.5, 5.5]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_partial_columns(c_parser_only):
+    # GH#63168 - Only some columns have dtype specified
+    parser = c_parser_only
+    data = "a,b,c\n1,hello,3\nfoo,world,6\n4,test,9"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": int, "c": int},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame({"a": [1, 4], "b": ["hello", "test"], "c": [3, 9]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
+    # GH#63168 - Mix of structural errors (wrong field count) and dtype errors
+    parser = c_parser_only
+    data = "a,b,c\n1,2,3\nwrong_field_count\nfoo,4,5\n6,7,8"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": int, "b": int, "c": int},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]})
+    tm.assert_frame_equal(result, expected)
+
+
+def test_on_bad_lines_dtype_all_bad_rows(c_parser_only):
+    # GH#63168 - All data rows fail conversion
+    parser = c_parser_only
+    data = "a,b\nfoo,bar\nbaz,qux"
+
+    result = parser.read_csv(
+        StringIO(data),
+        dtype={"a": int, "b": int},
+        on_bad_lines="skip",
+    )
+
+    expected = DataFrame({"a": [], "b": []}).astype(int)
+    tm.assert_frame_equal(result, expected)

From 37f45927b315cbcad6d396e714ba896655993e11 Mon Sep 17 00:00:00 2001
From: Nasser M <123868652+nejail@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:18:11 +0300
Subject: [PATCH 6/9] DOC: add whatsnew entry for on_bad_lines dtype fix #63168

---
 doc/source/whatsnew/v3.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index bebd928924214..1d00d5842ae92 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -1417,6 +1417,7 @@ Other
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
 - Bug when calling :py:func:`copy.copy` on a :class:`DataFrame` or :class:`Series` which would return a deep copy instead of a shallow copy (:issue:`62971`)
+- Fixed bug in :func:`read_csv` where ``on_bad_lines="skip"`` would not skip rows that failed dtype conversion (:issue:`63168`)
 - Fixed bug in the :meth:`Series.rank` with object dtype and extremely small float values (:issue:`62036`)
 - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
 - Accessing the underlying NumPy array of a DataFrame or Series will return a read-only

From 1afd32b67a6588cc30a8aa84ca6df3a60a5ba2b0 Mon Sep 17 00:00:00 2001
From: Nasser M <123868652+nejail@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:30:00 +0300
Subject: [PATCH 7/9] Removing unnecessary comments

---
 pandas/_libs/parsers.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index a11353464674d..cefe289b51a86 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -472,7 +472,6 @@ cdef class TextReader:
             else:
                 raise ValueError(f"Invalid value for on_bad_lines: {on_bad_lines}")
         else:
-            # If it's not a string, assume it's already an integer/enum constant (like ERROR)
             c_on_bad_lines = on_bad_lines
 
         self.parser.on_bad_lines = c_on_bad_lines

From d1b2eb4d094132a8c8e9c809d2f0a0a5c65d40a3 Mon Sep 17 00:00:00 2001
From: Nasser M <123868652+nejail@users.noreply.github.com>
Date: Tue, 9 Dec 2025 20:40:54 +0300
Subject: [PATCH 8/9] Fix string quotes in parsers.pyx for pre-commit.ci

---
 pandas/_libs/parsers.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
index cefe289b51a86..38d333fa6c395 100644
--- a/pandas/_libs/parsers.pyx
+++ b/pandas/_libs/parsers.pyx
@@ -462,11 +462,11 @@ cdef class TextReader:
             self.parser.commentchar = <char>ord(comment)
 
         if isinstance(on_bad_lines, str):
-            if on_bad_lines == 'error':
+            if on_bad_lines == "error":
                 c_on_bad_lines = ERROR
-            elif on_bad_lines == 'warn':
+            elif on_bad_lines == "warn":
                 c_on_bad_lines = WARN
-            elif on_bad_lines == 'skip':
+            elif on_bad_lines == "skip":
                 c_on_bad_lines = SKIP
             # Note: can add 'skip_with_log' here later when we work on logging
             else:
@@ -1482,7 +1482,7 @@ def _identify_bad_rows(values, dtype):
     """
     Identify the row indices when values cannot be converted to the intended target
 
-    This can be used to find rows that should be skipped when on_bad_lines='skip'
+    This can be used to find rows that should be skipped when on_bad_lines="skip"
 
     Parameters
     ----------

From 1cf090fd8bdf5cc4bfaadff7a50f8c6e963fdfd7 Mon Sep 17 00:00:00 2001
From: Hazem Elsayed <helsayed@avey.ai>
Date: Wed, 10 Dec 2025 09:58:40 +0300
Subject: [PATCH 9/9] FIX: add explicit int sizes to tests

---
 .../io/parser/common/test_read_errors.py      | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/pandas/tests/io/parser/common/test_read_errors.py b/pandas/tests/io/parser/common/test_read_errors.py
index 36e1ecef6590e..00ddf084bbefb 100644
--- a/pandas/tests/io/parser/common/test_read_errors.py
+++ b/pandas/tests/io/parser/common/test_read_errors.py
@@ -335,18 +335,20 @@ def test_on_bad_lines_dtype_conversion_skip(c_parser_only, on_bad_lines, should_
         ):
             result = parser.read_csv(
                 StringIO(data),
-                dtype={"col1": int, "col2": int, "col3": int},
+                dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64},
                 on_bad_lines=on_bad_lines,
             )
     else:
         result = parser.read_csv(
             StringIO(data),
-            dtype={"col1": int, "col2": int, "col3": int},
+            dtype={"col1": np.int64, "col2": np.int64, "col3": np.int64},
             on_bad_lines=on_bad_lines,
         )
 
     # Row with 'a' cannot convert to int, should be skipped
-    expected = DataFrame({"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]})
+    expected = DataFrame(
+        {"col1": [1, 4], "col2": [2, 5], "col3": [3, 6]}, dtype=np.int64
+    )
     tm.assert_frame_equal(result, expected)
 
 
@@ -358,7 +360,7 @@ def test_on_bad_lines_dtype_conversion_error(c_parser_only):
     with pytest.raises(ValueError, match="invalid literal for int"):
         parser.read_csv(
             StringIO(data),
-            dtype={"col1": int, "col2": int},
+            dtype={"col1": np.int64, "col2": np.int64},
             on_bad_lines="error",
         )
 
@@ -385,11 +387,17 @@ def test_on_bad_lines_dtype_partial_columns(c_parser_only):
 
     result = parser.read_csv(
         StringIO(data),
-        dtype={"a": int, "c": int},
+        dtype={"a": np.int64, "c": np.int64},
         on_bad_lines="skip",
     )
 
-    expected = DataFrame({"a": [1, 4], "b": ["hello", "test"], "c": [3, 9]})
+    expected = DataFrame(
+        {
+            "a": np.array([1, 4], dtype=np.int64),
+            "b": ["hello", "test"],
+            "c": np.array([3, 9], dtype=np.int64),
+        }
+    )
     tm.assert_frame_equal(result, expected)
 
 
@@ -400,11 +408,11 @@ def test_on_bad_lines_dtype_mixed_errors(c_parser_only):
 
     result = parser.read_csv(
         StringIO(data),
-        dtype={"a": int, "b": int, "c": int},
+        dtype={"a": np.int64, "b": np.int64, "c": np.int64},
         on_bad_lines="skip",
     )
 
-    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]})
+    expected = DataFrame({"a": [1, 6], "b": [2, 7], "c": [3, 8]}, dtype=np.int64)
     tm.assert_frame_equal(result, expected)
 
 
@@ -415,9 +423,9 @@ def test_on_bad_lines_dtype_all_bad_rows(c_parser_only):
 
     result = parser.read_csv(
         StringIO(data),
-        dtype={"a": int, "b": int},
+        dtype={"a": np.int64, "b": np.int64},
         on_bad_lines="skip",
     )
 
-    expected = DataFrame({"a": [], "b": []}).astype(int)
+    expected = DataFrame({"a": [], "b": []}).astype(np.int64)
     tm.assert_frame_equal(result, expected)