Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 50 additions & 29 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def _get_pyarrow_options(self) -> None:
"quotechar": "quote_char",
}
for pandas_name, pyarrow_name in mapping.items():
if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None:
option_value = self.kwds.get(pandas_name)
if option_value is not None:
self.kwds[pyarrow_name] = self.kwds.pop(pandas_name)

# Date format handling
Expand All @@ -80,22 +81,28 @@ def _get_pyarrow_options(self) -> None:
date_format = None
self.kwds["timestamp_parsers"] = date_format

self.parse_options = {
option_name: option_value
for option_name, option_value in self.kwds.items()
if option_value is not None
and option_name
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
}
# Prefetch kwds used in parse_options to avoid per-iteration dict lookup
delimiter = self.kwds.get("delimiter")
quote_char = self.kwds.get("quote_char")
escape_char = self.kwds.get("escape_char")
ignore_empty_lines = self.kwds.get("ignore_empty_lines")

self.parse_options = {}
if delimiter is not None:
self.parse_options["delimiter"] = delimiter
if quote_char is not None:
self.parse_options["quote_char"] = quote_char
if escape_char is not None:
self.parse_options["escape_char"] = escape_char
if ignore_empty_lines is not None:
self.parse_options["ignore_empty_lines"] = ignore_empty_lines

on_bad_lines = self.kwds.get("on_bad_lines")
if on_bad_lines is not None:
if callable(on_bad_lines):
self.parse_options["invalid_row_handler"] = on_bad_lines
elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR:
self.parse_options["invalid_row_handler"] = (
None # PyArrow raises an exception by default
)
self.parse_options["invalid_row_handler"] = None
elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN:

def handle_warning(invalid_row) -> str:
Expand All @@ -111,27 +118,41 @@ def handle_warning(invalid_row) -> str:
elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP:
self.parse_options["invalid_row_handler"] = lambda _: "skip"

self.convert_options = {
option_name: option_value
for option_name, option_value in self.kwds.items()
if option_value is not None
and option_name
in (
"include_columns",
"null_values",
"true_values",
"false_values",
"decimal_point",
"timestamp_parsers",
)
}
self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"]
# Prefetch and build convert_options dict directly for known option names
# This avoids recreating a larger intermediate dictionary
convert_options = {}
include_columns = self.kwds.get("include_columns")
if include_columns is not None:
convert_options["include_columns"] = include_columns
null_values = self.kwds.get("null_values")
if null_values is not None:
convert_options["null_values"] = null_values
true_values = self.kwds.get("true_values")
if true_values is not None:
convert_options["true_values"] = true_values
false_values = self.kwds.get("false_values")
if false_values is not None:
convert_options["false_values"] = false_values
decimal_point = self.kwds.get("decimal_point")
if decimal_point is not None:
convert_options["decimal_point"] = decimal_point
timestamp_parsers = self.kwds.get("timestamp_parsers")
if timestamp_parsers is not None:
convert_options["timestamp_parsers"] = timestamp_parsers

# Efficient membership check for strings_can_be_null
convert_options["strings_can_be_null"] = (
"" in null_values if null_values is not None else False
)
# autogenerated column names are prefixed with 'f' in pyarrow.csv
if self.header is None and "include_columns" in self.convert_options:
self.convert_options["include_columns"] = [
f"f{n}" for n in self.convert_options["include_columns"]
if self.header is None and "include_columns" in convert_options:
convert_options["include_columns"] = [
f"f{n}" for n in convert_options["include_columns"]
]

self.convert_options = convert_options

# Compose read_options without extra getitem overhead
self.read_options = {
"autogenerate_column_names": self.header is None,
"skip_rows": self.header
Expand Down