diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 672672490996d..8e84a227418ad 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -62,7 +62,8 @@ def _get_pyarrow_options(self) -> None: "quotechar": "quote_char", } for pandas_name, pyarrow_name in mapping.items(): - if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: + option_value = self.kwds.get(pandas_name) + if option_value is not None: self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) # Date format handling @@ -80,22 +81,28 @@ def _get_pyarrow_options(self) -> None: date_format = None self.kwds["timestamp_parsers"] = date_format - self.parse_options = { - option_name: option_value - for option_name, option_value in self.kwds.items() - if option_value is not None - and option_name - in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines") - } + # Prefetch kwds used in parse_options to avoid per-iteration dict lookup + delimiter = self.kwds.get("delimiter") + quote_char = self.kwds.get("quote_char") + escape_char = self.kwds.get("escape_char") + ignore_empty_lines = self.kwds.get("ignore_empty_lines") + + self.parse_options = {} + if delimiter is not None: + self.parse_options["delimiter"] = delimiter + if quote_char is not None: + self.parse_options["quote_char"] = quote_char + if escape_char is not None: + self.parse_options["escape_char"] = escape_char + if ignore_empty_lines is not None: + self.parse_options["ignore_empty_lines"] = ignore_empty_lines on_bad_lines = self.kwds.get("on_bad_lines") if on_bad_lines is not None: if callable(on_bad_lines): self.parse_options["invalid_row_handler"] = on_bad_lines elif on_bad_lines == ParserBase.BadLineHandleMethod.ERROR: - self.parse_options["invalid_row_handler"] = ( - None # PyArrow raises an exception by default - ) + self.parse_options["invalid_row_handler"] = None elif on_bad_lines == ParserBase.BadLineHandleMethod.WARN: def handle_warning(invalid_row) -> str: @@ -111,27 +118,41 @@ def handle_warning(invalid_row) -> str: elif on_bad_lines == ParserBase.BadLineHandleMethod.SKIP: self.parse_options["invalid_row_handler"] = lambda _: "skip" - self.convert_options = { - option_name: option_value - for option_name, option_value in self.kwds.items() - if option_value is not None - and option_name - in ( - "include_columns", - "null_values", - "true_values", - "false_values", - "decimal_point", - "timestamp_parsers", - ) - } - self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] + # Prefetch and build convert_options dict directly for known option names + # This avoids recreating a larger intermediate dictionary + convert_options = {} + include_columns = self.kwds.get("include_columns") + if include_columns is not None: + convert_options["include_columns"] = include_columns + null_values = self.kwds.get("null_values") + if null_values is not None: + convert_options["null_values"] = null_values + true_values = self.kwds.get("true_values") + if true_values is not None: + convert_options["true_values"] = true_values + false_values = self.kwds.get("false_values") + if false_values is not None: + convert_options["false_values"] = false_values + decimal_point = self.kwds.get("decimal_point") + if decimal_point is not None: + convert_options["decimal_point"] = decimal_point + timestamp_parsers = self.kwds.get("timestamp_parsers") + if timestamp_parsers is not None: + convert_options["timestamp_parsers"] = timestamp_parsers + + # Efficient membership check for strings_can_be_null + convert_options["strings_can_be_null"] = ( + "" in null_values if null_values is not None else False + ) # autogenerated column names are prefixed with 'f' in pyarrow.csv - if self.header is None and "include_columns" in self.convert_options: - self.convert_options["include_columns"] = [ - f"f{n}" for n in self.convert_options["include_columns"] + if self.header is None and "include_columns" in convert_options: + convert_options["include_columns"] = [ + f"f{n}" for n in convert_options["include_columns"] ] + self.convert_options = convert_options + + # Compose read_options without extra getitem overhead self.read_options = { "autogenerate_column_names": self.header is None, "skip_rows": self.header