diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3ab032e4..5eb14fe1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,5 +1,9 @@
 # Contributing to duckdb-python
 
+## Setting up a development environment
+
+See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
+
 ## General Guidelines
 
 ### **Did you find a bug?**
@@ -39,7 +43,3 @@
 ### Testing cross-platform and cross-Python
 
 * On your fork you can [run](https://docs.github.com/en/actions/using-workflows/manually-running-a-workflow#running-a-workflow) the Packaging workflow manually for any branch. You can choose whether to build for all platforms or a subset, and to either run the full testsuite, the fast tests only, or no tests at all.
-
-## Setting up a development environment
-
-See the [instructions on duckdb.org](https://duckdb.org/docs/stable/dev/building/python).
diff --git a/_duckdb-stubs/__init__.pyi b/_duckdb-stubs/__init__.pyi
index 6b323184..75389791 100644
--- a/_duckdb-stubs/__init__.pyi
+++ b/_duckdb-stubs/__init__.pyi
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     def to_table(self, table_name: str) -> None: ...
     def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
         write_partition_columns: bool | None = None,
         append: bool | None = None,
         filename_pattern: str | None = None,
+        file_size_bytes: str | int | None = None,
     ) -> None: ...
     @property
     def alias(self) -> str: ...
diff --git a/src/duckdb_py/include/duckdb_python/pyrelation.hpp b/src/duckdb_py/include/duckdb_python/pyrelation.hpp
index 06cf9e94..b1975e7f 100644
--- a/src/duckdb_py/include/duckdb_python/pyrelation.hpp
+++ b/src/duckdb_py/include/duckdb_python/pyrelation.hpp
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
 	               const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
 	               const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
 	               const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
-	               const py::object &append = py::none(), const py::object &filename_pattern = py::none());
+	               const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
+	               const py::object &file_size_bytes = py::none());
 
 	void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
 	           const py::object &header = py::none(), const py::object &quotechar = py::none(),
diff --git a/src/duckdb_py/pyrelation.cpp b/src/duckdb_py/pyrelation.cpp
index bbc7a2ec..58cfcc29 100644
--- a/src/duckdb_py/pyrelation.cpp
+++ b/src/duckdb_py/pyrelation.cpp
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
                                  const py::object &overwrite, const py::object &per_thread_output,
                                  const py::object &use_tmp_file, const py::object &partition_by,
                                  const py::object &write_partition_columns, const py::object &append,
-                                 const py::object &filename_pattern) {
+                                 const py::object &filename_pattern, const py::object &file_size_bytes) {
 	case_insensitive_map_t<vector<Value>> options;
 
 	if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
 		options["filename_pattern"] = {Value(py::str(filename_pattern))};
 	}
 
+	if (!py::none().is(file_size_bytes)) {
+		if (py::isinstance<py::int_>(file_size_bytes)) {
+			int64_t file_size_bytes_int = py::int_(file_size_bytes);
+			options["file_size_bytes"] = {Value(file_size_bytes_int)};
+		} else if (py::isinstance<py::str>(file_size_bytes)) {
+			options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
+		} else {
+			throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
+		}
+	}
+
 	auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
 	PyExecuteRelation(write_parquet);
 }
diff --git a/src/duckdb_py/pyrelation/initialize.cpp b/src/duckdb_py/pyrelation/initialize.cpp
index 7bfea441..35eeff40 100644
--- a/src/duckdb_py/pyrelation/initialize.cpp
+++ b/src/duckdb_py/pyrelation/initialize.cpp
@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
 	    py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
 	    py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
 	    py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
-	    py::arg("filename_pattern") = py::none());
+	    py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());
 
 	DefineMethod(
 	    {"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",
diff --git a/tests/fast/api/test_to_parquet.py b/tests/fast/api/test_to_parquet.py
index f0952e68..370ab8e4 100644
--- a/tests/fast/api/test_to_parquet.py
+++ b/tests/fast/api/test_to_parquet.py
@@ -225,3 +225,37 @@ def test_filename_pattern_with_uuid(self, pd):
         result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
         expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
         assert result.execute().fetchall() == expected
+
+    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
+    def test_file_size_bytes_basic(self, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+
+        # use same test data as external/duckdb/test/sql/copy/file_size_bytes.test
+        rel = duckdb.from_query("SELECT i AS col_a, i AS col_b FROM range(0,10000) tbl(i);")
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, row_group_size=2000)
+
+        # Check that multiple files were created
+        files = list(pathlib.Path(temp_file_name).iterdir())
+        assert len(files) > 1, f"Expected multiple files, got {len(files)}"
+
+        # Verify data integrity
+        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
+        assert len(result.execute().fetchall()) == 10000
+
+    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
+    @pytest.mark.parametrize("file_size_bytes", ["256MB", "1G"])
+    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
+        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
+        df = pd.DataFrame(
+            {
+                "name": ["rei", "shinji", "asuka", "kaworu"],
+                "float": [321.0, 123.0, 23.0, 340.0],
+                "category": ["a", "a", "b", "c"],
+            }
+        )
+        rel = duckdb.from_df(df)
+        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)
+
+        # With large file size limits, should create just one file
+        parquet_rel = duckdb.read_parquet(temp_file_name)
+        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
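
Usage sketch (not part of the patch above): a minimal illustration of the `file_size_bytes` keyword this diff adds to `DuckDBPyRelation.to_parquet()`. The output path handling and the "out" directory name are assumptions for the example, not taken from the diff; the parameter accepts either an integer byte count or a human-readable size string such as "1k" or "256MB", matching the values exercised by the new tests.

    import os
    import tempfile

    import duckdb

    # Non-existent target path: with file_size_bytes set, DuckDB writes a
    # directory of Parquet files, starting a new file once the current one
    # grows past (roughly) the requested size.
    out_dir = os.path.join(tempfile.mkdtemp(), "out")

    rel = duckdb.sql("SELECT i AS col_a, i AS col_b FROM range(0, 10000) tbl(i)")

    # Integer byte count; strings like "1k" or "256MB" are accepted as well.
    rel.to_parquet(out_dir, file_size_bytes=1000, row_group_size=2000)

    # Read the split output back and check that no rows were lost.
    rows = duckdb.read_parquet(os.path.join(out_dir, "*.parquet")).fetchall()
    assert len(rows) == 10000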