2 changes: 2 additions & 0 deletions _duckdb-stubs/__init__.pyi
@@ -721,6 +721,7 @@ class DuckDBPyRelation:
write_partition_columns: bool | None = None,
append: bool | None = None,
filename_pattern: str | None = None,
file_size_bytes: str | int | None = None,
) -> None: ...
def to_table(self, table_name: str) -> None: ...
def to_view(self, view_name: str, replace: bool = True) -> DuckDBPyRelation: ...
@@ -774,6 +775,7 @@ class DuckDBPyRelation:
write_partition_columns: bool | None = None,
append: bool | None = None,
filename_pattern: str | None = None,
file_size_bytes: str | int | None = None,
) -> None: ...
@property
def alias(self) -> str: ...
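For context, a minimal usage sketch of the new keyword from the Python side (the relation, output path, and size value are illustrative, and it assumes a build that includes this change; the two stub hunks suggest both `write_parquet` and `to_parquet` expose the keyword):

```python
import os
import pathlib
import tempfile

import duckdb

# A relation that is large enough for a small size limit to split the output.
rel = duckdb.sql("SELECT range AS i, repeat('x', 50) AS payload FROM range(100000)")

# Fresh, not-yet-existing target path, mirroring the tests below.
out_dir = os.path.join(tempfile.mkdtemp(), "parquet_out")

# file_size_bytes takes an integer byte count or a human-readable size string.
rel.to_parquet(out_dir, file_size_bytes="1M", per_thread_output=True)
print(sum(1 for _ in pathlib.Path(out_dir).iterdir()), "parquet files written")
```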
3 changes: 2 additions & 1 deletion src/duckdb_py/include/duckdb_python/pyrelation.hpp
@@ -214,7 +214,8 @@ struct DuckDBPyRelation {
const py::object &row_group_size = py::none(), const py::object &overwrite = py::none(),
const py::object &per_thread_output = py::none(), const py::object &use_tmp_file = py::none(),
const py::object &partition_by = py::none(), const py::object &write_partition_columns = py::none(),
const py::object &append = py::none(), const py::object &filename_pattern = py::none());
const py::object &append = py::none(), const py::object &filename_pattern = py::none(),
const py::object &file_size_bytes = py::none());

void ToCSV(const string &filename, const py::object &sep = py::none(), const py::object &na_rep = py::none(),
const py::object &header = py::none(), const py::object &quotechar = py::none(),
13 changes: 12 additions & 1 deletion src/duckdb_py/pyrelation.cpp
@@ -1214,7 +1214,7 @@ void DuckDBPyRelation::ToParquet(const string &filename, const py::object &compr
const py::object &overwrite, const py::object &per_thread_output,
const py::object &use_tmp_file, const py::object &partition_by,
const py::object &write_partition_columns, const py::object &append,
const py::object &filename_pattern) {
const py::object &filename_pattern, const py::object &file_size_bytes) {
case_insensitive_map_t<vector<Value>> options;

if (!py::none().is(compression)) {
@@ -1312,6 +1312,17 @@
options["filename_pattern"] = {Value(py::str(filename_pattern))};
}

if (!py::none().is(file_size_bytes)) {
if (py::isinstance<py::int_>(file_size_bytes)) {
int64_t file_size_bytes_int = py::int_(file_size_bytes);
options["file_size_bytes"] = {Value(file_size_bytes_int)};
} else if (py::isinstance<py::str>(file_size_bytes)) {
options["file_size_bytes"] = {Value(py::str(file_size_bytes))};
} else {
throw InvalidInputException("to_parquet only accepts 'file_size_bytes' as an integer or string");
}
}

auto write_parquet = rel->WriteParquetRel(filename, std::move(options));
PyExecuteRelation(write_parquet);
}
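To make the type handling above concrete, a small sketch of what a caller should observe (assuming the `InvalidInputException` raised here surfaces as `duckdb.InvalidInputException` on the Python side; `fresh_dir` is a hypothetical helper and all paths and values are illustrative):

```python
import os
import tempfile

import duckdb

rel = duckdb.sql("SELECT range AS i, repeat('x', 50) AS payload FROM range(10000)")


def fresh_dir() -> str:
    # A not-yet-existing target path inside a fresh temp directory.
    return os.path.join(tempfile.mkdtemp(), "out")


# Integers are passed through as a byte count, strings as a human-readable size.
rel.to_parquet(fresh_dir(), file_size_bytes=100_000, per_thread_output=True)
rel.to_parquet(fresh_dir(), file_size_bytes="1M", per_thread_output=True)

# Any other type is rejected before the COPY runs.
try:
    rel.to_parquet(fresh_dir(), file_size_bytes=1.5)
except duckdb.InvalidInputException as exc:
    print(exc)  # message: to_parquet only accepts 'file_size_bytes' as an integer or string
```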
2 changes: 1 addition & 1 deletion src/duckdb_py/pyrelation/initialize.cpp
@@ -37,7 +37,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
py::arg("overwrite") = py::none(), py::arg("per_thread_output") = py::none(),
py::arg("use_tmp_file") = py::none(), py::arg("partition_by") = py::none(),
py::arg("write_partition_columns") = py::none(), py::arg("append") = py::none(),
py::arg("filename_pattern") = py::none());
py::arg("filename_pattern") = py::none(), py::arg("file_size_bytes") = py::none());

DefineMethod(
{"to_csv", "write_csv"}, m, &DuckDBPyRelation::ToCSV, "Write the relation object to a CSV file in 'file_name'",
64 changes: 64 additions & 0 deletions tests/fast/api/test_to_parquet.py
@@ -225,3 +225,67 @@ def test_filename_pattern_with_uuid(self, pd):
        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
        expected = [("rei", 321.0, "a"), ("shinji", 123.0, "a"), ("asuka", 23.0, "b"), ("kaworu", 340.0, "c")]
        assert result.execute().fetchall() == expected

    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
    @pytest.mark.parametrize("file_size_bytes", [1000, "1k"])
    def test_file_size_bytes_basic(self, pd, file_size_bytes):
        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
        # Create a larger dataset to ensure multiple files are created
        df = pd.DataFrame(
            {
                "name": [f"name_{i}" for i in range(100)],
                "value": [i * 100.0 for i in range(100)],
                "description": [f"description_{i}_with_more_text" for i in range(100)],
            }
        )
        rel = duckdb.from_df(df)
        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes, per_thread_output=True)

        # Check that multiple files were created
        files = list(pathlib.Path(temp_file_name).iterdir())
        assert len(files) > 1, f"Expected multiple files, got {len(files)}"

        # Verify data integrity
        result = duckdb.read_parquet(f"{temp_file_name}/*.parquet")
        assert len(result.execute().fetchall()) == 100

    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
    def test_file_size_bytes_with_partition(self, pd):
        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
        # Create a dataset with enough data to trigger file splitting
        df = pd.DataFrame(
            {
                "name": [f"name_{i}" for i in range(100)],
                "value": [i * 100.0 for i in range(100)],
                "category": ["a" if i < 50 else "b" for i in range(100)],
                "description": [f"description_{i}_with_more_text_to_increase_size" for i in range(100)],
            }
        )
        rel = duckdb.from_df(df)
        rel.to_parquet(temp_file_name, partition_by=["category"], file_size_bytes="2k", per_thread_output=True)

        # Check that files were created in partition directories
        assert pathlib.Path(f"{temp_file_name}/category=a").exists()
        assert pathlib.Path(f"{temp_file_name}/category=b").exists()

        # Verify data integrity
        result = duckdb.sql(f"FROM read_parquet('{temp_file_name}/*/*.parquet', hive_partitioning=TRUE)")
        assert len(result.execute().fetchall()) == 100

    @pytest.mark.parametrize("pd", [NumpyPandas(), ArrowPandas()])
    @pytest.mark.parametrize("file_size_bytes", ["1M", "1G"])
    def test_file_size_bytes_human_readable(self, pd, file_size_bytes):
        temp_file_name = os.path.join(tempfile.mkdtemp(), next(tempfile._get_candidate_names()))  # noqa: PTH118
        df = pd.DataFrame(
            {
                "name": ["rei", "shinji", "asuka", "kaworu"],
                "float": [321.0, 123.0, 23.0, 340.0],
                "category": ["a", "a", "b", "c"],
            }
        )
        rel = duckdb.from_df(df)
        rel.to_parquet(temp_file_name, file_size_bytes=file_size_bytes)

        # With large file size limits, should create just one file
        parquet_rel = duckdb.read_parquet(temp_file_name)
        assert rel.execute().fetchall() == parquet_rel.execute().fetchall()
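Since the keyword is simply forwarded into the relation's copy options map, it should behave like the existing `FILE_SIZE_BYTES` option of `COPY ... TO` at the SQL level; a rough equivalent for comparison (hypothetical output directory, and the exact output file names are chosen by DuckDB):

```python
import os
import tempfile

import duckdb

out_dir = os.path.join(tempfile.mkdtemp(), "parquet_out")

# Roughly what rel.to_parquet(out_dir, file_size_bytes="1MB") would map to.
duckdb.execute(
    f"""
    COPY (SELECT range AS i, repeat('x', 50) AS payload FROM range(100000))
    TO '{out_dir}' (FORMAT PARQUET, FILE_SIZE_BYTES '1MB')
    """
)
print(duckdb.sql(f"SELECT count(*) FROM read_parquet('{out_dir}/*.parquet')").fetchall())
```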