Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ Other enhancements
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
- Switched wheel upload to **PyPI Trusted Publishing** (OIDC) for release-tag pushes in ``wheels.yml``. (:issue:`61718`)
- Added a new :meth:`DataFrame.from_arrow` method to import any Arrow-compatible
tabular data object into a pandas :class:`DataFrame` through the
`Arrow PyCapsule Protocol <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`__ (:issue:`59631`)

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
Expand Down
40 changes: 40 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,4 +533,44 @@ def closed(self) -> bool:

# Alias for a value that is either any hashable object or ``None``.
SliceType: TypeAlias = Hashable | None


# Arrow PyCapsule Interface
# from https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#protocol-typehints


class ArrowArrayExportable(Protocol):
    """
    Structural type for objects that define ``__arrow_c_array__``.

    Defining this method marks an object as Arrow-compatible: it implements
    the `Arrow PyCapsule Protocol`_, which exposes the
    `Arrow C Data Interface`_ in Python and enables zero-copy Arrow data
    interchange across libraries.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Data Interface: https://arrow.apache.org/docs/format/CDataInterface.html
    """

    def __arrow_c_array__(
        self,
        requested_schema: object | None = None,
    ) -> tuple[object, object]:
        # Protocol stub: implementers return a pair of objects.
        ...


class ArrowStreamExportable(Protocol):
    """
    An object with an ``__arrow_c_stream__`` method.

    This method indicates the object is an Arrow-compatible object implementing
    the `Arrow PyCapsule Protocol`_ (exposing the `Arrow C Stream Interface`_
    in Python), enabling zero-copy Arrow data interchange across libraries.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
    .. _Arrow C Stream Interface: https://arrow.apache.org/docs/format/CStreamInterface.html
    """

    # Protocol stub: implementers return a single object for the stream.
    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...


# Only ``type_t`` is listed as public, so ``from ... import *`` exposes just it.
__all__ = ["type_t"]
52 changes: 52 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@
AnyAll,
AnyArrayLike,
ArrayLike,
ArrowArrayExportable,
ArrowStreamExportable,
Axes,
Axis,
AxisInt,
Expand Down Expand Up @@ -1840,6 +1842,56 @@ def __rmatmul__(self, other) -> DataFrame:
# ----------------------------------------------------------------------
# IO methods (to / from other formats)

@classmethod
def from_arrow(
    cls, data: ArrowArrayExportable | ArrowStreamExportable
) -> DataFrame:
    """
    Construct a DataFrame from a tabular Arrow object.

    This function accepts any Arrow-compatible tabular object implementing
    the `Arrow PyCapsule Protocol`_ (i.e. having an ``__arrow_c_array__``
    or ``__arrow_c_stream__`` method).

    This function currently relies on ``pyarrow`` to convert the tabular
    object in Arrow format to pandas.

    .. _Arrow PyCapsule Protocol: https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html

    .. versionadded:: 3.0

    Parameters
    ----------
    data : pyarrow.Table or Arrow-compatible table
        Any tabular object implementing the Arrow PyCapsule Protocol
        (i.e. has an ``__arrow_c_array__`` or ``__arrow_c_stream__``
        method).

    Returns
    -------
    DataFrame
        The result of ``pyarrow.Table.to_pandas`` on the imported table.
        ``cls`` is not consulted, so the returned object is whatever
        pyarrow produces.

    Raises
    ------
    TypeError
        If ``data`` is not a ``pyarrow.Table`` and has neither an
        ``__arrow_c_array__`` nor an ``__arrow_c_stream__`` method.

    Notes
    -----
    ``pyarrow`` is requested through ``import_optional_dependency`` with
    ``min_version="14.0.0"``.
    """
    pa = import_optional_dependency("pyarrow", min_version="14.0.0")

    # A pyarrow Table is already in the form we need; convert it directly.
    if isinstance(data, pa.Table):
        return data.to_pandas()

    if not (
        hasattr(data, "__arrow_c_array__") or hasattr(data, "__arrow_c_stream__")
    ):
        # explicitly test this, because otherwise we would accept various other
        # input types through the pa.table(..) call
        raise TypeError(
            "Expected an Arrow-compatible tabular object (i.e. having an "
            "'__arrow_c_array__' or '__arrow_c_stream__' method), got "
            f"'{type(data).__name__}' instead."
        )

    # Let pyarrow import the object through the PyCapsule protocol.
    return pa.table(data).to_pandas()

@classmethod
def from_dict(
cls,
Expand Down
44 changes: 44 additions & 0 deletions pandas/tests/frame/test_arrow_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm

pa = pytest.importorskip("pyarrow")

Expand Down Expand Up @@ -45,3 +46,46 @@ def test_dataframe_to_arrow(using_infer_string):
table = pa.RecordBatchReader.from_stream(df, schema=schema).read_all()
expected = expected.cast(schema)
assert table.equals(expected)


class ArrowArrayWrapper:
    """Test helper that exposes only ``__arrow_c_array__`` of a wrapped object."""

    def __init__(self, batch):
        # Keep the wrapped object; export requests are forwarded to it.
        self.array = batch

    def __arrow_c_array__(self, requested_schema=None):
        """Delegate the export to the wrapped object's own method."""
        export = self.array.__arrow_c_array__
        return export(requested_schema)


class ArrowStreamWrapper:
    """Test helper that exposes only ``__arrow_c_stream__`` of a wrapped object."""

    def __init__(self, table):
        # Keep the wrapped object; export requests are forwarded to it.
        self.stream = table

    def __arrow_c_stream__(self, requested_schema=None):
        """Delegate the export to the wrapped object's own method."""
        export = self.stream.__arrow_c_stream__
        return export(requested_schema)


@td.skip_if_no("pyarrow", min_version="14.0")
def test_dataframe_from_arrow():
    # DataFrame.from_arrow should accept stream exporters, array exporters
    # and thin wrappers around them, and reject plain Python containers.

    # objects with __arrow_c_stream__
    table = pa.table({"a": [1, 2, 3], "b": ["a", "b", "c"]})

    result = pd.DataFrame.from_arrow(table)
    expected = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]})
    tm.assert_frame_equal(result, expected)

    # not only pyarrow objects are supported
    result = pd.DataFrame.from_arrow(ArrowStreamWrapper(table))
    tm.assert_frame_equal(result, expected)

    # objects with __arrow_c_array__
    batch = pa.record_batch([[1, 2, 3], ["a", "b", "c"]], names=["a", "b"])

    # pass the record batch itself (not the table again) so that the
    # non-Table pyarrow input path is actually exercised
    result = pd.DataFrame.from_arrow(batch)
    tm.assert_frame_equal(result, expected)

    result = pd.DataFrame.from_arrow(ArrowArrayWrapper(batch))
    tm.assert_frame_equal(result, expected)

    # only accept actual Arrow objects
    with pytest.raises(TypeError, match="Expected an Arrow-compatible tabular object"):
        pd.DataFrame.from_arrow({"a": [1, 2, 3], "b": ["a", "b", "c"]})
Loading