diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c96bb7f663368..21fcd256ff749 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1260,6 +1260,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`) - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. (:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) +- Bug in :meth:`DataFrame.groupby` methods failing on NumPy-nullable data when the NA mask was not C-contiguous (:issue:`61031`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9f8ff86cbcb7e..32b29ac7af252 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -819,7 +819,7 @@ def group_prod( int64_t[::1] counts, ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask, + const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint skipna=True, @@ -893,7 +893,7 @@ def group_var( const intp_t[::1] labels, Py_ssize_t min_count=-1, int64_t ddof=1, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", @@ -998,7 +998,7 @@ def group_skew( int64_t[::1] counts, ndarray[float64_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -1086,7 +1086,7 @@ def group_kurt( int64_t[::1] counts, ndarray[float64_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1]
result_mask=None, bint skipna=True, ) -> None: @@ -1180,7 +1180,7 @@ def group_mean( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -1324,7 +1324,7 @@ def group_ohlc( ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: """ @@ -1870,7 +1870,7 @@ cdef group_min_max( Py_ssize_t min_count=-1, bint is_datetimelike=False, bint compute_max=True, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ): @@ -1983,7 +1983,7 @@ def group_idxmin_idxmax( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, str name="idxmin", bint skipna=True, uint8_t[:, ::1] result_mask=None, @@ -2096,7 +2096,7 @@ def group_max( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -2124,7 +2124,7 @@ def group_min( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -2148,7 +2148,7 @@ def group_min( cdef group_cummin_max( numeric_t[:, ::1] out, ndarray[numeric_t, ndim=2] values, - const uint8_t[:, ::1] mask, + const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask, const intp_t[::1] labels, int ngroups, @@ -2264,7 +2264,7 @@ def group_cummin( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint 
skipna=True,
 ) -> None:
@@ -2290,7 +2290,7 @@ def group_cummax(
     const intp_t[::1] labels,
     int ngroups,
     bint is_datetimelike,
-    const uint8_t[:, ::1] mask=None,
+    const uint8_t[:, :] mask=None,
     uint8_t[:, ::1] result_mask=None,
     bint skipna=True,
 ) -> None:
diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py
index 2310c3bf59e15..7a012f5da4aa8 100644
--- a/pandas/tests/groupby/test_all_methods.py
+++ b/pandas/tests/groupby/test_all_methods.py
@@ -84,3 +84,26 @@ def test_dup_labels_output_shape(groupby_func, idx):
 
     assert result.shape == (1, 2)
     tm.assert_index_equal(result.columns, idx)
+
+
+def test_not_c_contiguous_mask(groupby_func):
+    """Check groupby methods agree when the NA mask is not C-contiguous."""
+    # https://github.com/pandas-dev/pandas/issues/61031
+    if groupby_func == "corrwith":
+        # corrwith is deprecated
+        return
+    df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64")
+    # Same rows as ``df``: built in reverse order, then flipped back with
+    # ``[::-1]`` (named ``df_reversed`` to avoid shadowing the builtin).
+    df_reversed = DataFrame(
+        {"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0]
+    )[::-1]
+    # Guard the premise of the test: this mask must not be C-contiguous.
+    assert not df_reversed["b"].array._mask.flags["C_CONTIGUOUS"]
+    args = get_groupby_method_args(groupby_func, df)
+
+    gb_reversed = df_reversed.groupby("a")
+    result = getattr(gb_reversed, groupby_func)(*args)
+    gb = df.groupby("a")
+    expected = getattr(gb, groupby_func)(*args)
+    tm.assert_equal(result, expected)