From d090db7fae2bfa720399e8502c148e8526b116ad Mon Sep 17 00:00:00 2001 From: Jixun Sun <160219251+AnonToky@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:28:20 +0800 Subject: [PATCH 1/3] Add tests for groupby dropna=False behavior Add tests to ensure groupby with dropna=False preserves NaN groups in both DataFrame and Series. --- tests/groupby/test_groupby_dropna.py | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 tests/groupby/test_groupby_dropna.py diff --git a/tests/groupby/test_groupby_dropna.py b/tests/groupby/test_groupby_dropna.py new file mode 100644 index 0000000000000..6c3921b06ee8d --- /dev/null +++ b/tests/groupby/test_groupby_dropna.py @@ -0,0 +1,41 @@ +import numpy as np +import pandas as pd +import pandas._testing as tm + +def test_groupby_dataframe_dropna_false_preserves_nan_group(): + # Ensure DataFrame.groupby(..., dropna=False) preserves NA entries as a single group + # Tests-only addition to lock current behavior (GHxxxx) + data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "val": [0, 1, 2, 3, 4]} + df = pd.DataFrame(data) + + gb_keepna = df.groupby("group", dropna=False) + result = gb_keepna.indices + + # expected: g1 -> [0,2], g2 -> [3], NaN -> [1,4] + expected = { + "g1": np.array([0, 2], dtype=np.intp), + "g2": np.array([3], dtype=np.intp), + np.nan: np.array([1, 4], dtype=np.intp), + } + + # Compare group indices allowing for np.nan key + for res_vals, exp_vals in zip(result.values(), expected.values()): + tm.assert_numpy_array_equal(res_vals, exp_vals) + # check there is an NaN key present + assert any(pd.isna(k) for k in result.keys()) + + +def test_groupby_series_dropna_false_preserves_nan_group(): + # Verify Series.groupby(..., dropna=False) also preserves NA groups + s = pd.Series([1, 2, 3, 4], index=["a", np.nan, "a", np.nan], name="s") + gb = s.groupby(level=0, dropna=False) + res = gb.indices + + expected = { + "a": np.array([0, 2], dtype=np.intp), + np.nan: np.array([1, 3], dtype=np.intp), + } + + for res_vals, exp_vals in zip(res.values(), expected.values()): + tm.assert_numpy_array_equal(res_vals, exp_vals) + assert any(pd.isna(k) for k in res.keys()) From 877ecd97271db54bee06eda8d48da5c79ddb0047 Mon Sep 17 00:00:00 2001 From: Jixun Sun <160219251+AnonToky@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:29:38 +0800 Subject: [PATCH 2/3] Delete tests/groupby directory --- tests/groupby/test_groupby_dropna.py | 41 ---------------------------- 1 file changed, 41 deletions(-) delete mode 100644 tests/groupby/test_groupby_dropna.py diff --git a/tests/groupby/test_groupby_dropna.py b/tests/groupby/test_groupby_dropna.py deleted file mode 100644 index 6c3921b06ee8d..0000000000000 --- a/tests/groupby/test_groupby_dropna.py +++ /dev/null @@ -1,41 +0,0 @@ -import numpy as np -import pandas as pd -import pandas._testing as tm - -def test_groupby_dataframe_dropna_false_preserves_nan_group(): - # Ensure DataFrame.groupby(..., dropna=False) preserves NA entries as a single group - # Tests-only addition to lock current behavior (GHxxxx) - data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "val": [0, 1, 2, 3, 4]} - df = pd.DataFrame(data) - - gb_keepna = df.groupby("group", dropna=False) - result = gb_keepna.indices - - # expected: g1 -> [0,2], g2 -> [3], NaN -> [1,4] - expected = { - "g1": np.array([0, 2], dtype=np.intp), - "g2": np.array([3], dtype=np.intp), - np.nan: np.array([1, 4], dtype=np.intp), - } - - # Compare group indices allowing for np.nan key - for res_vals, exp_vals in zip(result.values(), expected.values()): - tm.assert_numpy_array_equal(res_vals, exp_vals) - # check there is an NaN key present - assert any(pd.isna(k) for k in result.keys()) - - -def test_groupby_series_dropna_false_preserves_nan_group(): - # Verify Series.groupby(..., dropna=False) also preserves NA groups - s = pd.Series([1, 2, 3, 4], index=["a", np.nan, "a", np.nan], name="s") - gb = s.groupby(level=0, dropna=False) - res = gb.indices - - expected = { - "a": np.array([0, 2], dtype=np.intp), - np.nan: np.array([1, 3], dtype=np.intp), - } - - for res_vals, exp_vals in zip(res.values(), expected.values()): - tm.assert_numpy_array_equal(res_vals, exp_vals) - assert any(pd.isna(k) for k in res.keys()) From 32b6ee877960791f8935725f88bc1c1e3d22290c Mon Sep 17 00:00:00 2001 From: Jixun Sun <160219251+AnonToky@users.noreply.github.com> Date: Tue, 25 Nov 2025 20:14:33 +0800 Subject: [PATCH 3/3] Refactor tests to use HDFStore with tmp_path Updated tests to use HDFStore with tmp_path instead of ensure_clean_store. --- pandas/tests/io/pytables/test_errors.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/tests/io/pytables/test_errors.py b/pandas/tests/io/pytables/test_errors.py index 37e6eeb05deec..c444090ebfb3c 100644 --- a/pandas/tests/io/pytables/test_errors.py +++ b/pandas/tests/io/pytables/test_errors.py @@ -14,7 +14,6 @@ date_range, read_hdf, ) -from pandas.tests.io.pytables.common import ensure_clean_store from pandas.io.pytables import ( Term, @@ -24,14 +23,14 @@ pytestmark = [pytest.mark.single_cpu] -def test_pass_spec_to_storer(setup_path): +def test_pass_spec_to_storer(tmp_path, setup_path): df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD"), dtype=object), index=Index([f"i-{i}" for i in range(30)], dtype=object), ) - with ensure_clean_store(setup_path) as store: + with HDFStore(tmp_path / setup_path) as store: store.put("df", df) msg = ( "cannot pass a column specification when reading a Fixed format " @@ -47,19 +46,19 @@ def test_pass_spec_to_storer(setup_path): store.select("df", where=["columns=A"]) -def test_table_index_incompatible_dtypes(setup_path): +def test_table_index_incompatible_dtypes(tmp_path, setup_path): df1 = DataFrame({"a": [1, 2, 3]}) df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) - with ensure_clean_store(setup_path) as store: + with HDFStore(tmp_path / setup_path) as store: store.put("frame", df1, format="table") msg = re.escape("incompatible kind in col [integer - datetime64[ns]]") with pytest.raises(TypeError, match=msg): store.put("frame", df2, format="table", append=True) -def test_unimplemented_dtypes_table_columns(setup_path): - with ensure_clean_store(setup_path) as store: +def test_unimplemented_dtypes_table_columns(tmp_path, setup_path): + with HDFStore(tmp_path / setup_path) as store: dtypes = [("date", datetime.date(2001, 1, 2))] # currently not supported dtypes #### @@ -85,7 +84,7 @@ def test_unimplemented_dtypes_table_columns(setup_path): df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate() - with ensure_clean_store(setup_path) as store: + with HDFStore(tmp_path / setup_path) as store: # this fails because we have a date in the object block...... msg = "|".join( [ @@ -101,7 +100,7 @@ def test_unimplemented_dtypes_table_columns(setup_path): def test_invalid_terms(tmp_path, setup_path): - with ensure_clean_store(setup_path) as store: + with HDFStore(tmp_path / setup_path) as store: df = DataFrame( np.random.default_rng(2).standard_normal((10, 4)), columns=Index(list("ABCD"), dtype=object), @@ -162,14 +161,14 @@ def test_invalid_terms(tmp_path, setup_path): read_hdf(path, "dfq", where="A>0 or C>0") -def test_append_with_diff_col_name_types_raises_value_error(setup_path): +def test_append_with_diff_col_name_types_raises_value_error(tmp_path, setup_path): df = DataFrame(np.random.default_rng(2).standard_normal((10, 1))) df2 = DataFrame({"a": np.random.default_rng(2).standard_normal(10)}) df3 = DataFrame({(1, 2): np.random.default_rng(2).standard_normal(10)}) df4 = DataFrame({("1", 2): np.random.default_rng(2).standard_normal(10)}) df5 = DataFrame({("1", 2, object): np.random.default_rng(2).standard_normal(10)}) - with ensure_clean_store(setup_path) as store: + with HDFStore(tmp_path / setup_path) as store: name = "df_diff_valerror" store.append(name, df)