41 changes: 36 additions & 5 deletions pandas/core/sorting.py
@@ -680,14 +680,45 @@ def compress_group_index(
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+    import sys
+
+    # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
+    is_sorted = len(group_index) and np.all(
+        group_index[1:] >= group_index[:-1]
+    )
+    if sys.version_info >= (3, 14) or is_sorted:
         # GH 53806: fast path for sorted group_index
+        # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
+        if len(group_index) == 0:
+            empty_arr = np.array([], dtype=np.int64)
+            return ensure_int64(empty_arr), ensure_int64(empty_arr)
+
+        # Sort if needed
+        if not np.all(group_index[1:] >= group_index[:-1]):
+            sorted_idx = np.argsort(group_index, kind="stable")
+            sorted_group_index = group_index[sorted_idx]
+            unsort_idx = np.empty_like(sorted_idx)
+            unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
+        else:
+            sorted_group_index = group_index
+            unsort_idx = None
+
         unique_mask = np.concatenate(
-            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+            [
+                sorted_group_index[:1] > -1,
+                sorted_group_index[1:] != sorted_group_index[:-1],
+            ]
         )
-        comp_ids = unique_mask.cumsum()
-        comp_ids -= 1
-        obs_group_ids = group_index[unique_mask]
+        comp_ids_sorted = unique_mask.cumsum() - 1
+        obs_group_ids = sorted_group_index[unique_mask]
+
+        if unsort_idx is not None:
+            comp_ids = comp_ids_sorted[unsort_idx]
+        else:
+            comp_ids = comp_ids_sorted
+
+        if sort and not np.all(obs_group_ids[1:] >= obs_group_ids[:-1]):
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
     else:
         size_hint = len(group_index)
         table = hashtable.Int64HashTable(size_hint)
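The new branch swaps the `Int64HashTable` factorization for a pure-NumPy one: stable-sort the labels so equal values form contiguous runs, mark the first element of each run, number the runs with a cumulative sum, and scatter the group ids back through the inverse permutation. A minimal standalone sketch of that technique (plain NumPy; `compress_labels` is a hypothetical name, not the pandas API; `-1` is treated as a missing label, as in the diff):

```python
import numpy as np

def compress_labels(group_index: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Sketch of the numpy fast path: factorize sorted labels, then unsort."""
    if len(group_index) == 0:
        empty = np.array([], dtype=np.int64)
        return empty, empty

    # Stable sort so equal labels form contiguous runs.
    sorted_idx = np.argsort(group_index, kind="stable")
    sorted_labels = group_index[sorted_idx]

    # Inverse permutation: where each original element landed in sorted order.
    unsort_idx = np.empty_like(sorted_idx)
    unsort_idx[sorted_idx] = np.arange(len(sorted_idx))

    # True at the start of every new run; a leading -1 (missing) run stays False.
    unique_mask = np.concatenate(
        [sorted_labels[:1] > -1, sorted_labels[1:] != sorted_labels[:-1]]
    )
    comp_ids_sorted = unique_mask.cumsum() - 1  # group number per sorted element
    obs_group_ids = sorted_labels[unique_mask]  # one label per observed group

    return comp_ids_sorted[unsort_idx], obs_group_ids

labels = np.array([30, 10, 30, -1, 20], dtype=np.int64)
comp_ids, obs = compress_labels(labels)
print(comp_ids)  # [ 2  0  2 -1  1]
print(obs)       # [10 20 30]
```

One design note: the stable sort makes this path O(n log n), whereas the hashtable path is expected O(n), which is presumably why the existing fast path was only taken for already-sorted input.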
39 changes: 39 additions & 0 deletions pandas/tests/reshape/test_pivot.py
@@ -2959,3 +2959,42 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_large_dataset_no_duplicates(self):
+        # GH 63314: pivot_table with large datasets should not produce
+        # duplicate indices. This test ensures the Python 3.14 fix works.
+        n_indices = 10000
+        metrics = ["apple", "banana", "coconut"]
+
+        data = [
+            {"idx": f"id_{i}", "metric": metric, "value": i * 10 + len(metric)}
+            for i in range(n_indices)
+            for metric in metrics
+        ]
+
+        df = DataFrame(data)
+
+        result = df.pivot_table(
+            index=["idx"],
+            columns="metric",
+            values="value",
+            aggfunc="first",
+        )
+
+        # Verify no duplicate indices in the result
+        n_unique = len(result.index.unique())
+        assert len(result.index) == n_unique, (
+            f"Expected {n_unique} unique indices, got {len(result.index)}"
+        )
+
+        # Verify we have the expected number of rows
+        assert len(result) == n_indices, (
+            f"Expected {n_indices} rows, got {len(result)}"
+        )
+
+        # Verify all expected indices are present
+        expected_indices = {f"id_{i}" for i in range(n_indices)}
+        actual_indices = set(result.index)
+        assert expected_indices == actual_indices, (
+            "Result indices don't match expected indices"
+        )
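The test guards the invariant at the DataFrame level: each `idx` label yields exactly one pivoted row. At the array level, the property `compress_group_index` must uphold is that of a factorization: every observed group id appears exactly once, and expanding `comp_ids` back through `obs_group_ids` reproduces the original labels. A self-contained round-trip check in plain NumPy (an illustration, not part of the PR; for non-negative labels, `np.unique(..., return_inverse=True)` computes the same pair as the sorted fast path):

```python
import numpy as np

# Property check mirroring the pivot test at array level: compressing
# labels and expanding them back must round-trip exactly.
rng = np.random.default_rng(0)
labels = rng.integers(0, 10_000, size=30_000).astype(np.int64)

# For non-negative labels this matches the fast path's (obs, comp) output.
obs_group_ids, comp_ids = np.unique(labels, return_inverse=True)

assert len(np.unique(obs_group_ids)) == len(obs_group_ids)  # no duplicate groups
np.testing.assert_array_equal(obs_group_ids[comp_ids], labels)  # round-trip holds
```

If the compression step ever emitted the same group twice, the duplicate would surface exactly as the repeated pivot index labels that GH 63314 reports.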