Skip to content

Commit 9e72a7d

Browse files
authored
feat(preprod): skip renames from showing in diff (#104306)
This prevents renames (same file hash, different path) from showing up in the comparison results. A recent Xcode 26 change made renames far more common, since every compilation now changes a generated UUID in app icon image paths.
1 parent 4598b29 commit 9e72a7d

File tree

3 files changed

+653
-49
lines changed

3 files changed

+653
-49
lines changed

src/sentry/preprod/size_analysis/compare.py

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from __future__ import annotations
22

33
import logging
4+
from typing import NamedTuple
45

56
from packaging.version import InvalidVersion
67
from packaging.version import parse as parse_version
@@ -10,6 +11,8 @@
1011
ComparisonResults,
1112
DiffItem,
1213
DiffType,
14+
FileAnalysis,
15+
FileInfo,
1316
SizeAnalysisResults,
1417
SizeMetricDiffItem,
1518
TreemapElement,
@@ -42,6 +45,11 @@ def compare_size_analysis(
4245

4346
diff_items = []
4447

48+
head_renamed_paths, base_renamed_paths = _find_renamed_paths(
49+
head_size_analysis_results.file_analysis,
50+
base_size_analysis_results.file_analysis,
51+
)
52+
4553
if not skip_diff_item_comparison:
4654
for path in sorted(all_paths):
4755
head_elements = head_files.get(path, [])
@@ -77,6 +85,11 @@ def compare_size_analysis(
7785
# Process unmatched head elements (added)
7886
for head_element in unmatched_head:
7987
head_size = head_element.size
88+
89+
# Skip if this is a renamed file (same hash exists in base at different path)
90+
if path in head_renamed_paths:
91+
continue
92+
8093
if head_size == 0:
8194
continue
8295

@@ -94,6 +107,11 @@ def compare_size_analysis(
94107
# Process unmatched base elements (removed)
95108
for base_element in unmatched_base:
96109
base_size = base_element.size
110+
111+
# Skip if this is a renamed file (same hash exists in head at different path)
112+
if path in base_renamed_paths:
113+
continue
114+
97115
if base_size == 0:
98116
continue
99117

@@ -173,13 +191,20 @@ def _should_skip_diff_item_comparison(
173191
return has_mismatched_major or has_mismatched_minor
174192

175193

194+
class MatchedElements(NamedTuple):
    """Outcome of pairing treemap elements from head with those from base.

    As a ``NamedTuple`` it can still be unpacked positionally in the order
    ``matched_pairs, unmatched_head, unmatched_base``.
    """

    # Pairs of elements that were matched between head and base.
    matched_pairs: list[tuple[TreemapElement, TreemapElement]]
    # Head elements left without a base counterpart (reported as added).
    unmatched_head: list[TreemapElement]
    # Base elements left without a head counterpart (reported as removed).
    unmatched_base: list[TreemapElement]
200+
201+
176202
def _match_elements(
177203
head_elements: list[TreemapElement], base_elements: list[TreemapElement]
178-
) -> tuple[list[tuple[TreemapElement, TreemapElement]], list[TreemapElement], list[TreemapElement]]:
204+
) -> MatchedElements:
179205
"""
180206
Intelligently match elements from head and base when there are duplicates.
181207
For example, in iOS processing multiple images can map to the same file name.
182-
Returns: (matched_pairs, unmatched_head, unmatched_base)
183208
184209
Matching strategy:
185210
1. First, match by exact name and size
@@ -224,7 +249,7 @@ def _match_elements(
224249
elem for idx, elem in enumerate(base_elements) if idx not in matched_base_indices
225250
]
226251

227-
return matched_pairs, unmatched_head, unmatched_base
252+
return MatchedElements(matched_pairs, unmatched_head, unmatched_base)
228253

229254

230255
def _flatten_leaf_nodes(
@@ -248,3 +273,66 @@ def _flatten_leaf_nodes(
248273
items[child_path].extend(child_elements)
249274

250275
return items
276+
277+
278+
def _find_renamed_paths(
    head_file_analysis: FileAnalysis | None,
    base_file_analysis: FileAnalysis | None,
) -> tuple[set[str], set[str]]:
    """Identify paths that look like renames between base and head.

    A rename is a file hash that appears at a path only present in head and
    also at a path only present in base. When the two sides have a different
    number of such paths (for example a file was renamed *and* duplicated),
    only ``min(head, base)`` paths per side are treated as renames; the
    remainder stay visible as genuine additions or removals.

    Returns:
        ``(head_renamed_paths, base_renamed_paths)``.
    """
    paths_by_hash_head = _build_hash_to_paths(head_file_analysis)
    paths_by_hash_base = _build_hash_to_paths(base_file_analysis)

    renamed_in_head: set[str] = set()
    renamed_in_base: set[str] = set()

    # A hash missing from either side can never form a rename pair, so only
    # hashes present on both sides need to be examined.
    for content_hash in paths_by_hash_head.keys() & paths_by_hash_base.keys():
        head_paths = paths_by_hash_head[content_hash]
        base_paths = paths_by_hash_base[content_hash]

        # Sorting keeps the choice of which paths count as renames deterministic.
        exclusive_to_head = sorted(head_paths - base_paths)
        exclusive_to_base = sorted(base_paths - head_paths)

        # e.g. 1 base path + 3 head paths = 1 rename + 2 additions.
        # The count is 0 when either side has no exclusive path, which makes
        # both slices below empty.
        pair_count = min(len(exclusive_to_head), len(exclusive_to_base))
        renamed_in_head.update(exclusive_to_head[:pair_count])
        renamed_in_base.update(exclusive_to_base[:pair_count])

    return renamed_in_head, renamed_in_base
309+
310+
311+
def _build_hash_to_paths(file_analysis: FileAnalysis | None) -> dict[str, set[str]]:
    """Map every file hash in ``file_analysis`` to the set of paths carrying it.

    Returns an empty mapping when ``file_analysis`` is missing (falsy).
    """
    paths_by_hash: dict[str, set[str]] = {}
    if file_analysis:
        for entry in file_analysis.items:
            _collect_file_hashes(entry, paths_by_hash)
    return paths_by_hash
319+
320+
321+
def _collect_file_hashes(
    file_info: FileInfo,
    hash_to_paths: dict[str, set[str]],
    parent_path: str = "",
) -> None:
    """Record the hash and full path of every leaf under ``file_info``.

    ``hash_to_paths`` is updated in place. Entries that have children are
    not recorded themselves; only their descendants without children are.

    Args:
        file_info: Entry to walk.
        hash_to_paths: Accumulator mapping a hash to the paths that have it.
        parent_path: Full path of the enclosing entry, or ``""`` at the top.
    """
    own_path = file_info.path
    # A path is used as-is at the top level or when it already carries the
    # parent prefix; otherwise it is joined onto the parent.
    already_qualified = not parent_path or own_path.startswith(f"{parent_path}/")
    full_path = own_path if already_qualified else f"{parent_path}/{own_path}"

    if file_info.children:
        # Asset catalogs can have children
        for child in file_info.children:
            _collect_file_hashes(child, hash_to_paths, full_path)
        return

    hash_to_paths.setdefault(file_info.hash, set()).add(full_path)

src/sentry/preprod/size_analysis/models.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88

99
###
1010
# Size analysis results (non-comparison)
11+
# Keep in sync with https://github.com/getsentry/launchpad/blob/main/src/launchpad/size/models/common.py#L92
1112
###
1213

1314

1415
class TreemapElement(BaseModel):
16+
1517
model_config = ConfigDict(frozen=True)
1618

1719
name: str
@@ -26,12 +28,36 @@ class TreemapElement(BaseModel):
2628
class TreemapResults(BaseModel):
    """Full treemap output of a size analysis run (immutable once built)."""

    model_config = ConfigDict(frozen=True)

    # Top of the treemap hierarchy.
    root: TreemapElement
    file_count: int
    # Nested name -> name -> integer mapping.
    # NOTE(review): exact key semantics are not visible here - confirm upstream.
    category_breakdown: dict[str, dict[str, int]]
    platform: str
3337

3438

39+
class FileInfo(BaseModel):
    """Minimal per-file record used for hash-based rename detection.

    Deliberately carries only what rename detection needs; other details
    (size, file_type, etc.) are available in the treemap instead.
    """

    model_config = ConfigDict(frozen=True)

    path: str
    hash: str
    # Nested entries; asset catalogs can have children.
    children: list[FileInfo] = []
51+
52+
53+
class FileAnalysis(BaseModel):
    """File and directory analysis results for an app bundle (immutable)."""

    model_config = ConfigDict(frozen=True)

    # Top-level entries; each may nest further entries via ``children``.
    items: list[FileInfo]
59+
60+
3561
class AppComponent(BaseModel):
3662
"""Information about a modular app component (watch app, app extension, dynamic feature, etc.)."""
3763

@@ -45,14 +71,17 @@ class AppComponent(BaseModel):
4571
install_size: int
4672

4773

48-
# Keep in sync with https://github.com/getsentry/launchpad/blob/main/src/launchpad/size/models/common.py#L92
4974
class SizeAnalysisResults(BaseModel):
    """Top-level result of a single (non-comparison) size analysis.

    The optional sections default to ``None`` so that a payload omitting
    them still validates.
    """

    model_config = ConfigDict(frozen=True)

    analysis_duration: float
    download_size: int
    install_size: int
    treemap: TreemapResults | None = None
    analysis_version: str | None = None
    # Per-file hashes; consumed by rename detection when comparing builds.
    file_analysis: FileAnalysis | None = None
    app_components: list[AppComponent] | None = None
5685

5786

5887
###

0 commit comments

Comments
 (0)