Skip to content

Commit aad9f1e

Browse files
Only 404 lint unique files (#15452)
<!-- Use this checklist to make sure your PR is ready for merge. You may delete any sections you don't need. --> ## DESCRIBE YOUR PR The 404 link checker GitHub Action was processing over 9,000 pages despite the repository containing only ~2,200 source files. The Sentry docs use a "common files" architecture where a single source file generates multiple URLs across platforms and guides and we were checking every generated page independently. Also made moves to what's bundled during the build to avoid Vercel's 250MB limit on serverless function size. **Implemented source-based deduplication using the existing /api/source-map endpoint:** - Fetch source-map that maps each slug to its source file - Track which source files have been checked - Only check one page per unique source file - Always check API-generated pages (no source file) **That gets us...** - ~72% faster - Checks ~2,200 unique pages instead of 9,000 - ~72% fewer HTTP requests - Massive reduction in server load - Same coverage - Still validates all pages, just efficiently ## IS YOUR CHANGE URGENT? Help us prioritize incoming PRs by letting us know when the change needs to go live. - [ ] Urgent deadline (GA date, etc.): <!-- ENTER DATE HERE --> - [ ] Other deadline: <!-- ENTER DATE HERE --> - [ ] None: Not urgent, can wait up to 1 week+ ## SLA - Teamwork makes the dream work, so please add a reviewer to your PRs. - Please give the docs team up to 1 week to review your PR unless you've added an urgent due date to it. Thanks in advance for your help! ## PRE-MERGE CHECKLIST *Make sure you've checked the following before merging your changes:* - [ ] Checked Vercel preview for correctness, including links - [ ] PR was reviewed and approved by any necessary SMEs (subject matter experts) - [ ] PR was reviewed and approved by a member of the [Sentry docs team](https://github.com/orgs/getsentry/teams/docs) ## LEGAL BOILERPLATE <!-- Sentry employees and contractors can delete or ignore this section. 
--> Look, I get it. The entity doing business as "Sentry" was incorporated in the State of Delaware in 2015 as Functional Software, Inc. and is gonna need some rights from me in order to utilize my contributions in this here PR. So here's the deal: I retain all rights, title and interest in and to my contributions, and by keeping this boilerplate intact I confirm that Sentry can use, modify, copy, and redistribute my contributions, under Sentry's choice of terms. ## EXTRA RESOURCES - [Sentry Docs contributor guide](https://docs.sentry.io/contributing/) --------- Co-authored-by: getsantry[bot] <66042841+getsantry[bot]@users.noreply.github.com>
1 parent 24da6c3 commit aad9f1e

File tree

6 files changed

+565
-13
lines changed

6 files changed

+565
-13
lines changed

app/api/source-map/route.ts

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
import {NextResponse} from 'next/server';
2+
3+
import {getDevDocsFrontMatter, getDocsFrontMatter} from 'sentry-docs/frontmatter';
4+
import {isDeveloperDocs} from 'sentry-docs/isDeveloperDocs';
5+
6+
/**
 * Route handler that responds with a JSON object keyed by page slug, where each
 * value is the path of the source file that page was generated from.
 *
 * The 404 link checker reads this mapping so it can skip pages that share a
 * source file with a page it has already checked.
 */
export async function GET() {
  // Developer docs and user docs expose their front matter through different loaders.
  const frontMatter = isDeveloperDocs
    ? await getDevDocsFrontMatter()
    : await getDocsFrontMatter();

  const slugToSource: Record<string, string | null> = {};

  for (const {slug, sourcePath} of frontMatter) {
    // Strip one leading and one trailing slash so keys line up with the
    // slugs produced by trimSlashes in scripts/lint-404s/main.ts.
    const key = slug.replace(/(^\/|\/$)/g, '');
    // Docs without a sourcePath (API-generated pages) are recorded as null.
    slugToSource[key] = sourcePath ?? null;
  }

  return NextResponse.json(slugToSource);
}

app/sitemap.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import type {MetadataRoute} from 'next';
22

3+
import {getDevDocsFrontMatter, getDocsFrontMatter} from 'sentry-docs/frontmatter';
34
import {isDeveloperDocs} from 'sentry-docs/isDeveloperDocs';
4-
import {getDevDocsFrontMatter, getDocsFrontMatter} from 'sentry-docs/mdx';
55

66
export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
77
if (isDeveloperDocs) {

next.config.ts

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ import {withSentryConfig} from '@sentry/nextjs';
44
import {REMOTE_IMAGE_PATTERNS} from './src/config/images';
55
import {redirects} from './redirects.js';
66

7+
// Exclude build-time-only dependencies from serverless function bundles to stay under
8+
// Vercel's 250MB limit. These packages (esbuild, mdx-bundler, sharp, etc.) are only
9+
// needed during the build process to compile MDX and optimize assets. The compiled
10+
// output is used at runtime, so bundling these ~150-200MB of dependencies would bloat
11+
// functions unnecessarily and cause deployment failures.
712
const outputFileTracingExcludes = process.env.NEXT_PUBLIC_DEVELOPER_DOCS
813
? {
914
'/**/*': [
@@ -13,6 +18,24 @@ const outputFileTracingExcludes = process.env.NEXT_PUBLIC_DEVELOPER_DOCS
1318
'./.next/cache/mdx-bundler/**/*',
1419
'./.next/cache/md-exports/**/*',
1520
'docs/**/*',
21+
// Exclude heavy build dependencies
22+
'node_modules/@esbuild/**/*',
23+
'node_modules/esbuild/**/*',
24+
'node_modules/@aws-sdk/**/*',
25+
'node_modules/@google-cloud/**/*',
26+
'node_modules/prettier/**/*',
27+
'node_modules/@prettier/**/*',
28+
'node_modules/sharp/**/*',
29+
'node_modules/mermaid/**/*',
30+
// Exclude MDX processing dependencies
31+
'node_modules/mdx-bundler/**/*',
32+
'node_modules/rehype-preset-minify/**/*',
33+
'node_modules/rehype-prism-plus/**/*',
34+
'node_modules/rehype-prism-diff/**/*',
35+
'node_modules/remark-gfm/**/*',
36+
'node_modules/remark-mdx-images/**/*',
37+
'node_modules/unified/**/*',
38+
'node_modules/rollup/**/*',
1639
],
1740
}
1841
: {
@@ -23,7 +46,24 @@ const outputFileTracingExcludes = process.env.NEXT_PUBLIC_DEVELOPER_DOCS
2346
'./.next/cache/md-exports/**/*',
2447
'./apps/**/*',
2548
'develop-docs/**/*',
26-
'node_modules/@esbuild/*',
49+
// Exclude heavy build dependencies
50+
'node_modules/@esbuild/**/*',
51+
'node_modules/esbuild/**/*',
52+
'node_modules/@aws-sdk/**/*',
53+
'node_modules/@google-cloud/**/*',
54+
'node_modules/prettier/**/*',
55+
'node_modules/@prettier/**/*',
56+
'node_modules/sharp/**/*',
57+
'node_modules/mermaid/**/*',
58+
// Exclude MDX processing dependencies
59+
'node_modules/mdx-bundler/**/*',
60+
'node_modules/rehype-preset-minify/**/*',
61+
'node_modules/rehype-prism-plus/**/*',
62+
'node_modules/rehype-prism-diff/**/*',
63+
'node_modules/remark-gfm/**/*',
64+
'node_modules/remark-mdx-images/**/*',
65+
'node_modules/unified/**/*',
66+
'node_modules/rollup/**/*',
2767
],
2868
'/platform-redirect': [
2969
'**/*.gif',
@@ -38,7 +78,6 @@ const outputFileTracingExcludes = process.env.NEXT_PUBLIC_DEVELOPER_DOCS
3878
'public/og-images/**/*',
3979
],
4080
'sitemap.xml': [
41-
'docs/**/*',
4281
'public/mdx-images/**/*',
4382
'public/og-images/**/*',
4483
'**/*.gif',
@@ -57,7 +96,22 @@ if (process.env.NODE_ENV !== 'development' && !process.env.NEXT_PUBLIC_SENTRY_DS
5796
const nextConfig = {
5897
pageExtensions: ['js', 'jsx', 'mdx', 'ts', 'tsx', 'mdx'],
5998
trailingSlash: true,
60-
serverExternalPackages: ['rehype-preset-minify'],
99+
serverExternalPackages: [
100+
'rehype-preset-minify',
101+
'esbuild',
102+
'@esbuild/darwin-arm64',
103+
'@esbuild/darwin-x64',
104+
'@esbuild/linux-arm64',
105+
'@esbuild/linux-x64',
106+
'@esbuild/win32-x64',
107+
'mdx-bundler',
108+
'sharp',
109+
'@aws-sdk/client-s3',
110+
'@google-cloud/storage',
111+
'prettier',
112+
'@prettier/plugin-xml',
113+
'mermaid',
114+
],
61115
outputFileTracingExcludes,
62116
images: {
63117
contentDispositionType: 'inline', // "open image in new tab" instead of downloading

scripts/lint-404s/README.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# 404 Link Checker
2+
3+
This script checks all documentation pages for broken internal links (404s).
4+
5+
## Usage
6+
7+
```bash
8+
# Basic usage (with deduplication - recommended)
9+
bun ./scripts/lint-404s/main.ts
10+
11+
# Show progress for each page
12+
bun ./scripts/lint-404s/main.ts --progress
13+
14+
# Skip deduplication and check all pages (for debugging)
15+
bun ./scripts/lint-404s/main.ts --skip-deduplication
16+
17+
# Filter to a specific path
18+
bun ./scripts/lint-404s/main.ts --path platforms/javascript
19+
```
20+
21+
## Deduplication
22+
23+
By default, the checker **deduplicates common files** to improve performance.
24+
25+
### Why?
26+
27+
The Sentry docs use a "common" file system where documentation is shared across multiple platforms. For example:
28+
29+
- `/platforms/apple/common/configuration/index.mdx` is rendered as:
30+
- `/platforms/apple/guides/ios/configuration/`
31+
- `/platforms/apple/guides/macos/configuration/`
32+
- `/platforms/apple/guides/watchos/configuration/`
33+
- ... and many more
34+
35+
Without deduplication, the checker would fetch and test the same content dozens of times, which:
36+
37+
- Takes much longer to run
38+
- Wastes CI resources
39+
- Provides no additional value (the content is identical)
40+
41+
### How it works
42+
43+
1. The checker fetches a source map from `/api/source-map` that maps each slug to its source file
44+
2. It tracks which source files have been checked
45+
3. When several pages share the same source file, it only checks the first one it encounters (in sitemap order)
46+
4. **API-generated pages** are always checked (they have no source file)
47+
48+
This typically reduces the number of pages checked from **~9,000 to ~2,500**, a **72% reduction**.
49+
50+
### When to use `--skip-deduplication`
51+
52+
Use this flag to skip deduplication and verify that all rendered pages work correctly, even if they share the same source. This is rarely necessary but can help debug issues with:
53+
54+
- Path routing
55+
- Platform-specific rendering bugs
56+
- Edge cases in the build system
57+
58+
## Ignore List
59+
60+
The `ignore-list.txt` file contains paths that should be skipped during checking. Add paths here (one per line) if they're known to be inaccessible or are special cases.
61+
62+
## Exit Codes
63+
64+
- `0` - No 404s found
65+
- `1` - 404s were detected

scripts/lint-404s/main.ts

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ const trimSlashes = (s: string) => s.replace(/(^\/|\/$)/g, '');
1313
const ignoreListFile = path.join(dirname(import.meta.url), './ignore-list.txt');
1414

1515
const showProgress = process.argv.includes('--progress');
16+
const deduplicatePages = !process.argv.includes('--skip-deduplication');
1617

1718
// Get the path filter if specified
1819
const pathFilterIndex = process.argv.indexOf('--path');
@@ -35,22 +36,74 @@ async function fetchWithFollow(url: URL | string): Promise<Response> {
3536
return r;
3637
}
3738

39+
/**
 * Reduces a list of slugs so that only one page per unique source file is checked.
 *
 * Fetches the slug -> source-file mapping from `${baseURL}api/source-map` and keeps
 * the first slug seen for each source file. Slugs that have no source file
 * (absent from the map, or mapped to null / an empty or non-string value) are
 * always kept, because there is nothing to deduplicate them against.
 *
 * @param allSlugs - Slugs to consider, in the order they should be checked.
 * @returns `slugsToCheck` (input order preserved) and `skippedCount`, the number of
 *   slugs dropped as duplicates. If the source map cannot be fetched or parsed, a
 *   warning is logged and every slug is returned with `skippedCount` 0.
 */
async function deduplicateSlugs(
  allSlugs: string[]
): Promise<{skippedCount: number; slugsToCheck: string[]}> {
  try {
    const response = await fetch(`${baseURL}api/source-map`);
    // An error page with a JSON body must not be mistaken for the source map.
    if (!response.ok) {
      throw new Error(`source map request failed with HTTP ${response.status}`);
    }

    const parsed: unknown = await response.json();
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      throw new Error('source map response is not a JSON object');
    }
    const sourceMap = parsed as Record<string, unknown>;

    const checkedSources = new Set<string>();
    const slugsToCheck: string[] = [];
    let skippedCount = 0;

    for (const slug of allSlugs) {
      // Use same normalization as route.ts (remove leading and trailing slashes)
      const normalizedSlug = slug.replace(/(^\/|\/$)/g, '');
      // Own-property lookup only, so slugs such as "constructor" never resolve
      // to members inherited from Object.prototype.
      const sourcePath = Object.prototype.hasOwnProperty.call(sourceMap, normalizedSlug)
        ? sourceMap[normalizedSlug]
        : null;

      // Always check pages without a source file (e.g. API-generated pages)
      if (typeof sourcePath !== 'string' || sourcePath === '') {
        slugsToCheck.push(slug);
        continue;
      }

      // Skip if we've already checked this source file
      if (checkedSources.has(sourcePath)) {
        skippedCount++;
        continue;
      }

      // First time seeing this source file
      checkedSources.add(sourcePath);
      slugsToCheck.push(slug);
    }

    return {skippedCount, slugsToCheck};
  } catch (error: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    console.warn('⚠️ Failed to fetch source map:', message);
    console.warn('Falling back to checking all pages...\n');
    return {skippedCount: 0, slugsToCheck: allSlugs};
  }
}
80+
3881
async function main() {
3982
const sitemap = await fetch(`${baseURL}sitemap.xml`).then(r => r.text());
4083

41-
const slugs = [...sitemap.matchAll(/<loc>([^<]*)<\/loc>/g)]
84+
const allSlugs = [...sitemap.matchAll(/<loc>([^<]*)<\/loc>/g)]
4285
.map(l => l[1])
4386
.map(url => trimSlashes(new URL(url).pathname))
4487
.filter(Boolean)
4588
.filter(slug => (pathFilter ? slug.startsWith(pathFilter) : true));
46-
const allSlugsSet = new Set(slugs);
47-
48-
if (pathFilter) {
49-
console.log('Checking 404s on %d pages in /%s', slugs.length, pathFilter);
50-
} else {
51-
console.log('Checking 404s on %d pages', slugs.length);
89+
const allSlugsSet = new Set(allSlugs);
90+
91+
// Deduplicate pages with same source file (default behavior)
92+
const {skippedCount, slugsToCheck} = deduplicatePages
93+
? await deduplicateSlugs(allSlugs)
94+
: {skippedCount: 0, slugsToCheck: allSlugs};
95+
96+
if (skippedCount > 0) {
97+
console.log(
98+
'Deduplication: checking %d unique pages (skipped %d duplicates)\n',
99+
slugsToCheck.length,
100+
skippedCount
101+
);
52102
}
53103

104+
const pathInfo = pathFilter ? ` in /${pathFilter}` : '';
105+
console.log('Checking 404s on %d pages%s', slugsToCheck.length, pathInfo);
106+
54107
const all404s: {page404s: Link[]; slug: string}[] = [];
55108

56109
// check if the slug equivalent of the href is in the sitemap
@@ -100,7 +153,7 @@ async function main() {
100153
return false;
101154
}
102155

103-
for (const slug of slugs) {
156+
for (const slug of slugsToCheck) {
104157
const pageUrl = new URL(slug, baseURL);
105158
const now = performance.now();
106159
const html = await fetchWithFollow(pageUrl.href).then(r => r.text());
@@ -134,7 +187,7 @@ async function main() {
134187
}
135188

136189
if (all404s.length === 0) {
137-
console.log('\n\n🎉 No 404s found');
190+
console.log('\n🎉 No 404s found');
138191
return false;
139192
}
140193
const numberOf404s = all404s.map(x => x.page404s.length).reduce((a, b) => a + b, 0);

0 commit comments

Comments
 (0)