diff --git a/.prettierrc b/.prettierrc new file mode 100644 index 0000000..661f253 --- /dev/null +++ b/.prettierrc @@ -0,0 +1,3 @@ +{ + "useTabs": true +} diff --git a/README.md b/README.md index 8e61f65..fa13efe 100644 --- a/README.md +++ b/README.md @@ -276,6 +276,53 @@ The parser supports standard robots.txt pattern syntax: **Priority**: When both Allow and Disallow match, the longer pattern wins. +## Production Usage + +This library is designed for correctness and RFC 9309 compliance. When using it in production environments that fetch robots.txt from untrusted sources, consider these safeguards: + +### File Size Limits + +The library does not enforce file size limits. Both RFC 9309 and Google require parsing at least 500 KiB. Implement size checks before parsing: + +```typescript +const MAX_ROBOTS_SIZE = 500 * 1024; // 500 KiB (per RFC 9309) + +async function fetchAndParse(url: string) { + const response = await fetch(url); + const contentLength = response.headers.get('content-length'); + + if (contentLength && parseInt(contentLength) > MAX_ROBOTS_SIZE) { + throw new Error('robots.txt too large'); + } + + const text = await response.text(); + if (text.length > MAX_ROBOTS_SIZE) { + throw new Error('robots.txt too large'); + } + + return ParsedRobots.parse(text); +} +``` + +### Timeouts + +Implement timeouts when fetching robots.txt to prevent hanging requests. + +## Google-Specific Behaviors + +This library is a port of Google's C++ parser and includes several behaviors that are Google-specific extensions beyond RFC 9309: + +| Behavior | Google | RFC 9309 | +|----------|--------|----------| +| **Line length limit** | Truncates at 16,664 bytes | No limit specified | +| **Typo tolerance** | Accepts "disalow", "useragent", etc. | "MAY be lenient" (unspecified) | +| **index.html normalization** | `Allow: /path/index.html` also allows `/path/` | Not specified | +| **User-agent `*` with trailing text** | `* foo` treated as global agent | Not specified | + +The core matching behavior (longest-match-wins, case-insensitive user-agent matching, UTF-8 encoding) follows RFC 9309. + +**Note:** This library only handles parsing and matching. HTTP behaviors like redirect following, caching, and status code handling are your responsibility to implement. + ## Project Structure ``` diff --git a/TESTS.md b/TESTS.md index e9a89aa..fb29e2b 100644 --- a/TESTS.md +++ b/TESTS.md @@ -20,9 +20,9 @@ This document provides comprehensive documentation of all tests in Google's robo | Metric | Count | | ---------------- | ----------------- | -| Total Test Files | 4 | -| Total Test Cases | 196 | -| Total Assertions | 476 | +| Total Test Files | 5 | +| Total Test Cases | 206 | +| Total Assertions | 495 | | Coverage | 100% of C++ tests | ## Test Naming Conventions @@ -46,6 +46,7 @@ This document provides comprehensive documentation of all tests in Google's robo 2. **tests/reporter.test.ts** - Reporting/parsing metadata tests (6 test cases) 3. **tests/url-utils.test.ts** - URL utility function tests (22 test cases) 4. **tests/bulk-check.test.ts** - Bulk URL checking API tests (23 test cases) +5. **tests/stress.test.ts** - Performance and stress tests (10 test cases) --- @@ -706,6 +707,84 @@ Test 2 - ParsedRobots reuse vs repeated parsing: 3. Batch check (single parse) i --- +## Category F: Stress Tests (TypeScript Extension) + +These tests validate the library's performance and stability under extreme conditions. 
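
+
+Each stress test follows the same shape: build a large or pathological input, time the operation with `performance.now()`, and assert an upper bound on the elapsed time. The snippet below is a condensed, illustrative sketch of that pattern only (it is not one of the 10 counted test cases, and the rule count and 5-second bound are placeholders); see `tests/stress.test.ts` for the actual tests:
+
+```typescript
+import { test, expect } from "bun:test";
+import { ParsedRobots } from "../src";
+
+// Illustrative only: build a large input, time the parse, assert an upper bound.
+test("stress-test shape (illustrative)", () => {
+  const content = "User-agent: *\n" + "Disallow: /path/\n".repeat(50_000);
+
+  const start = performance.now();
+  const parsed = ParsedRobots.parse(content);
+  const elapsed = performance.now() - start;
+
+  expect(parsed).toBeDefined();
+  expect(elapsed).toBeLessThan(5000); // generous bound, in the spirit of the real tests
+});
+```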

+
+### StressTest_LargeFileHandling (stress.test.ts:18-63)
+
+**Purpose**: Tests parsing of large robots.txt files.
+
+**Assertions (6 total)**:
+
+Test 1 - 1MB robots.txt:
+1. Parser completes without crashing → expects TRUE
+2. Completes within 5 seconds → expects TRUE
+
+Test 2 - 100K lines:
+3. Parser handles 100,000 Disallow rules → expects TRUE
+4. Completes within 5 seconds → expects TRUE
+
+Test 3 - Many user-agent groups:
+5. Parser handles 1,000 separate user-agent groups → expects TRUE
+6. Completes within 1 second → expects TRUE
+
+**Edge Cases**: Memory efficiency, parsing speed with large inputs
+
+---
+
+### StressTest_PathologicalPatterns (stress.test.ts:65-124)
+
+**Purpose**: Tests pattern matching with complex wildcard patterns.
+
+**Assertions (6 total)**:
+
+Test 1 - Many wildcards:
+1. Pattern `/a*b*c*d*e*f*g*h*i*j*` matches within 100ms → expects TRUE
+2. Matching URL is disallowed → expects FALSE
+
+Test 2 - Deeply nested wildcards:
+3. Pattern with 16 wildcard segments matches within 100ms → expects TRUE
+4. Matching URL is disallowed → expects FALSE
+
+Test 3 - Many rules with same prefix:
+5. Lookup against 10,000 rules starting with `/api/v1/users/` completes within 100ms → expects TRUE
+6. Result reports the URL as disallowed → expects FALSE
+
+**Edge Cases**: Avoids exponential backtracking in pattern matching
+
+---
+
+### StressTest_BulkURLCheckingPerformance (stress.test.ts:126-146)
+
+**Purpose**: Tests bulk URL checking at scale.
+
+**Assertions (2 total)**:
+
+Test 1 - 10K URLs:
+1. 10,000 URLs processed → expects 10,000 results
+2. Completes under 1 second → expects TRUE
+
+**Edge Cases**: Linear scaling with URL count
+
+---
+
+### StressTest_EdgeCases (stress.test.ts:148-188)
+
+**Purpose**: Tests graceful handling of edge cases.
+
+**Assertions (5 total)**:
+
+Test 1 - Empty robots.txt:
+1. Returns allowed (true) → expects TRUE
+
+Test 2 - Comments only:
+2. Returns allowed (true) → expects TRUE
+
+Test 3 - Malformed URLs:
+3. Empty URL doesn't throw → expects no exception
+4. Invalid URL doesn't throw → expects no exception
+5. 
Missing scheme URL doesn't throw → expects no exception + +**Edge Cases**: Graceful degradation with invalid input + +--- + ## Helper Classes ### RobotsStatsReporter (robots_test.cc:765-819) @@ -869,7 +948,7 @@ The TypeScript port has been verified to provide **100% test coverage** of all C bun test # Expected output: -# 173 pass +# 206 pass # 0 fail -# 420 expect() calls +# 495 expect() calls ``` diff --git a/bun.lock b/bun.lock index 593e4d3..8403ed1 100644 --- a/bun.lock +++ b/bun.lock @@ -6,6 +6,7 @@ "name": "robotstxt-ts-port", "devDependencies": { "@types/bun": "latest", + "prettier": "^3.7.4", "typescript": "^5.0.0", }, }, @@ -17,6 +18,8 @@ "bun-types": ["bun-types@1.3.3", "", { "dependencies": { "@types/node": "*" } }, "sha512-z3Xwlg7j2l9JY27x5Qn3Wlyos8YAp0kKRlrePAOjgjMGS5IG6E7Jnlx736vH9UVI4wUICwwhC9anYL++XeOgTQ=="], + "prettier": ["prettier@3.7.4", "", { "bin": { "prettier": "bin/prettier.cjs" } }, "sha512-v6UNi1+3hSlVvv8fSaoUbggEM5VErKmmpGA7Pl3HF8V6uKY7rvClBOJlH6yNwQtfTueNkGVpOv/mtWL9L4bgRA=="], + "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], "undici-types": ["undici-types@7.16.0", "", {}, "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw=="], diff --git a/index.ts b/index.ts deleted file mode 100644 index f67b2c6..0000000 --- a/index.ts +++ /dev/null @@ -1 +0,0 @@ -console.log("Hello via Bun!"); \ No newline at end of file diff --git a/package.json b/package.json index c81b964..c4a0fd7 100644 --- a/package.json +++ b/package.json @@ -1,52 +1,54 @@ { - "name": "@trybyte/robotstxt-parser", - "version": "1.0.0", - "description": "Google's robots.txt parser ported to TypeScript - RFC 9309 compliant", - "keywords": [ - "robots.txt", - "robots", - "parser", - "crawler", - "seo", - "geo", - "google", - "rfc9309" - ], - "homepage": "https://github.com/trybyte-app/robotstxt-ts-port#readme", - "bugs": { - "url": "https://github.com/trybyte-app/robotstxt-ts-port/issues" - }, - "repository": { - "type": "git", - "url": "git+https://github.com/trybyte-app/robotstxt-ts-port.git" - }, - "license": "Apache-2.0", - "author": "Alireza Esmikhani", - "type": "module", - "exports": { - ".": { - "types": "./dist/index.d.ts", - "import": "./dist/index.js" - } - }, - "main": "./dist/index.js", - "types": "./dist/index.d.ts", - "directories": { - "test": "tests" - }, - "files": [ - "dist" - ], - "scripts": { - "build": "tsc", - "test": "bun test", - "prepublishOnly": "bun run build" - }, - "devDependencies": { - "@types/bun": "latest", - "typescript": "^5.0.0" - }, - "engines": { - "node": ">=20.0.0" - } + "name": "@trybyte/robotstxt-parser", + "version": "1.0.0", + "description": "Google's robots.txt parser ported to TypeScript - RFC 9309 compliant", + "keywords": [ + "robots.txt", + "robots", + "parser", + "crawler", + "seo", + "geo", + "google", + "rfc9309" + ], + "homepage": "https://github.com/trybyte-app/robotstxt-ts-port#readme", + "bugs": { + "url": "https://github.com/trybyte-app/robotstxt-ts-port/issues" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/trybyte-app/robotstxt-ts-port.git" + }, + "license": "Apache-2.0", + "author": "Byte Team (trybyte.app)", + "type": "module", + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "types": "./dist/index.d.ts", + "directories": { + "test": "tests" + }, + 
"files": [ + "dist" + ], + "scripts": { + "build": "tsc", + "test": "bun test", + "prepublishOnly": "bun run build", + "format": "prettier \"**/*.{js,jsx,mjs,ts,tsx,json,jsonc}\" --write" + }, + "devDependencies": { + "@types/bun": "latest", + "prettier": "^3.7.4", + "typescript": "^5.0.0" + }, + "engines": { + "node": ">=20.0.0" + } } diff --git a/src/matcher.ts b/src/matcher.ts index 80d8a92..e3cb1ba 100644 --- a/src/matcher.ts +++ b/src/matcher.ts @@ -184,6 +184,14 @@ export class RobotsMatcher extends RobotsParseHandler { /** * Returns true iff 'url' is allowed to be fetched by any member of the * "userAgents" array. 'url' must be %-encoded according to RFC3986. + * + * Invalid or malformed URLs are handled gracefully - if the path cannot be + * extracted, it defaults to "/" which typically allows access. + * + * @param robotsBody - The robots.txt content to parse + * @param userAgents - Array of user-agent strings to check + * @param url - The URL to check (should be %-encoded per RFC3986) + * @returns true if access is allowed, false if disallowed */ public allowedByRobots( robotsBody: string, @@ -201,6 +209,14 @@ export class RobotsMatcher extends RobotsParseHandler { /** * Do robots check for 'url' when there is only one user agent. 'url' must * be %-encoded according to RFC3986. + * + * Invalid or malformed URLs are handled gracefully - if the path cannot be + * extracted, it defaults to "/" which typically allows access. + * + * @param robotsTxt - The robots.txt content to parse + * @param userAgent - The user-agent string to check + * @param url - The URL to check (should be %-encoded per RFC3986) + * @returns true if access is allowed, false if disallowed */ public oneAgentAllowedByRobots( robotsTxt: string, diff --git a/src/parsed-robots.ts b/src/parsed-robots.ts index f3162c7..f8f576e 100644 --- a/src/parsed-robots.ts +++ b/src/parsed-robots.ts @@ -253,8 +253,12 @@ export class ParsedRobots { * Check multiple URLs for a single user-agent. * This is the fast operation - O(urls * rules) with no parsing overhead. * + * Invalid or malformed URLs are handled gracefully - if the path cannot be + * extracted, it defaults to "/" which typically allows access. No exceptions + * are thrown for invalid input. + * * @param userAgent - The user-agent to check (e.g., 'Googlebot', 'Googlebot/2.1') - * @param urls - Array of URLs to check (must be %-encoded per RFC3986) + * @param urls - Array of URLs to check (should be %-encoded per RFC3986) * @returns Array of results in the same order as input URLs */ public checkUrls(userAgent: string, urls: string[]): UrlCheckResult[] { @@ -274,8 +278,11 @@ export class ParsedRobots { /** * Check a single URL (convenience method). * + * Invalid or malformed URLs are handled gracefully - if the path cannot be + * extracted, it defaults to "/" which typically allows access. + * * @param userAgent - The user-agent to check - * @param url - The URL to check (must be %-encoded per RFC3986) + * @param url - The URL to check (should be %-encoded per RFC3986) * @returns Result with detailed match information */ public checkUrl(userAgent: string, url: string): UrlCheckResult { diff --git a/tests/stress.test.ts b/tests/stress.test.ts new file mode 100644 index 0000000..9a42f62 --- /dev/null +++ b/tests/stress.test.ts @@ -0,0 +1,192 @@ +// Copyright 2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +import { describe, test, expect } from "bun:test"; +import { RobotsMatcher, ParsedRobots } from "../src"; + +describe("Stress Tests", () => { + describe("Large File Handling", () => { + test("handles 1MB robots.txt without crashing", () => { + // Generate ~1MB of valid robots.txt content + let content = "User-agent: *\n"; + const rule = "Disallow: /path/to/some/resource/\n"; + while (content.length < 1_000_000) { + content += rule; + } + + const start = performance.now(); + const parsed = ParsedRobots.parse(content); + const elapsed = performance.now() - start; + + expect(parsed).toBeDefined(); + // Should complete within 5 seconds on any reasonable hardware + expect(elapsed).toBeLessThan(5000); + }); + + test("handles 100K lines efficiently", () => { + const lines: string[] = ["User-agent: *"]; + for (let i = 0; i < 100_000; i++) { + lines.push(`Disallow: /path${i}/`); + } + + const start = performance.now(); + const parsed = ParsedRobots.parse(lines.join("\n")); + const elapsed = performance.now() - start; + + expect(parsed).toBeDefined(); + expect(elapsed).toBeLessThan(5000); + }); + + test("handles many user-agent groups", () => { + const lines: string[] = []; + for (let i = 0; i < 1000; i++) { + lines.push(`User-agent: Bot${i}`); + lines.push(`Disallow: /private${i}/`); + lines.push(""); + } + + const start = performance.now(); + const parsed = ParsedRobots.parse(lines.join("\n")); + const elapsed = performance.now() - start; + + expect(parsed).toBeDefined(); + expect(elapsed).toBeLessThan(1000); + }); + }); + + describe("Pathological Patterns", () => { + test("handles many wildcards in pattern", () => { + const pattern = "/a*b*c*d*e*f*g*h*i*j*"; + const robotsTxt = `User-agent: *\nDisallow: ${pattern}`; + const url = "https://example.com/aXbXcXdXeXfXgXhXiXjX"; + + const start = performance.now(); + const result = new RobotsMatcher().oneAgentAllowedByRobots( + robotsTxt, + "bot", + url, + ); + const elapsed = performance.now() - start; + + // Single match should be fast (< 100ms) + expect(elapsed).toBeLessThan(100); + expect(result).toBe(false); // Should be disallowed + }); + + test("handles deeply nested wildcard patterns", () => { + // Pattern with alternating wildcards and literals + const pattern = "/*a*b*c*d*e*f*g*h*i*j*k*l*m*n*o*p*"; + const robotsTxt = `User-agent: *\nDisallow: ${pattern}`; + const url = "https://example.com/XaXbXcXdXeXfXgXhXiXjXkXlXmXnXoXp"; + + const start = performance.now(); + const result = new RobotsMatcher().oneAgentAllowedByRobots( + robotsTxt, + "bot", + url, + ); + const elapsed = performance.now() - start; + + expect(elapsed).toBeLessThan(100); + expect(result).toBe(false); + }); + + test("handles many rules with same prefix", () => { + const lines: string[] = ["User-agent: *"]; + // Many rules starting with the same prefix + for (let i = 0; i < 10000; i++) { + lines.push(`Disallow: /api/v1/users/${i}`); + } + + const robotsTxt = lines.join("\n"); + const parsed = ParsedRobots.parse(robotsTxt); + + const start = performance.now(); + const result = parsed.checkUrl( + "bot", + "https://example.com/api/v1/users/5000", + ); + const 
elapsed = performance.now() - start; + + expect(elapsed).toBeLessThan(100); + expect(result.allowed).toBe(false); + }); + }); + + describe("Bulk URL Checking Performance", () => { + test("checks 10K URLs efficiently with ParsedRobots", () => { + const robotsTxt = ` +User-agent: * +Disallow: /private/ +Disallow: /admin/ +Allow: /public/ +`; + const parsed = ParsedRobots.parse(robotsTxt); + + const urls: string[] = []; + for (let i = 0; i < 10_000; i++) { + urls.push(`https://example.com/page${i}`); + } + + const start = performance.now(); + const results = parsed.checkUrls("Googlebot", urls); + const elapsed = performance.now() - start; + + expect(results.length).toBe(10_000); + // Should complete well under 1 second + expect(elapsed).toBeLessThan(1000); + }); + }); + + describe("Edge Cases", () => { + test("handles empty robots.txt", () => { + const result = new RobotsMatcher().oneAgentAllowedByRobots( + "", + "bot", + "https://example.com/page", + ); + expect(result).toBe(true); // Empty = allow all + }); + + test("handles robots.txt with only comments", () => { + const robotsTxt = ` +# This is a comment +# Another comment +# No actual rules +`; + const result = new RobotsMatcher().oneAgentAllowedByRobots( + robotsTxt, + "bot", + "https://example.com/page", + ); + expect(result).toBe(true); // No rules = allow all + }); + + test("handles malformed URLs gracefully", () => { + const robotsTxt = `User-agent: *\nDisallow: /`; + + // These should not throw + const matcher = new RobotsMatcher(); + expect(() => + matcher.oneAgentAllowedByRobots(robotsTxt, "bot", ""), + ).not.toThrow(); + expect(() => + matcher.oneAgentAllowedByRobots(robotsTxt, "bot", "not-a-url"), + ).not.toThrow(); + expect(() => + matcher.oneAgentAllowedByRobots(robotsTxt, "bot", "://missing-scheme"), + ).not.toThrow(); + }); + }); +});