From d3f433cfeca82f91df1a88d77400daa541ee0af5 Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:21:35 -0800 Subject: [PATCH 01/11] Added RSC parsing and reconstruction to rewrite Next.js links correctly --- crates/common/src/integrations/mod.rs | 7 +- crates/common/src/integrations/nextjs.rs | 2590 ++++++++++++++++++-- crates/common/src/integrations/registry.rs | 44 + crates/common/src/lib.rs | 1 + crates/common/src/publisher.rs | 138 +- crates/common/src/rsc_flight.rs | 359 +++ crates/common/src/settings.rs | 4 +- crates/common/src/streaming_processor.rs | 131 + docs/RSC_HYDRATION_FINDINGS.md | 768 ++++++ 9 files changed, 3816 insertions(+), 226 deletions(-) create mode 100644 crates/common/src/rsc_flight.rs create mode 100644 docs/RSC_HYDRATION_FINDINGS.md diff --git a/crates/common/src/integrations/mod.rs b/crates/common/src/integrations/mod.rs index 888fa5a..1afda3d 100644 --- a/crates/common/src/integrations/mod.rs +++ b/crates/common/src/integrations/mod.rs @@ -10,9 +10,10 @@ pub mod testlight; pub use registry::{ AttributeRewriteAction, AttributeRewriteOutcome, IntegrationAttributeContext, - IntegrationAttributeRewriter, IntegrationEndpoint, IntegrationMetadata, IntegrationProxy, - IntegrationRegistration, IntegrationRegistrationBuilder, IntegrationRegistry, - IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, + IntegrationAttributeRewriter, IntegrationEndpoint, IntegrationHtmlContext, + IntegrationHtmlPostProcessor, IntegrationMetadata, IntegrationProxy, IntegrationRegistration, + IntegrationRegistrationBuilder, IntegrationRegistry, IntegrationScriptContext, + IntegrationScriptRewriter, ScriptRewriteAction, }; type IntegrationBuilder = fn(&Settings) -> Option; diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs index ad33302..5527022 100644 --- a/crates/common/src/integrations/nextjs.rs +++ b/crates/common/src/integrations/nextjs.rs @@ -5,8 +5,8 @@ use serde::{Deserialize, Serialize}; use validator::Validate; use crate::integrations::{ - IntegrationRegistration, IntegrationScriptContext, IntegrationScriptRewriter, - ScriptRewriteAction, + IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationRegistration, + IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; use crate::settings::{IntegrationConfig, Settings}; @@ -39,24 +39,102 @@ fn default_rewrite_attributes() -> Vec { } pub fn register(settings: &Settings) -> Option { - let config = build(settings)?; + let config = match build(settings) { + Some(config) => { + log::info!( + "NextJS integration registered: enabled={}, rewrite_attributes={:?}", + config.enabled, + config.rewrite_attributes + ); + config + } + None => { + log::info!("NextJS integration not registered (disabled or missing config)"); + return None; + } + }; + + // Register both structured (Pages Router __NEXT_DATA__) and streamed (App Router RSC) + // rewriters. RSC payloads require LENGTH-PRESERVING URL replacement to avoid breaking + // React hydration - the RSC format uses byte positions for record boundaries. let structured = Arc::new(NextJsScriptRewriter::new( - Arc::clone(&config), + config.clone(), NextJsRewriteMode::Structured, )); + let streamed = Arc::new(NextJsScriptRewriter::new( - config, + config.clone(), NextJsRewriteMode::Streamed, )); + // Register post-processor for cross-script RSC T-chunks + let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); + Some( IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) .with_script_rewriter(structured) .with_script_rewriter(streamed) + .with_html_post_processor(post_processor) .build(), ) } +/// Post-processor for handling cross-script RSC T-chunks. +struct NextJsHtmlPostProcessor { + config: Arc, +} + +impl NextJsHtmlPostProcessor { + fn new(config: Arc) -> Self { + Self { config } + } +} + +impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { + fn integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String { + log::info!( + "NextJs post-processor called: enabled={}, rewrite_attributes={:?}, html_len={}, origin={}, proxy={}://{}", + self.config.enabled, + self.config.rewrite_attributes, + html.len(), + ctx.origin_host, + ctx.request_scheme, + ctx.request_host + ); + + if !self.config.enabled || self.config.rewrite_attributes.is_empty() { + log::info!("NextJs post-processor skipped (disabled or no attributes)"); + return html.to_string(); + } + + // Count origin URLs before + let origin_before = html.matches(ctx.origin_host).count(); + log::info!( + "NextJs post-processor: {} origin URLs before rewrite", + origin_before + ); + + let result = + post_process_rsc_html(html, ctx.origin_host, ctx.request_host, ctx.request_scheme); + + // Count after + let origin_after = result.matches(ctx.origin_host).count(); + let proxy_after = result.matches(ctx.request_host).count(); + log::info!( + "NextJs post-processor complete: input_len={}, output_len={}, origin_remaining={}, proxy_urls={}", + html.len(), + result.len(), + origin_after, + proxy_after + ); + result + } +} + fn build(settings: &Settings) -> Option> { let config = settings .integration_config::(NEXTJS_INTEGRATION_ID) @@ -81,23 +159,116 @@ impl NextJsScriptRewriter { Self { config, mode } } - fn rewrite_values( + fn rewrite_structured( &self, content: &str, ctx: &IntegrationScriptContext<'_>, ) -> ScriptRewriteAction { + // For structured mode (__NEXT_DATA__), use simple URL replacement if let Some(rewritten) = rewrite_nextjs_values( content, ctx.origin_host, ctx.request_host, ctx.request_scheme, &self.config.rewrite_attributes, + false, // No length preservation needed for structured data ) { ScriptRewriteAction::replace(rewritten) } else { ScriptRewriteAction::keep() } } + + fn rewrite_streamed( + &self, + content: &str, + ctx: &IntegrationScriptContext<'_>, + ) -> ScriptRewriteAction { + // For streamed RSC payloads, we need T-chunk aware rewriting. + // This handles the case where T-chunk lengths need to be recalculated + // after URL rewriting. + // + // Try to extract the RSC payload from self.__next_f.push([1, '...']) + if let Some((payload, quote, start, end)) = extract_rsc_push_payload(content) { + let rewritten_payload = rewrite_rsc_tchunks( + payload, + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + ); + + if rewritten_payload != payload { + // Reconstruct the script with rewritten payload + let mut result = String::with_capacity(content.len()); + result.push_str(&content[..start]); + result.push(quote); + result.push_str(&rewritten_payload); + result.push(quote); + result.push_str(&content[end + 1..]); + return ScriptRewriteAction::replace(result); + } + } + + // Fallback: use simple URL rewriting for the entire content + // This handles non-standard RSC formats or other script patterns + let rewritten = rewrite_rsc_url_string( + content, + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + ); + + if rewritten != content { + return ScriptRewriteAction::replace(rewritten); + } + + ScriptRewriteAction::keep() + } +} + +/// Extract RSC payload from a self.__next_f.push([1, '...']) call +/// Returns (payload_content, quote_char, start_pos, end_pos) +/// Handles various whitespace patterns in the push call. +fn extract_rsc_push_payload(content: &str) -> Option<(&str, char, usize, usize)> { + // Match pattern: self.__next_f.push([ followed by whitespace, then 1, then whitespace, then quote + // Use regex to be more flexible with whitespace + let pattern = Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).ok()?; + + let cap = pattern.captures(content)?; + let quote_match = cap.get(1)?; + let quote = quote_match.as_str().chars().next()?; + let content_start = quote_match.end(); + + // Find matching closing quote + let search_from = &content[content_start..]; + let mut pos = 0; + let mut escape = false; + + for c in search_from.chars() { + if escape { + escape = false; + pos += c.len_utf8(); + continue; + } + if c == '\\' { + escape = true; + pos += 1; + continue; + } + if c == quote { + // Found closing quote + let content_end = content_start + pos; + return Some(( + &content[content_start..content_end], + quote, + content_start - 1, // Include opening quote position + content_end, // Position of closing quote + )); + } + pos += c.len_utf8(); + } + + None } impl IntegrationScriptRewriter for NextJsScriptRewriter { @@ -118,12 +289,19 @@ impl IntegrationScriptRewriter for NextJsScriptRewriter { } match self.mode { - NextJsRewriteMode::Structured => self.rewrite_values(content, ctx), + NextJsRewriteMode::Structured => self.rewrite_structured(content, ctx), NextJsRewriteMode::Streamed => { - if !content.contains("self.__next_f") { + // RSC push scripts (self.__next_f.push) are handled by the post-processor + // because T-chunks can span multiple scripts and require combined processing. + // Only handle non-RSC scripts here. + if content.contains("self.__next_f.push") { return ScriptRewriteAction::keep(); } - self.rewrite_values(content, ctx) + // For other __next_f scripts (like initialization), use simple URL rewriting + if content.contains("self.__next_f") { + return self.rewrite_streamed(content, ctx); + } + ScriptRewriteAction::keep() } } } @@ -135,254 +313,2226 @@ fn rewrite_nextjs_values( request_host: &str, request_scheme: &str, attributes: &[String], + preserve_length: bool, ) -> Option { if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { return None; } - let mut rewritten = content.to_string(); - let mut changed = false; - let escaped_origin = escape(origin_host); - let replacement_scheme = format!("{}://{}", request_scheme, request_host); - - for attribute in attributes { - let escaped_attr = escape(attribute); - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}"#, - attr = escaped_attr, - origin = escaped_origin, - ); - let regex = Regex::new(&pattern).expect("valid Next.js rewrite regex"); - let next_value = regex.replace_all(&rewritten, |caps: ®ex::Captures<'_>| { - let scheme = &caps["scheme"]; - let replacement = if scheme == "//" { - format!("//{}", request_host) - } else { - replacement_scheme.clone() - }; - format!("{}{}", &caps["prefix"], replacement) - }); - if next_value != rewritten { - changed = true; - rewritten = next_value.into_owned(); + // Build the rewriter context with regex patterns + // For RSC payloads (preserve_length=true), we must maintain exact byte positions + // to avoid breaking React hydration. + let rewriter = UrlRewriter::new( + origin_host, + request_host, + request_scheme, + attributes, + preserve_length, + ); + + // Use pure regex-based rewriting - no AST parsing needed + // The rewrite_embedded method handles all URL patterns with proper whitespace padding + rewriter.rewrite_embedded(content) +} + +/// Helper struct to hold URL rewriting configuration +struct UrlRewriter { + origin_host: String, + request_host: String, + request_scheme: String, + /// Regex patterns for embedded JSON in strings with URL scheme (e.g., \"href\":\"https://origin\") + embedded_patterns: Vec, + /// Regex patterns for bare hostname values (e.g., \"siteProductionDomain\":\"www.example.com\") + bare_host_patterns: Vec, + /// Whether to preserve URL length by padding (for RSC payloads) + preserve_length: bool, +} + +impl UrlRewriter { + fn new( + origin_host: &str, + request_host: &str, + request_scheme: &str, + attributes: &[String], + preserve_length: bool, + ) -> Self { + let escaped_origin = escape(origin_host); + + // Build patterns for embedded JSON strings with various escape levels + // Pattern 1: URLs with scheme (https://origin, http://origin, //origin) + // Also capture optional path and closing quote to add whitespace padding after + let embedded_patterns = attributes + .iter() + .map(|attr| { + let escaped_attr = escape(attr); + // Capture: prefix, scheme, path (optional), closing quote + let pattern = format!( + r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, + attr = escaped_attr, + origin = escaped_origin, + ); + Regex::new(&pattern).expect("valid Next.js rewrite regex") + }) + .collect(); + + // Pattern 2: Bare hostname without scheme (e.g., "siteProductionDomain":"www.example.com") + // This matches attribute:"hostname" where hostname is exactly the origin (no path) + let bare_host_patterns = attributes + .iter() + .map(|attr| { + let escaped_attr = escape(attr); + // Match attr":"origin" where origin is followed by end quote (no path) + let pattern = format!( + r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*"){origin}(?P\\*")"#, + attr = escaped_attr, + origin = escaped_origin, + ); + Regex::new(&pattern).expect("valid Next.js bare host rewrite regex") + }) + .collect(); + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + embedded_patterns, + bare_host_patterns, + preserve_length, } } - changed.then_some(rewritten) -} + /// Rewrite a URL value string, returning (new_url, padding) if modified. + /// The padding is whitespace to add after the closing quote to preserve byte positions. + /// Uses the request scheme (http/https) for the rewritten URL. + #[cfg(test)] + fn rewrite_url_value(&self, url: &str) -> Option<(String, String)> { + let original_len = url.len(); -#[cfg(test)] -mod tests { - use super::*; - use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; - use crate::integrations::{IntegrationRegistry, IntegrationScriptContext, ScriptRewriteAction}; - use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; - use crate::test_support::tests::create_test_settings; - use serde_json::json; - use std::io::Cursor; + // Check for https:// or http:// URLs + // Use the request scheme for the rewritten URL (e.g., http for localhost) + let new_url = if let Some(rest) = url.strip_prefix("https://") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )) + } else { + None + } + } else if let Some(rest) = url.strip_prefix("http://") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )) + } else { + None + } + } else if let Some(rest) = url.strip_prefix("//") { + // Protocol-relative URL - use request scheme + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )) + } else { + None + } + } else if url == self.origin_host { + // Bare hostname without scheme (e.g., "siteProductionDomain":"www.example.com") + Some(self.request_host.clone()) + } else if url.starts_with(&self.origin_host) { + // Hostname with path but no scheme (e.g., "www.example.com/path") + let path = &url[self.origin_host.len()..]; + Some(format!("{}{}", self.request_host, path)) + } else { + None + }; - fn test_config() -> Arc { - Arc::new(NextJsIntegrationConfig { - enabled: true, - rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + // Calculate whitespace padding if length preservation is enabled + new_url.map(|url| { + let padding = if self.preserve_length { + Self::calculate_padding(url.len(), original_len) + } else { + String::new() + }; + (url, padding) }) } - fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { - IntegrationScriptContext { - selector, - request_host: "ts.example.com", - request_scheme: "https", - origin_host: "origin.example.com", + /// Calculate the whitespace padding needed after a URL replacement. + /// Returns empty string if no padding needed (URL is same length or longer). + /// + /// For RSC hydration, we add spaces AFTER the closing quote to preserve + /// byte positions in the JSON stream. This is preferred over URL path padding + /// because it keeps URLs clean and works for all URL types. + #[cfg(test)] + fn calculate_padding(new_url_len: usize, original_len: usize) -> String { + if new_url_len >= original_len { + String::new() + } else { + " ".repeat(original_len - new_url_len) } } - #[test] - fn structured_rewriter_updates_next_data_payload() { - let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); - let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); + /// Rewrite embedded JSON patterns in a string (for streamed payloads) + fn rewrite_embedded(&self, input: &str) -> Option { + let mut result = input.to_string(); + let mut changed = false; - match result { - ScriptRewriteAction::Replace(value) => { - assert!(value.contains(r#""href":"https://ts.example.com/reviews""#)); - assert!(value.contains(r#""href":"https://ts.example.com/sign-in""#)); - assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); - assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); + // First pass: URLs with scheme (https://, http://, //) + for regex in &self.embedded_patterns { + let origin_host = &self.origin_host; + let request_host = &self.request_host; + let request_scheme = &self.request_scheme; + let preserve_length = self.preserve_length; + + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let scheme = &caps["scheme"]; + let path = &caps["path"]; + let quote = &caps["quote"]; + + // Calculate original URL length (scheme + origin_host + path) + let original_url_len = scheme.len() + origin_host.len() + path.len(); + + // Build replacement URL using the request scheme (e.g., http for localhost) + let new_url = format!("{}://{}{}", request_scheme, request_host, path); + + // Calculate whitespace padding if needed + let padding = if preserve_length && new_url.len() < original_url_len { + " ".repeat(original_url_len - new_url.len()) + } else { + String::new() + }; + + // Return: prefix + new_url + quote + padding (spaces after closing quote) + format!("{}{}{}{}", prefix, new_url, quote, padding) + }); + if next_value != result { + changed = true; + result = next_value.into_owned(); } - _ => panic!("Expected rewrite to update payload"), } - } - #[test] - fn streamed_rewriter_only_runs_for_next_payloads() { - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); + // Second pass: Bare hostnames without scheme (e.g., "siteProductionDomain":"www.example.com") + for regex in &self.bare_host_patterns { + let origin_host = &self.origin_host; + let request_host = &self.request_host; + let preserve_length = self.preserve_length; - let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); - assert!(matches!(noop, ScriptRewriteAction::Keep)); + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let suffix = &caps["suffix"]; - let payload = r#"self.__next_f.push(["chunk", "{\"href\":\"https://origin.example.com/app\"}"]); - "#; - let rewritten = rewriter.rewrite(payload, &ctx("script")); - match rewritten { - ScriptRewriteAction::Replace(value) => { - assert!(value.contains(r#"https://ts.example.com/app"#)); + // Calculate padding for bare hostnames + let padding = if preserve_length && request_host.len() < origin_host.len() { + " ".repeat(origin_host.len() - request_host.len()) + } else { + String::new() + }; + + format!("{}{}{}{}", prefix, request_host, suffix, padding) + }); + if next_value != result { + changed = true; + result = next_value.into_owned(); } - _ => panic!("Expected streamed payload rewrite"), } + + changed.then_some(result) } +} - #[test] - fn rewrite_helper_handles_protocol_relative_urls() { - let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; - let rewritten = rewrite_nextjs_values( - content, - "origin.example.com", - "ts.example.com", - "https", - &["link".into()], - ) - .expect("should rewrite protocol relative link"); +// ============================================================================= +// RSC (React Server Components) T-Chunk Rewriter +// ============================================================================= +// +// Next.js App Router uses React Server Components (RSC) with a streaming flight +// protocol. RSC data is delivered via inline scripts calling `self.__next_f.push()`. +// +// ## RSC Flight Protocol Format +// +// RSC records are separated by `\n` (literal backslash-n in JS strings). +// Each record has format: `ID:DATA` where ID is a hex string (e.g., "1a", "443"). +// +// Record types include: +// - T-chunks (text): `ID:T,` - The most important for rewriting +// - JSON arrays: `ID:[...]` +// - JSON objects: `ID:{...}` +// - Module imports: `ID:I[...]` +// - Head links: `ID:HL[...]` +// - References: `ID:$ref` +// - Strings: `ID:"..."` +// - Null: `ID:null` +// +// ## T-Chunk Format Details +// +// T-chunks contain text data with an explicit byte length: +// ``` +// 1a:T29,{"url":"https://origin.example.com/path"} +// ``` +// - `1a` = chunk ID (hex) +// - `T` = text chunk marker +// - `29` = content length in hex (0x29 = 41 bytes UNESCAPED) +// - `,` = separator +// - Content follows, exactly 41 unescaped bytes +// +// The hex_length is the UNESCAPED byte count - escape sequences like `\n` count +// as 1 byte, `\uHHHH` counts as the UTF-8 byte length of the character, etc. +// +// ## Why T-Chunk Length Matters +// +// React's RSC parser uses byte offsets to navigate the stream. If we rewrite +// URLs without updating T-chunk lengths, the parser reads wrong byte ranges, +// corrupting the data and breaking hydration. +// +// Example: Changing `origin.example.com` (18 chars) to `proxy.io` (8 chars) +// shrinks content by 10 bytes. The T-chunk header must be updated from +// `T29,` to `T1f,` (41 -> 31 bytes). +// +// ## Cross-Script T-Chunks +// +// T-chunks CAN span multiple push scripts: +// - Script 10: `11:null\n1a:T928,` (header only, declares 928 bytes) +// - Script 11: `...actual content...` (the 928 bytes of content) +// +// Our per-script processing handles most cases correctly. For cross-script +// T-chunks, the header script won't have URLs to rewrite (just the header), +// and the content script will be rewritten with correct byte counting. - assert!(rewritten.contains(r#""link":"//ts.example.com/image.png""#)); - } +/// Calculate the unescaped byte length of a JS string with escape sequences. +/// This accounts for \n, \r, \t, \\, \", \xHH, \uHHHH, and surrogate pairs. +fn calculate_unescaped_byte_length(s: &str) -> usize { + let bytes = s.as_bytes(); + let mut result = 0; + let mut i = 0; - fn config_from_settings( - settings: &Settings, - registry: &IntegrationRegistry, - ) -> HtmlProcessorConfig { - HtmlProcessorConfig::from_settings( - settings, - registry, - "origin.example.com", - "test.example.com", - "https", - ) - } + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + let esc = bytes[i + 1]; - #[test] - fn html_processor_rewrites_nextjs_script_when_enabled() { - let html = r#" - - "#; + // Simple escape sequences: \n, \r, \t, \b, \f, \v, \", \', \\, \/ + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + result += 1; + i += 2; + continue; + } - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + // \xHH - hex escape (1 byte) + if esc == b'x' && i + 3 < bytes.len() { + result += 1; + i += 4; + continue; + } - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let processed = String::from_utf8_lossy(&output); + // \uHHHH - unicode escape + if esc == b'u' && i + 5 < bytes.len() { + let hex = &s[i + 2..i + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + // Check for surrogate pair + if (0xD800..=0xDBFF).contains(&code_unit) + && i + 11 < bytes.len() + && bytes[i + 6] == b'\\' + && bytes[i + 7] == b'u' + { + let hex2 = &s[i + 8..i + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + // Full surrogate pair = 4 UTF-8 bytes + result += 4; + i += 12; + continue; + } + } + } + } - assert!( - processed.contains(r#""href":"https://test.example.com/reviews""#), - "should rewrite https Next.js href values" - ); - assert!( - processed.contains(r#""href":"https://test.example.com/sign-in""#), - "should rewrite http Next.js href values" - ); - assert!( - processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), - "should leave other fields untouched" - ); - assert!( - processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), - "should not rewrite non-href keys" - ); - assert!( - !processed.contains("\"href\":\"https://origin.example.com/reviews\""), - "should remove origin https href" - ); - assert!( - !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), - "should remove origin http href" - ); + // Single unicode escape - calculate UTF-8 byte length + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + result += c.len_utf8(); + i += 6; + continue; + } + } + } + } + + // Regular character - count its UTF-8 byte length + // For ASCII, this is 1 byte + if bytes[i] < 0x80 { + result += 1; + i += 1; + } else { + // Multi-byte UTF-8 character + let c = s[i..].chars().next().unwrap_or('\u{FFFD}'); + result += c.len_utf8(); + i += c.len_utf8(); + } } - #[test] - fn html_processor_rewrites_nextjs_stream_payload() { - let html = r#" - - "#; + result +} - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); +/// Consume a specified number of unescaped bytes from a JS string, returning the end position. +fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usize, usize) { + let bytes = s.as_bytes(); + let mut consumed = 0; + let mut pos = start_pos; - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let processed = String::from_utf8_lossy(&output); - let normalized = processed.replace('\\', ""); - assert!( - normalized.contains("\"href\":\"https://test.example.com/dashboard\""), - "should rewrite escaped href sequences inside streamed payloads: {}", - normalized - ); - assert!( - normalized.contains("\"href\":\"https://test.example.com/secondary\""), - "should rewrite plain href attributes inside streamed payloads" - ); - assert!( - normalized.contains("\"link\":\"https://test.example.com/api-test\""), - "should rewrite additional configured attributes like link" - ); - assert!( - processed.contains("\"dataHost\":\"https://origin.example.com/api\""), - "should leave non-href properties untouched" - ); - } + while pos < bytes.len() && consumed < byte_count { + if bytes[pos] == b'\\' && pos + 1 < bytes.len() { + let esc = bytes[pos + 1]; - #[test] - fn register_respects_enabled_flag() { - let settings = create_test_settings(); - let registration = register(&settings); + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + consumed += 1; + pos += 2; + continue; + } - assert!( - registration.is_none(), - "should skip registration when integration is disabled" - ); + if esc == b'x' && pos + 3 < bytes.len() { + consumed += 1; + pos += 4; + continue; + } + + if esc == b'u' && pos + 5 < bytes.len() { + let hex = &s[pos + 2..pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + if (0xD800..=0xDBFF).contains(&code_unit) + && pos + 11 < bytes.len() + && bytes[pos + 6] == b'\\' + && bytes[pos + 7] == b'u' + { + let hex2 = &s[pos + 8..pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + consumed += 4; + pos += 12; + continue; + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + consumed += c.len_utf8(); + pos += 6; + continue; + } + } + } + } + + if bytes[pos] < 0x80 { + consumed += 1; + pos += 1; + } else { + let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); + consumed += c.len_utf8(); + pos += c.len_utf8(); + } + } + + (pos, consumed) +} + +/// Information about a T-chunk found in the combined RSC content +struct TChunkInfo { + /// The chunk ID (hex string like "1a", "443") + id: String, + /// Position where the T-chunk header starts (e.g., position of "1a:T...") + match_start: usize, + /// Position right after the comma (where content begins) + header_end: usize, + /// Position where the content ends + content_end: usize, +} + +/// Find all T-chunks in the combined RSC content. +/// T-chunks have format: ID:T, +fn find_tchunks(content: &str) -> Vec { + // Match pattern: hex_id:Thex_length, + let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); + let mut chunks = Vec::new(); + let mut search_pos = 0; + + while search_pos < content.len() { + if let Some(cap) = pattern.captures(&content[search_pos..]) { + let m = cap.get(0).unwrap(); + let match_start = search_pos + m.start(); + let header_end = search_pos + m.end(); + + let id = cap.get(1).unwrap().as_str().to_string(); + let length_hex = cap.get(2).unwrap().as_str(); + let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); + + // Consume the declared number of unescaped bytes, skipping markers + let (content_end, _) = consume_unescaped_bytes(content, header_end, declared_length); + + chunks.push(TChunkInfo { + id, + match_start, + header_end, + content_end, + }); + + search_pos = content_end; + } else { + break; + } + } + + chunks +} + +/// Rewrite URLs in a string, handling various URL formats in RSC content. +fn rewrite_rsc_url_string( + s: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> String { + let escaped_origin = escape(origin_host); + + // Match various URL patterns: + // - https://host or http://host + // - //host (protocol-relative) + // - \/\/host (escaped slashes in JSON) + // - \\\/\\\/host (double-escaped) + // - \\\\/\\\\/host (quad-escaped) + let pattern = Regex::new(&format!( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, + escaped_origin + )) + .unwrap(); + + pattern + .replace_all(s, |caps: ®ex::Captures<'_>| { + let slashes = caps.get(3).map_or("//", |m| m.as_str()); + format!("{}:{}{}", request_scheme, slashes, request_host) + }) + .into_owned() +} + +/// Rewrite T-chunks in RSC content, updating lengths after URL rewriting. +/// This works for single scripts where T-chunks don't span script boundaries. +fn rewrite_rsc_tchunks( + content: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> String { + let chunks = find_tchunks(content); + + if chunks.is_empty() { + // No T-chunks, just rewrite URLs in the whole content + return rewrite_rsc_url_string(content, origin_host, request_host, request_scheme); + } + + let mut result = String::with_capacity(content.len()); + let mut last_end = 0; + + for chunk in &chunks { + // Content before this T-chunk (rewrite URLs) + let before = &content[last_end..chunk.match_start]; + result.push_str(&rewrite_rsc_url_string( + before, + origin_host, + request_host, + request_scheme, + )); + + // Extract and rewrite T-chunk content + let chunk_content = &content[chunk.header_end..chunk.content_end]; + let rewritten_content = + rewrite_rsc_url_string(chunk_content, origin_host, request_host, request_scheme); + + // Calculate new byte length + let new_length = calculate_unescaped_byte_length(&rewritten_content); + let new_length_hex = format!("{:x}", new_length); + + // Write new T-chunk header and content + result.push_str(&chunk.id); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + // Remaining content after last T-chunk + let remaining = &content[last_end..]; + result.push_str(&rewrite_rsc_url_string( + remaining, + origin_host, + request_host, + request_scheme, + )); + + result +} + +// ============================================================================= +// Cross-Script RSC Processing +// ============================================================================= +// +// T-chunks can span multiple push scripts. For example: +// - Script 10: "11:null\n1a:T928," (header declares 928 bytes, but script ends) +// - Script 11: "...actual 928 bytes of content..." +// +// To handle this correctly, we must process all scripts together: +// 1. Combine scripts with markers +// 2. Find T-chunks across the combined content (skip markers when counting bytes) +// 3. Rewrite URLs and recalculate lengths +// 4. Split back on markers +// + +/// Marker used to track script boundaries when combining RSC content +const RSC_MARKER: &str = "\x00SPLIT\x00"; + +/// Consume unescaped bytes, skipping RSC markers. +/// Returns (end_position, bytes_consumed) +fn consume_unescaped_bytes_skip_markers( + s: &str, + start_pos: usize, + byte_count: usize, +) -> (usize, usize) { + let bytes = s.as_bytes(); + let mut consumed = 0; + let mut pos = start_pos; + let marker_bytes = RSC_MARKER.as_bytes(); + + while pos < bytes.len() && consumed < byte_count { + // Check for marker - skip it without counting bytes + if pos + marker_bytes.len() <= bytes.len() + && &bytes[pos..pos + marker_bytes.len()] == marker_bytes + { + pos += marker_bytes.len(); + continue; + } + + if bytes[pos] == b'\\' && pos + 1 < bytes.len() { + let esc = bytes[pos + 1]; + + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + consumed += 1; + pos += 2; + continue; + } + + if esc == b'x' && pos + 3 < bytes.len() { + consumed += 1; + pos += 4; + continue; + } + + if esc == b'u' && pos + 5 < bytes.len() { + let hex = &s[pos + 2..pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + if (0xD800..=0xDBFF).contains(&code_unit) + && pos + 11 < bytes.len() + && bytes[pos + 6] == b'\\' + && bytes[pos + 7] == b'u' + { + let hex2 = &s[pos + 8..pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + consumed += 4; + pos += 12; + continue; + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + consumed += c.len_utf8(); + pos += 6; + continue; + } + } + } + } + + if bytes[pos] < 0x80 { + consumed += 1; + pos += 1; + } else { + let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); + consumed += c.len_utf8(); + pos += c.len_utf8(); + } + } + + (pos, consumed) +} + +/// Calculate unescaped byte length excluding RSC markers. +fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { + let without_markers = s.replace(RSC_MARKER, ""); + calculate_unescaped_byte_length(&without_markers) +} + +/// Information about a T-chunk in marker-combined content +struct MarkedTChunkInfo { + id: String, + match_start: usize, + header_end: usize, + content_end: usize, +} + +/// Find T-chunks in marker-combined RSC content. +fn find_tchunks_with_markers(content: &str) -> Vec { + let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); + let mut chunks = Vec::new(); + let mut search_pos = 0; + + while search_pos < content.len() { + if let Some(cap) = pattern.captures(&content[search_pos..]) { + let m = cap.get(0).unwrap(); + let match_start = search_pos + m.start(); + let header_end = search_pos + m.end(); + + let id = cap.get(1).unwrap().as_str().to_string(); + let length_hex = cap.get(2).unwrap().as_str(); + let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); + + // Consume bytes, skipping markers + let (content_end, _) = + consume_unescaped_bytes_skip_markers(content, header_end, declared_length); + + chunks.push(MarkedTChunkInfo { + id, + match_start, + header_end, + content_end, + }); + + search_pos = content_end; + } else { + break; + } + } + + chunks +} + +/// Process multiple RSC script payloads together, handling cross-script T-chunks. +/// +/// This function: +/// 1. Combines all payloads with markers +/// 2. Finds T-chunks across the combined content +/// 3. Rewrites URLs and recalculates T-chunk lengths +/// 4. Splits back on markers to return individual rewritten payloads +/// +/// # Arguments +/// * `payloads` - The string content from each `self.__next_f.push([1, '...'])` call +/// * `origin_host` - The origin host to replace +/// * `request_host` - The request host to use in replacements +/// * `request_scheme` - The scheme (http/https) to use in replacements +/// +/// # Returns +/// A vector of rewritten payloads in the same order as input +pub fn rewrite_rsc_scripts_combined( + payloads: &[&str], + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> Vec { + if payloads.is_empty() { + return Vec::new(); + } + + if payloads.len() == 1 { + // Single script - use simple approach + return vec![rewrite_rsc_tchunks( + payloads[0], + origin_host, + request_host, + request_scheme, + )]; + } + + // Combine payloads with markers + let mut combined = payloads[0].to_string(); + for payload in &payloads[1..] { + combined.push_str(RSC_MARKER); + combined.push_str(payload); + } + + // Find T-chunks in combined content + let chunks = find_tchunks_with_markers(&combined); + + if chunks.is_empty() { + // No T-chunks - just rewrite URLs in each payload + return payloads + .iter() + .map(|p| rewrite_rsc_url_string(p, origin_host, request_host, request_scheme)) + .collect(); + } + + // Build rewritten combined content + let mut result = String::with_capacity(combined.len()); + let mut last_end = 0; + + for chunk in &chunks { + // Content before this T-chunk (rewrite URLs, preserve markers) + let before = &combined[last_end..chunk.match_start]; + result.push_str(&rewrite_rsc_url_string( + before, + origin_host, + request_host, + request_scheme, + )); + + // Extract T-chunk content (may contain markers) + let chunk_content = &combined[chunk.header_end..chunk.content_end]; + + // Rewrite URLs (preserves markers) + let rewritten_content = + rewrite_rsc_url_string(chunk_content, origin_host, request_host, request_scheme); + + // Calculate new byte length (excluding markers) + let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); + let new_length_hex = format!("{:x}", new_length); + + // Write new T-chunk header and content + result.push_str(&chunk.id); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + // Remaining content after last T-chunk + let remaining = &combined[last_end..]; + result.push_str(&rewrite_rsc_url_string( + remaining, + origin_host, + request_host, + request_scheme, + )); + + // Split back on markers + result.split(RSC_MARKER).map(|s| s.to_string()).collect() +} + +/// Information about an RSC push script in HTML +struct RscPushScript { + /// Start position of the payload content (inside the quotes). + payload_start: usize, + /// End position of the payload content (inside the quotes). + payload_end: usize, + /// The payload content (inside the quotes) + payload: String, +} + +/// Find all RSC push scripts in HTML content. +/// Returns scripts in order of appearance. +/// +/// Handles both minified format: `` +/// and prettified format with whitespace: +/// ```html +/// +/// ``` +fn find_rsc_push_scripts(html: &str) -> Vec { + let mut scripts = Vec::new(); + // Match "#).unwrap(); + + let mut search_pos = 0; + + while search_pos < html.len() { + let Some(cap) = pattern.captures(&html[search_pos..]) else { + break; + }; + + let quote_match = cap.get(1).unwrap(); + let quote = quote_match.as_str().chars().next().unwrap(); + let payload_start = search_pos + quote_match.end(); + + // Find the closing quote (handling escapes) + let mut i = payload_start; + let bytes = html.as_bytes(); + while i < bytes.len() { + if bytes[i] == b'\\' { + i += 2; // Skip escape sequence + } else if bytes[i] == quote as u8 { + break; + } else { + i += 1; + } + } + + if i >= bytes.len() { + search_pos = payload_start; + continue; + } + + // After the closing quote, look for ]) with optional whitespace + let after_quote = &html[i + 1..]; + + let Some(ending_match) = ending_pattern.find(after_quote) else { + search_pos = payload_start; + continue; + }; + + let payload = html[payload_start..i].to_string(); + let payload_end = i; + let script_end = i + 1 + ending_match.end(); + + scripts.push(RscPushScript { + payload_start, + payload_end, + payload, + }); + + search_pos = script_end; + } + + scripts +} + +/// Post-process complete HTML to handle cross-script RSC T-chunks. +/// +/// This function: +/// 1. Finds all RSC push scripts in the HTML +/// 2. Extracts their payloads +/// 3. Processes them together using the combined approach +/// 4. Rebuilds the HTML with rewritten scripts +/// +/// This should be called after streaming HTML processing to fix T-chunk lengths +/// that span multiple scripts. +/// +/// # Arguments +/// * `html` - The complete HTML content (must be valid UTF-8) +/// * `origin_host` - The origin host to replace +/// * `request_host` - The request host to use in replacements +/// * `request_scheme` - The scheme (http/https) to use in replacements +/// +/// # Returns +/// The HTML with RSC scripts rewritten to have correct T-chunk lengths +pub fn post_process_rsc_html( + html: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> String { + let scripts = find_rsc_push_scripts(html); + + log::info!( + "post_process_rsc_html: found {} RSC push scripts, origin={}, proxy={}://{}", + scripts.len(), + origin_host, + request_scheme, + request_host + ); + + if scripts.is_empty() { + log::info!("post_process_rsc_html: no RSC scripts found, returning unchanged"); + return html.to_string(); + } + + // Extract payloads + let payloads: Vec<&str> = scripts.iter().map(|s| s.payload.as_str()).collect(); + + // Count origin URLs before rewriting + let origin_count_before: usize = payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + log::info!( + "post_process_rsc_html: {} occurrences of '{}' in payloads before rewriting", + origin_count_before, + origin_host + ); + + // Process all scripts together + let rewritten_payloads = + rewrite_rsc_scripts_combined(&payloads, origin_host, request_host, request_scheme); + + // Count origin URLs after rewriting + let origin_count_after: usize = rewritten_payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + let proxy_count: usize = rewritten_payloads + .iter() + .map(|p| p.matches(request_host).count()) + .sum(); + log::info!( + "post_process_rsc_html: after rewriting - {} origin URLs remaining, {} proxy URLs", + origin_count_after, + proxy_count + ); + + // Replace payload contents in-place (apply replacements in reverse order to keep indices valid). + let mut result = html.to_string(); + for (i, script) in scripts.iter().enumerate().rev() { + result.replace_range( + script.payload_start..script.payload_end, + &rewritten_payloads[i], + ); + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; + use crate::integrations::{IntegrationRegistry, IntegrationScriptContext, ScriptRewriteAction}; + use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; + use crate::test_support::tests::create_test_settings; + use serde_json::json; + use std::io::Cursor; + + fn test_config() -> Arc { + Arc::new(NextJsIntegrationConfig { + enabled: true, + rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + }) + } + + fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { + IntegrationScriptContext { + selector, + request_host: "ts.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + } + } + + #[test] + fn structured_rewriter_updates_next_data_payload() { + let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; + let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); + let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); + + match result { + ScriptRewriteAction::Replace(value) => { + // Note: URLs may have padding for length preservation + assert!(value.contains("ts.example.com") && value.contains("/reviews")); + assert!(value.contains("ts.example.com") && value.contains("/sign-in")); + assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); + assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); + } + _ => panic!("Expected rewrite to update payload"), + } + } + + #[test] + fn streamed_rewriter_skips_non_next_payloads() { + // The streamed rewriter skips RSC push scripts (self.__next_f.push) + // because these are handled by the post-processor for cross-script T-chunks. + let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); + + // Non-Next.js scripts should be skipped + let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); + assert!(matches!(noop, ScriptRewriteAction::Keep)); + + // RSC push payloads should be skipped (handled by post-processor) + let payload = + r#"self.__next_f.push([1, "{\"href\":\"https://origin.example.com/app\"}"]);"#; + let result = rewriter.rewrite(payload, &ctx("script")); + assert!( + matches!(result, ScriptRewriteAction::Keep), + "Streamed rewriter should skip __next_f.push payloads (handled by post-processor)" + ); + + // Other __next_f scripts (like initialization) should still be processed + let init_script = r#"(self.__next_f = self.__next_f || []).push([0]); var url = "https://origin.example.com/api";"#; + let init_result = rewriter.rewrite(init_script, &ctx("script")); + // This might or might not be rewritten depending on content - just verify it runs + assert!( + matches!( + init_result, + ScriptRewriteAction::Keep | ScriptRewriteAction::Replace(_) + ), + "Streamed rewriter should handle non-push __next_f scripts" + ); + } + + #[test] + fn rewrite_helper_handles_protocol_relative_urls() { + let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; + let rewritten = rewrite_nextjs_values( + content, + "origin.example.com", + "ts.example.com", + "https", + &["link".into()], + false, // preserve_length=false for non-RSC content + ) + .expect("should rewrite protocol relative link"); + + // Note: URLs may have padding for length preservation + assert!(rewritten.contains("ts.example.com") && rewritten.contains("/image.png")); + } + + fn config_from_settings( + settings: &Settings, + registry: &IntegrationRegistry, + ) -> HtmlProcessorConfig { + HtmlProcessorConfig::from_settings( + settings, + registry, + "origin.example.com", + "test.example.com", + "https", + ) + } + + #[test] + fn html_processor_rewrites_nextjs_script_when_enabled() { + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + + // Note: URLs may have padding characters for length preservation + assert!( + processed.contains("test.example.com") && processed.contains("/reviews"), + "should rewrite https Next.js href values to test.example.com" + ); + assert!( + processed.contains("test.example.com") && processed.contains("/sign-in"), + "should rewrite http Next.js href values to test.example.com" + ); + assert!( + processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), + "should leave other fields untouched" + ); + assert!( + processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), + "should not rewrite non-href keys" + ); + assert!( + !processed.contains("\"href\":\"https://origin.example.com/reviews\""), + "should remove origin https href" + ); + assert!( + !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), + "should remove origin http href" + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // The streaming phase skips RSC push scripts, and the post-processor handles them + // to correctly handle cross-script T-chunks. + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + // Apply post-processing (this is what handles RSC push scripts) + let processed_str = String::from_utf8_lossy(&output); + let final_html = post_process_rsc_html( + &processed_str, + "origin.example.com", + "test.example.com", + "https", + ); + + // RSC payloads should be rewritten via post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", + final_html + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_chunked_input() { + // RSC payloads are rewritten via post-processing, even with chunked streaming input + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 32, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + // Apply post-processing (this is what handles RSC push scripts) + let processed_str = String::from_utf8_lossy(&output); + let final_html = post_process_rsc_html( + &processed_str, + "origin.example.com", + "test.example.com", + "https", + ); + + // RSC payloads should be rewritten via post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", + final_html + ); + } + + #[test] + fn register_respects_enabled_flag() { + let settings = create_test_settings(); + let registration = register(&settings); + + assert!( + registration.is_none(), + "should skip registration when integration is disabled" + ); + } + + #[test] + fn html_processor_rewrites_rsc_payloads_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // This allows navigation to stay on proxy while correctly handling cross-script T-chunks. + + let html = r#" + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["url"], + }), + ) + .expect("should update nextjs config"); + + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + // Apply post-processing (this is what handles RSC push scripts) + let processed_str = String::from_utf8_lossy(&output); + let final_html = post_process_rsc_html( + &processed_str, + "origin.example.com", + "test.example.com", + "https", + ); + + println!("=== Final HTML ==="); + println!("{}", final_html); + + // RSC payloads should be rewritten via post-processing + assert!( + final_html.contains("test.example.com"), + "RSC payload URLs should be rewritten to proxy host. Output: {}", + final_html + ); + + // Verify the RSC payload structure is preserved + assert!( + final_html.contains(r#""ID":879000"#), + "RSC payload ID should be preserved" + ); + assert!( + final_html.contains(r#""title":"Makes""#), + "RSC payload title should be preserved" + ); + assert!( + final_html.contains(r#""children":"$45a""#), + "RSC payload children reference should be preserved" + ); + + // Verify \n separators are preserved (crucial for RSC parsing) + assert!( + final_html.contains(r#"\n442:"#), + "RSC record separator \\n should be preserved. Output: {}", + final_html + ); + } + + #[test] + fn test_tchunk_length_recalculation() { + // Test that T-chunk lengths are correctly recalculated after URL rewriting. + // T-chunk format: ID:T, + // The hex_length is the UNESCAPED byte count of the content. + + // Original content: {"url":"https://origin.example.com/path"} = 41 bytes = 0x29 + // After rewriting: {"url":"https://test.example.com/path"} = 39 bytes = 0x27 + // (origin.example.com is 18 chars, test.example.com is 16 chars - shrinks by 2) + let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; + let result = + rewrite_rsc_tchunks(content, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T27,"), + "T-chunk length should be updated from 29 (41) to 27 (39). Got: {}", + result + ); + } + + #[test] + fn test_tchunk_length_recalculation_with_length_increase() { + // Test that T-chunk lengths are correctly recalculated when URL length increases. + // Original: short.io (8 chars) -> test.example.com (16 chars) - grows by 8 + + // Content: {"url":"https://short.io/x"} = 28 bytes = 0x1c + // After: {"url":"https://test.example.com/x"} = 36 bytes = 0x24 + let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; + let result = rewrite_rsc_tchunks(content, "short.io", "test.example.com", "https"); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T24,"), + "T-chunk length should be updated from 1c (28) to 24 (36). Got: {}", + result + ); + } + + #[test] + fn test_calculate_unescaped_byte_length() { + // Test the unescaped byte length calculation + assert_eq!(calculate_unescaped_byte_length("hello"), 5); + assert_eq!(calculate_unescaped_byte_length(r#"\n"#), 1); // \n = 1 byte + assert_eq!(calculate_unescaped_byte_length(r#"\r\n"#), 2); // \r\n = 2 bytes + assert_eq!(calculate_unescaped_byte_length(r#"\""#), 1); // \" = 1 byte + assert_eq!(calculate_unescaped_byte_length(r#"\\"#), 1); // \\ = 1 byte + assert_eq!(calculate_unescaped_byte_length(r#"\x41"#), 1); // \x41 = 'A' = 1 byte + assert_eq!(calculate_unescaped_byte_length(r#"\u0041"#), 1); // \u0041 = 'A' = 1 byte + assert_eq!(calculate_unescaped_byte_length(r#"\u00e9"#), 2); // \u00e9 = 'é' = 2 UTF-8 bytes + } + + #[test] + fn test_multiple_tchunks() { + // Test content with multiple T-chunks + let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; + let result = rewrite_rsc_tchunks(content, "short.io", "test.example.com", "https"); + + // Both T-chunks should have updated lengths + assert!( + result.contains("test.example.com"), + "URLs should be rewritten" + ); + // Both chunks should have new length 0x24 (36 bytes) + let count = result.matches(":T24,").count(); + assert_eq!(count, 2, "Both T-chunks should have updated lengths"); + } + + #[test] + fn test_cross_script_tchunk_rewriting() { + // Test T-chunks that span multiple scripts. + // This is the key scenario that breaks per-script processing. + // + // Script 0: Contains a T-chunk header that declares more content than is in this script + // Script 1: Contains the rest of the T-chunk content, including URLs that need rewriting + + // T-chunk declares 64 bytes (0x40), but script 0 only has partial content + let script0 = r#"other:data\n1a:T40,partial content"#; + // Script 1 has the rest of the T-chunk content with a URL + let script1 = r#" with https://origin.example.com/page goes here"#; + + // Check the actual combined byte lengths + let combined_content = "partial content with https://origin.example.com/page goes here"; + let combined_len = calculate_unescaped_byte_length(combined_content); + println!( + "Combined T-chunk content length: {} bytes = 0x{:x}", + combined_len, combined_len + ); + + // Process using combined approach + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + println!("Results[0]: {}", results[0]); + println!("Results[1]: {}", results[1]); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + + // The URL should be rewritten in script 1 + assert!( + results[1].contains("test.example.com"), + "URL in script 1 should be rewritten. Got: {}", + results[1] + ); + + // The T-chunk header in script 0 should have updated length + // Let's check what the new length actually is + let rewritten_content = "partial content with https://test.example.com/page goes here"; + let rewritten_len = calculate_unescaped_byte_length(rewritten_content); + println!( + "Rewritten T-chunk content length: {} bytes = 0x{:x}", + rewritten_len, rewritten_len + ); + + let expected_header = format!(":T{:x},", rewritten_len); + assert!( + results[0].contains(&expected_header), + "T-chunk length in script 0 should be updated to {}. Got: {}", + expected_header, + results[0] + ); + } + + #[test] + fn test_cross_script_preserves_non_tchunk_content() { + // Test that content outside T-chunks is still rewritten correctly + let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T40,partial"#; + let script1 = r#" content with https://origin.example.com/page end"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + // URL outside T-chunk in script 0 should be rewritten + assert!( + results[0].contains("test.example.com/first"), + "URL outside T-chunk should be rewritten. Got: {}", + results[0] + ); + + // URL inside T-chunk (spanning scripts) should be rewritten + assert!( + results[1].contains("test.example.com/page"), + "URL inside cross-script T-chunk should be rewritten. Got: {}", + results[1] + ); + } + + #[test] + fn test_post_process_rsc_html() { + // Test the complete HTML post-processing function + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + // The URL should be rewritten + assert!( + result.contains("test.example.com/page"), + "URL should be rewritten. Got: {}", + result + ); + + // The T-chunk length should be updated + assert!( + result.contains(":T3c,"), + "T-chunk length should be updated. Got: {}", + result + ); + + // HTML structure should be preserved + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn test_post_process_rsc_html_with_prettified_format() { + // Test with prettified HTML format (newlines and whitespace between elements) + // This is the format Next.js uses when outputting non-minified HTML + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + // Both URLs should be rewritten + assert!( + result.contains("test.example.com/news"), + "First URL should be rewritten. Got: {}", + result + ); + assert!( + result.contains("test.example.com/reviews"), + "Second URL should be rewritten. Got: {}", + result + ); + + // No origin URLs should remain + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + + // HTML structure should be preserved + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn test_post_process_html_with_html_href_in_tchunk() { + // Test that HTML href attributes inside T-chunks are rewritten + // This is the format where HTML markup is embedded in RSC T-chunk content + let html = r#" + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + // The HTML href URL should be rewritten + assert!( + result.contains("test.example.com/about-us"), + "HTML href URL in T-chunk should be rewritten. Got: {}", + result + ); + + // No origin URLs should remain + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + + // Verify T-chunk length was recalculated + // Original content: \u003cdiv\u003e\u003ca href="https://origin.example.com/about-us"\u003eAbout\u003c/a\u003e\u003c/div\u003e + // After rewrite, URL is shorter so T-chunk length should be smaller + assert!( + !result.contains(":T4d9,"), + "T-chunk length should have been recalculated (original was 4d9). Got: {}", + result + ); + } +} + +#[cfg(test)] +mod truncated_string_tests { + use super::*; + + #[test] + fn test_truncated_string_parsing() { + // This simulates a Next.js chunk that's been split mid-string + // With pure regex rewriting, truncated strings without closing quotes + // simply won't match, which is the desired behavior + let truncated = r#"self.__next_f.push([ + 1, + '430:I[6061,["749","static/chunks/16bf9003-553c36acd7d8a04b.js","4669","static/chun' +]);"#; + + // The regex pattern requires a closing quote after the URL, + // so truncated content without URLs won't be modified + let result = rewrite_nextjs_values( + truncated, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length=true for RSC payloads + ); + println!("Rewrite result: {:?}", result); + // Should return None since no matching URL patterns exist + assert!( + result.is_none(), + "Truncated content without URLs should not be modified" + ); + } + + #[test] + fn test_complete_string_with_url() { + // A complete Next.js chunk with a URL that should be rewritten + let complete = r#"self.__next_f.push([ + 1, + '{"url":"https://origin.example.com/path/to/resource"}' +]);"#; + + let result = rewrite_nextjs_values( + complete, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length=true for RSC payloads + ); + println!("Complete string rewrite: {:?}", result); + assert!(result.is_some()); + let rewritten = result.unwrap(); + // Note: URL may have padding for length preservation + assert!(rewritten.contains("proxy.example.com") && rewritten.contains("/path/to/resource")); + } + + #[test] + fn test_truncated_url_rewrite() { + // URL that starts in this chunk but continues in the next + // Like: "url":"https://origin.example.com/some/path?param=%20 + // where the closing quote is in the next chunk + let truncated_url = r#"self.__next_f.push([ + 1, + '\"url\":\"https://origin.example.com/rss?title=%20' +]);"#; + + println!("Input with truncated URL:"); + println!("{}", truncated_url); + + let result = rewrite_nextjs_values( + truncated_url, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length=true for RSC payloads + ); + println!("Truncated URL rewrite result: {:?}", result); + + // The regex pattern requires a closing quote after the URL path, + // so URLs without closing quotes won't be matched (preventing corruption) + // This is actually the desired behavior - incomplete URLs are left alone + assert!( + result.is_none(), + "Truncated URL without closing quote should not be modified" + ); + } + + #[test] + fn test_embedded_pattern_incomplete_url() { + // Test the regex directly with an incomplete URL + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + + // This string has an incomplete URL - it starts but doesn't close properly + // within the string boundaries + let incomplete = r#"\"url\":\"https://origin.example.com/rss?title=%20"#; + println!("Testing embedded pattern on incomplete URL:"); + println!("Input: {}", incomplete); + + let result = rewriter.rewrite_embedded(incomplete); + println!("Result: {:?}", result); + + // Now test with a complete URL + let complete = r#"\"url\":\"https://origin.example.com/complete\""#; + println!("\nTesting embedded pattern on complete URL:"); + println!("Input: {}", complete); + + let result = rewriter.rewrite_embedded(complete); + println!("Result: {:?}", result); + } + + #[test] + fn test_split_chunk_url_corruption() { + // This is the EXACT scenario that breaks React hydration! + // The URL is split across two Next.js chunks. + + // Chunk 1: Contains the start of the URL + // Note: In Next.js RSC, double quotes inside single-quoted strings are NOT escaped + let chunk1 = r#"self.__next_f.push([ + 1, + '336:{"url":"https://origin.example.com/.rss/feed/3d70fbb5-ef5e-44f3-a547-e60939496e82.xml?title=Latest%20Car%20News%3A%20Trucks%2C%20SUVs%2C%20EVs%2C%20Reviews%20%26%20' +]);"#; + + // Chunk 2: Contains the continuation of the URL + let chunk2 = r#"self.__next_f.push([ + 1, + 'Auto%20Trends"}\n337:{"url":"https://origin.example.com/complete"}' +]);"#; + + println!("=== Chunk 1 (truncated URL start) ==="); + println!("{}", chunk1); + + let result1 = rewrite_nextjs_values( + chunk1, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + println!("\nRewritten Chunk 1: {:?}", result1); + + // CRITICAL CHECK: The rewritten chunk should have the SAME quote escaping as the original + // If original has unescaped " inside ', the rewritten should too + if let Some(ref r1) = result1 { + println!("\n=== Quote escaping analysis ==="); + println!( + "Original has '336:{{\"url\":' (with backslash-quote): {}", + chunk1.contains(r#"\"url\""#) + ); + println!( + "Original has '336:{{\"url\":' (unescaped quote): {}", + chunk1.contains(r#"{"url":"#) + ); + println!("Rewritten has backslash-quote: {}", r1.contains(r#"\""#)); + println!( + "Rewritten has unescaped quote: {}", + r1.contains(r#"{"url":"#) + ); + + // The bug: original has unescaped ", but rewritten might have escaped \" + // This would change the JavaScript string content! + let original_has_backslash = chunk1.contains(r#"\""#); + let rewritten_has_backslash = r1.contains(r#"\""#); + + if !original_has_backslash && rewritten_has_backslash { + println!("\n!!! BUG DETECTED !!!"); + println!("The rewriter is ADDING backslash escapes that weren't in the original!"); + println!("This corrupts the JavaScript string content!"); + } + } + + println!("\n=== Chunk 2 (URL continuation) ==="); + println!("{}", chunk2); + + let result2 = rewrite_nextjs_values( + chunk2, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + println!("\nRewritten Chunk 2: {:?}", result2); + + // Let's verify the complete URL in chunk2 is rewritten + if let Some(ref rewritten2) = result2 { + assert!( + rewritten2.contains("proxy.example.com") && rewritten2.contains("/complete"), + "Complete URL in chunk2 should be rewritten to new host with /complete path" + ); + } + } + + #[test] + fn test_embedded_regex_pattern() { + // Test the regex pattern directly to understand what it matches + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + + // Test 1: Unescaped double quotes (as in single-quoted JS string) + let unescaped = r#"'336:{"url":"https://origin.example.com/path"}'"#; + println!("Test 1 - Unescaped quotes:"); + println!(" Input: {}", unescaped); + let result = rewriter.rewrite_embedded(unescaped); + println!(" Result: {:?}", result); + + // Test 2: Escaped double quotes (as in double-quoted JS string or JSON) + let escaped = r#"'336:{\"url\":\"https://origin.example.com/path\"}'"#; + println!("\nTest 2 - Escaped quotes:"); + println!(" Input: {}", escaped); + let result = rewriter.rewrite_embedded(escaped); + println!(" Result: {:?}", result); + + // Test 3: Double-escaped quotes (as in JSON string inside JS string) + let double_escaped = r#"'336:{\\"url\\":\\"https://origin.example.com/path\\"}'"#; + println!("\nTest 3 - Double-escaped quotes:"); + println!(" Input: {}", double_escaped); + let result = rewriter.rewrite_embedded(double_escaped); + println!(" Result: {:?}", result); + } + + #[test] + fn test_backslash_n_preservation() { + // Critical test: Check that \n (backslash-n) is preserved byte-for-byte + // This is crucial because RSC payloads use \n as a record separator + + // String with literal backslash-n (two bytes: 0x5C 0x6E) + let input = + r#"self.__next_f.push([1, 'foo\n{"url":"https://origin.example.com/test"}\nbar']);"#; + + // Verify input has literal backslash-n + let backslash_n_pos = input.find(r"\n").unwrap(); + assert_eq!( + &input.as_bytes()[backslash_n_pos..backslash_n_pos + 2], + [0x5C, 0x6E], // backslash, n + "Input should have literal backslash-n" + ); + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URL"); + + // Check the rewritten string still has literal backslash-n + let new_pos = rewritten.find(r"\n").unwrap(); + assert_eq!( + &rewritten.as_bytes()[new_pos..new_pos + 2], + [0x5C, 0x6E], + "Rewritten should preserve literal backslash-n" + ); + + // Count number of \n occurrences + let original_count = input.matches(r"\n").count(); + let rewritten_count = rewritten.matches(r"\n").count(); + assert_eq!( + original_count, rewritten_count, + "Number of \\n occurrences should be preserved" + ); + + println!("Input: {}", input); + println!("Rewritten: {}", rewritten); + println!( + "\\n count: original={}, rewritten={}", + original_count, rewritten_count + ); + } + + #[test] + fn test_url_rewriting_basic() { + // Test that URL rewriting works correctly while preserving the original scheme + let input = r#"self.__next_f.push([1, '{"url":"https://origin.example.com/news"}']);"#; + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", // request_scheme is now ignored - original scheme is preserved + &["url".into()], + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URL"); + + println!("Original: {}", input); + println!("Rewritten: {}", rewritten); + + // Verify the URL was rewritten correctly, preserving the original https scheme + // With length preservation, URLs may have padding like /./././ + assert!( + rewritten.contains("http://proxy.example.com") && rewritten.contains("/news"), + "URL should be rewritten to new host with path, preserving https scheme. Got: {}", + rewritten + ); + assert!( + !rewritten.contains("origin.example.com"), + "URL should not contain original host" + ); + } + + #[test] + fn test_url_rewriting_preserves_rsc_structure() { + // Test that RSC record structure is preserved after rewriting + let input = r#"self.__next_f.push([1, '443:{"url":"https://origin.example.com/path"}\n444:{"other":"data"}']);"#; + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", // request_scheme is now ignored - original scheme is preserved + &["url".into()], + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URL"); + + println!("Original: {}", input); + println!("Rewritten: {}", rewritten); + + // Verify URL was rewritten (preserving https scheme) + // With length preservation, URLs may have padding like /./././ + assert!( + rewritten.contains("http://proxy.example.com") && rewritten.contains("/path"), + "URL should be rewritten with preserved https scheme. Got: {}", + rewritten + ); + + // Verify record structure is intact - both records should still be parseable + assert!( + rewritten.contains(r#"\n444:"#), + "RSC record separator and next record ID must be preserved" + ); + assert!( + rewritten.contains(r#""other":"data""#), + "Subsequent record data must be preserved" + ); + } + + #[test] + fn test_nav_menu_rewrite() { + // Test a typical navigation menu payload + // This is the payload that contains the dropdown menu items + let input = r#"self.__next_f.push([ + 1, + '443:{"ID":878799,"title":"News","slug":"","post_parent":"0","guid":"pt000000000000000700000000000d68cf","menu_item_parent":"0","object_id":"category","url":"https://origin.example.com/news","target":"","attr_title":"","description":"","classes":"$444","menu_order":0,"post_type":"nav_menu_item","post_mime_type":"","object":"category","type":"taxonomy","type_label":"Category","menu_item_type":"taxonomy","hide_on_subnav":false,"children":"$445"}\n444:[""]\n445:[]' +]);"#; + + println!("=== Original Input ==="); + println!("{}", input); + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URL"); + + println!("\n=== Rewritten Output ==="); + println!("{}", rewritten); + + // Verify the URL was rewritten using request scheme (http) + // With length preservation, URL may have padding like /./././ + assert!( + rewritten.contains("http://proxy.example.com") && rewritten.contains("/news"), + "URL should be rewritten to new host with request scheme. Got: {}", + rewritten + ); + assert!( + !rewritten.contains("origin.example.com"), + "Original host should not remain" + ); + + // Verify RSC structure is preserved + assert!( + rewritten.contains(r#""ID":878799"#), + "Record ID should be preserved" + ); + assert!( + rewritten.contains(r#""title":"News""#), + "Title should be preserved" + ); + assert!( + rewritten.contains(r#""classes":"$444""#), + "$444 reference should be preserved" + ); + assert!( + rewritten.contains(r#""children":"$445""#), + "$445 reference should be preserved" + ); + assert!( + rewritten.contains(r#"\n444:[""]"#), + "Record 444 should be preserved" + ); + assert!( + rewritten.contains(r#"\n445:[]"#), + "Record 445 should be preserved" + ); + + // Critical: Verify the JavaScript is still valid + // The string must be properly quoted and escaped + assert!( + rewritten.starts_with("self.__next_f.push(["), + "Should start with valid JS" + ); + assert!(rewritten.ends_with("]);"), "Should end with valid JS"); + + // Check byte length difference + let orig_len = input.len(); + let new_len = rewritten.len(); + println!("\n=== Length Analysis ==="); + println!("Original length: {}", orig_len); + println!("Rewritten length: {}", new_len); + println!("Difference: {} bytes", (orig_len as i64) - (new_len as i64)); + } + + #[test] + fn test_site_base_url_rewrite() { + // Test that siteBaseUrl gets rewritten alongside url attributes + // This is critical for React navigation to work correctly - if siteBaseUrl + // doesn't match the rewritten URLs, React may treat links as external + let input = r#"self.__next_f.push([1, '{"siteBaseUrl":"https://origin.example.com","url":"https://origin.example.com/news"}']);"#; + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", // request_scheme is now ignored - original scheme is preserved + &["url".into(), "siteBaseUrl".into()], // Include siteBaseUrl + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URLs"); + + println!("Original: {}", input); + println!("Rewritten: {}", rewritten); + + // Both url and siteBaseUrl should be rewritten, preserving https scheme + // With length preservation, URLs may have padding + assert!( + rewritten.contains("http://proxy.example.com"), + "siteBaseUrl should be rewritten to match proxy host, preserving https. Got: {}", + rewritten + ); + assert!( + rewritten.contains("/news"), + "url path should be preserved. Got: {}", + rewritten + ); + assert!( + !rewritten.contains("origin.example.com"), + "Original host should not remain" + ); + } + + #[test] + fn test_site_production_domain_rewrite() { + // Test that siteProductionDomain (bare hostname without scheme) gets rewritten + // This is critical because Next.js uses this to determine if URLs are internal + let input = r#"self.__next_f.push([1, '{"siteProductionDomain":"origin.example.com","url":"https://origin.example.com/news"}']);"#; + + let result = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", // request_scheme is now ignored - original scheme is preserved + &["url".into(), "siteProductionDomain".into()], + true, // preserve_length for RSC payloads + ); + + let rewritten = result.expect("should rewrite URLs"); + + println!("Original: {}", input); + println!("Rewritten: {}", rewritten); + + // siteProductionDomain and URL should be rewritten, with possible length padding + assert!( + rewritten.contains("proxy.example.com"), + "siteProductionDomain should be rewritten to proxy host. Got: {}", + rewritten + ); + // URL should contain the path + assert!( + rewritten.contains("/news"), + "url path should be preserved. Got: {}", + rewritten + ); + assert!( + !rewritten.contains("origin.example.com"), + "Original host should not remain" + ); + } + + #[test] + fn test_calculate_padding() { + // Test whitespace padding calculation + // When new URL is shorter, we need spaces to compensate + let padding = UrlRewriter::calculate_padding(21, 24); + assert_eq!(padding.len(), 3, "Should need 3 spaces"); + assert_eq!(padding, " ", "Should be 3 spaces"); + + // No padding when lengths are equal + let padding = UrlRewriter::calculate_padding(24, 24); + assert_eq!(padding.len(), 0); + + // No padding when new URL is longer + let padding = UrlRewriter::calculate_padding(30, 24); + assert_eq!(padding.len(), 0); + } + + #[test] + fn test_whitespace_padding_rewrite() { + // Test that URL rewriting returns proper (url, padding) tuple + // Original: https://origin.example.com/news (31 chars) + // New URL: http://proxy.example.com/news (29 chars) + // Padding needed: 2 spaces + + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, // preserve_length + ); + + let original_url = "https://origin.example.com/news"; + let result = rewriter.rewrite_url_value(original_url); + + assert!(result.is_some(), "URL should be rewritten"); + let (new_url, padding) = result.unwrap(); + + // Check the URL is correctly rewritten + assert_eq!(new_url, "http://proxy.example.com/news"); + assert!(new_url.contains("proxy.example.com")); + assert!(new_url.contains("/news")); + + // Check padding compensates for length difference + let original_len = original_url.len(); // 33 + let new_len = new_url.len(); // 26 + assert_eq!( + padding.len(), + original_len - new_len, + "Padding should be {} spaces", + original_len - new_len + ); + assert_eq!(padding, " ", "Should be 2 spaces"); + + // Total length (url + padding) should match original + assert_eq!( + new_url.len() + padding.len(), + original_url.len(), + "URL + padding should equal original length" + ); + } + + #[test] + fn test_no_padding_when_disabled() { + // When preserve_length is false, no padding should be returned + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + false, // preserve_length disabled + ); + + let result = rewriter.rewrite_url_value("https://origin.example.com/news"); + assert!(result.is_some()); + let (new_url, padding) = result.unwrap(); + + assert_eq!(new_url, "http://proxy.example.com/news"); + assert_eq!(padding, "", "No padding when preserve_length is false"); } } diff --git a/crates/common/src/integrations/registry.rs b/crates/common/src/integrations/registry.rs index 9db8033..a6cb984 100644 --- a/crates/common/src/integrations/registry.rs +++ b/crates/common/src/integrations/registry.rs @@ -249,12 +249,34 @@ pub trait IntegrationScriptRewriter: Send + Sync { fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction; } +/// Context for HTML post-processors. +#[derive(Debug)] +pub struct IntegrationHtmlContext<'a> { + pub request_host: &'a str, + pub request_scheme: &'a str, + pub origin_host: &'a str, +} + +/// Trait for integration-provided HTML post-processors. +/// These run after streaming HTML processing to handle cases that require +/// access to the complete HTML (e.g., cross-script RSC T-chunks). +pub trait IntegrationHtmlPostProcessor: Send + Sync { + /// Identifier for logging/diagnostics. + fn integration_id(&self) -> &'static str; + + /// Post-process complete HTML content. + /// This is called after streaming HTML processing with the complete HTML. + /// Return the modified HTML or the original if no changes needed. + fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String; +} + /// Registration payload returned by integration builders. pub struct IntegrationRegistration { pub integration_id: &'static str, pub proxies: Vec>, pub attribute_rewriters: Vec>, pub script_rewriters: Vec>, + pub html_post_processors: Vec>, } impl IntegrationRegistration { @@ -276,6 +298,7 @@ impl IntegrationRegistrationBuilder { proxies: Vec::new(), attribute_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), }, } } @@ -301,6 +324,15 @@ impl IntegrationRegistrationBuilder { self } + #[must_use] + pub fn with_html_post_processor( + mut self, + processor: Arc, + ) -> Self { + self.registration.html_post_processors.push(processor); + self + } + #[must_use] pub fn build(self) -> IntegrationRegistration { self.registration @@ -321,6 +353,7 @@ struct IntegrationRegistryInner { routes: Vec<(IntegrationEndpoint, &'static str)>, html_rewriters: Vec>, script_rewriters: Vec>, + html_post_processors: Vec>, } impl Default for IntegrationRegistryInner { @@ -334,6 +367,7 @@ impl Default for IntegrationRegistryInner { routes: Vec::new(), html_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), } } } @@ -415,6 +449,9 @@ impl IntegrationRegistry { inner .script_rewriters .extend(registration.script_rewriters.into_iter()); + inner + .html_post_processors + .extend(registration.html_post_processors.into_iter()); } } @@ -493,6 +530,11 @@ impl IntegrationRegistry { self.inner.script_rewriters.clone() } + /// Expose registered HTML post-processors. + pub fn html_post_processors(&self) -> Vec> { + self.inner.html_post_processors.clone() + } + /// Provide a snapshot of registered integrations and their hooks. pub fn registered_integrations(&self) -> Vec { let mut map: BTreeMap<&'static str, IntegrationMetadata> = BTreeMap::new(); @@ -538,6 +580,7 @@ impl IntegrationRegistry { routes: Vec::new(), html_rewriters: attribute_rewriters, script_rewriters, + html_post_processors: Vec::new(), }), } } @@ -580,6 +623,7 @@ impl IntegrationRegistry { routes: Vec::new(), html_rewriters: Vec::new(), script_rewriters: Vec::new(), + html_post_processors: Vec::new(), }), } } diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs index fc0b888..c8df91d 100644 --- a/crates/common/src/lib.rs +++ b/crates/common/src/lib.rs @@ -37,6 +37,7 @@ pub mod openrtb; pub mod proxy; pub mod publisher; pub mod request_signing; +pub mod rsc_flight; pub mod settings; pub mod settings_data; pub mod streaming_processor; diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index a8c13a2..d0ce6e8 100644 --- a/crates/common/src/publisher.rs +++ b/crates/common/src/publisher.rs @@ -1,6 +1,7 @@ use error_stack::{Report, ResultExt}; use fastly::http::{header, StatusCode}; use fastly::{Body, Request, Response}; +use std::io::Write; use crate::backend::ensure_backend_from_url; use crate::http_util::serve_static_with_etag; @@ -8,12 +9,67 @@ use crate::http_util::serve_static_with_etag; use crate::constants::{HEADER_SYNTHETIC_TRUSTED_SERVER, HEADER_X_COMPRESS_HINT}; use crate::cookies::create_synthetic_cookie; use crate::error::TrustedServerError; -use crate::integrations::IntegrationRegistry; +use crate::integrations::{IntegrationHtmlContext, IntegrationRegistry}; +use crate::rsc_flight::RscFlightUrlRewriter; use crate::settings::Settings; use crate::streaming_processor::{Compression, PipelineConfig, StreamProcessor, StreamingPipeline}; use crate::streaming_replacer::create_url_replacer; use crate::synthetic::get_or_generate_synthetic_id; +/// Compress data using the specified compression algorithm +fn compress_data( + data: &[u8], + compression: Compression, +) -> Result, Report> { + match compression { + Compression::None => Ok(data.to_vec()), + Compression::Gzip => { + use flate2::write::GzEncoder; + use flate2::Compression as GzCompression; + let mut encoder = GzEncoder::new(Vec::new(), GzCompression::default()); + encoder + .write_all(data) + .change_context(TrustedServerError::Proxy { + message: "Failed to gzip compress data".to_string(), + })?; + encoder.finish().change_context(TrustedServerError::Proxy { + message: "Failed to finish gzip compression".to_string(), + }) + } + Compression::Deflate => { + use flate2::write::ZlibEncoder; + use flate2::Compression as ZlibCompression; + let mut encoder = ZlibEncoder::new(Vec::new(), ZlibCompression::default()); + encoder + .write_all(data) + .change_context(TrustedServerError::Proxy { + message: "Failed to deflate compress data".to_string(), + })?; + encoder.finish().change_context(TrustedServerError::Proxy { + message: "Failed to finish deflate compression".to_string(), + }) + } + Compression::Brotli => { + use brotli::enc::writer::CompressorWriter; + use brotli::enc::BrotliEncoderParams; + let params = BrotliEncoderParams { + quality: 4, // Balance speed and compression + ..Default::default() + }; + let mut output = Vec::new(); + { + let mut writer = CompressorWriter::with_params(&mut output, 4096, ¶ms); + writer + .write_all(data) + .change_context(TrustedServerError::Proxy { + message: "Failed to brotli compress data".to_string(), + })?; + } + Ok(output) + } + } +} + /// Detects the request scheme (HTTP or HTTPS) using Fastly SDK methods and headers. /// /// Tries multiple methods in order of reliability: @@ -116,6 +172,15 @@ fn process_response_streaming( ) -> Result> { // Check if this is HTML content let is_html = params.content_type.contains("text/html"); + let is_rsc_flight = params.content_type.contains("text/x-component"); + log::info!( + "process_response_streaming: content_type={}, content_encoding={}, is_html={}, is_rsc_flight={}, origin_host={}", + params.content_type, + params.content_encoding, + is_html, + is_rsc_flight, + params.origin_host + ); // Determine compression type let compression = Compression::from_content_encoding(params.content_encoding); @@ -134,6 +199,77 @@ fn process_response_streaming( params.integration_registry, )?; + // Check if we have post-processors that need uncompressed HTML + let post_processors = params.integration_registry.html_post_processors(); + let needs_post_processing = !post_processors.is_empty(); + + // If we have post-processors, output uncompressed HTML so they can work with it, + // then compress only once at the end. This avoids double decompression/compression. + let output_compression = if needs_post_processing { + Compression::None + } else { + compression + }; + + let config = PipelineConfig { + input_compression: compression, + output_compression, + chunk_size: 8192, + }; + + let mut pipeline = StreamingPipeline::new(config, processor); + pipeline.process(body, &mut output)?; + + // Post-process HTML through registered integration post-processors. + // This handles cross-script T-chunks for RSC and other integration-specific + // processing that requires the complete HTML document. + log::info!( + "HTML post-processors: count={}, output_len={}, needs_post_processing={}", + post_processors.len(), + output.len(), + needs_post_processing + ); + if needs_post_processing { + // Output is already uncompressed, convert to string for post-processing + if let Ok(html) = std::str::from_utf8(&output) { + log::info!( + "NextJs post-processor called with {} bytes of HTML", + html.len() + ); + let ctx = IntegrationHtmlContext { + request_host: params.request_host, + request_scheme: params.request_scheme, + origin_host: params.origin_host, + }; + let mut processed = html.to_string(); + for processor in post_processors { + processed = processor.post_process(&processed, &ctx); + } + + // Now compress if original content was compressed + if compression != Compression::None { + output = compress_data(processed.as_bytes(), compression)?; + } else { + output = processed.into_bytes(); + } + } else { + log::warn!("HTML post-processing skipped: content is not valid UTF-8"); + // If not valid UTF-8, recompress the output as-is + if compression != Compression::None { + output = compress_data(&output, compression)?; + } + } + } + } else if is_rsc_flight { + // RSC Flight responses are length-prefixed (T rows). A naive string replacement will + // corrupt the stream by changing byte lengths without updating the prefixes. + let processor = RscFlightUrlRewriter::new( + params.origin_host, + params.origin_url, + params.request_host, + params.request_scheme, + ); + let config = PipelineConfig { input_compression: compression, output_compression: compression, diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs new file mode 100644 index 0000000..82590e3 --- /dev/null +++ b/crates/common/src/rsc_flight.rs @@ -0,0 +1,359 @@ +use std::io; + +use crate::streaming_processor::StreamProcessor; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum RowState { + Id, + Tag, + Length, + ChunkByNewline, + ChunkByLength, +} + +/// Rewrites URLs inside a React Server Components (RSC) Flight stream. +/// +/// Next.js (App Router) uses `react-server-dom-webpack` ("Flight") for navigation responses +/// and for inlined `__next_f` data. The wire format is a sequence of rows: +/// - `:\n` (JSON terminated by `\n`) +/// - `:\n` (tagged, terminated by `\n`) +/// - `:T,` (tagged by `T`, length-delimited, **no trailing newline**) +/// +/// For `T` rows, the length prefix is the UTF-8 byte length of the content bytes. If we rewrite +/// URLs inside the content, we must recompute the length and rewrite the header. +pub struct RscFlightUrlRewriter { + origin_url: String, + origin_http_url: Option, + origin_host: String, + origin_protocol_relative: String, + request_url: String, + request_host: String, + request_protocol_relative: String, + + state: RowState, + row_id: Vec, + row_tag: Option, + declared_length: usize, + remaining_length: usize, + row_content: Vec, + raw_header: Vec, +} + +impl RscFlightUrlRewriter { + #[must_use] + pub fn new( + origin_host: &str, + origin_url: &str, + request_host: &str, + request_scheme: &str, + ) -> Self { + let request_url = format!("{request_scheme}://{request_host}"); + let origin_protocol_relative = format!("//{origin_host}"); + let request_protocol_relative = format!("//{request_host}"); + + let origin_http_url = origin_url + .strip_prefix("https://") + .map(|rest| format!("http://{rest}")); + + Self { + origin_url: origin_url.to_string(), + origin_http_url, + origin_host: origin_host.to_string(), + origin_protocol_relative, + request_url, + request_host: request_host.to_string(), + request_protocol_relative, + state: RowState::Id, + row_id: Vec::new(), + row_tag: None, + declared_length: 0, + remaining_length: 0, + row_content: Vec::new(), + raw_header: Vec::new(), + } + } + + fn reset_row(&mut self) { + self.state = RowState::Id; + self.row_id.clear(); + self.row_tag = None; + self.declared_length = 0; + self.remaining_length = 0; + self.row_content.clear(); + self.raw_header.clear(); + } + + fn rewrite_utf8_bytes(&self, bytes: &[u8]) -> Vec { + let Ok(text) = std::str::from_utf8(bytes) else { + return bytes.to_vec(); + }; + + if !text.contains(&self.origin_host) && !text.contains(&self.origin_url) { + if let Some(http_url) = &self.origin_http_url { + if !text.contains(http_url) { + return bytes.to_vec(); + } + } else { + return bytes.to_vec(); + } + } + + // Keep replacement semantics consistent with `create_url_replacer`. + let mut rewritten = text.replace(&self.origin_url, &self.request_url); + if let Some(http_url) = &self.origin_http_url { + rewritten = rewritten.replace(http_url, &self.request_url); + } + rewritten = rewritten.replace( + &self.origin_protocol_relative, + &self.request_protocol_relative, + ); + rewritten = rewritten.replace(&self.origin_host, &self.request_host); + + rewritten.into_bytes() + } + + fn finalize_newline_row(&mut self, out: &mut Vec) { + out.extend_from_slice(&self.row_id); + out.push(b':'); + if let Some(tag) = self.row_tag { + out.push(tag); + } + let rewritten = self.rewrite_utf8_bytes(&self.row_content); + out.extend_from_slice(&rewritten); + out.push(b'\n'); + self.reset_row(); + } + + fn finalize_length_row(&mut self, out: &mut Vec) { + let Some(tag) = self.row_tag else { + // Should never happen for length-delimited rows; fall back to passthrough. + out.extend_from_slice(&self.raw_header); + out.extend_from_slice(&self.row_content); + self.reset_row(); + return; + }; + + out.extend_from_slice(&self.row_id); + out.push(b':'); + out.push(tag); + + if tag == b'T' { + let rewritten = self.rewrite_utf8_bytes(&self.row_content); + let new_len = rewritten.len(); + out.extend_from_slice(format!("{new_len:x}").as_bytes()); + out.push(b','); + out.extend_from_slice(&rewritten); + } else { + // Length-delimited row type we don't transform (e.g., future/binary Flight types). + out.extend_from_slice(format!("{:x}", self.declared_length).as_bytes()); + out.push(b','); + out.extend_from_slice(&self.row_content); + } + + self.reset_row(); + } + + fn flush_partial_row(&mut self, out: &mut Vec) { + if self.raw_header.is_empty() && self.row_content.is_empty() { + return; + } + out.extend_from_slice(&self.raw_header); + out.extend_from_slice(&self.row_content); + self.reset_row(); + } +} + +impl StreamProcessor for RscFlightUrlRewriter { + fn process_chunk(&mut self, chunk: &[u8], is_last: bool) -> Result, io::Error> { + let mut out = Vec::with_capacity(chunk.len()); + let mut i = 0; + + while i < chunk.len() { + match self.state { + RowState::Id => { + let b = chunk[i]; + i += 1; + if b == b':' { + self.raw_header.push(b':'); + self.state = RowState::Tag; + } else { + self.row_id.push(b); + self.raw_header.push(b); + } + } + RowState::Tag => { + let b = chunk[i]; + i += 1; + + if b == b'T' || b == b'V' { + self.row_tag = Some(b); + self.raw_header.push(b); + self.state = RowState::Length; + self.declared_length = 0; + } else if b.is_ascii_uppercase() { + self.row_tag = Some(b); + self.raw_header.push(b); + self.state = RowState::ChunkByNewline; + } else { + // Not a recognized tag; treat as first byte of a JSON row. + self.row_tag = None; + self.row_content.push(b); + self.state = RowState::ChunkByNewline; + } + } + RowState::Length => { + let b = chunk[i]; + i += 1; + if b == b',' { + self.raw_header.push(b','); + self.remaining_length = self.declared_length; + self.state = RowState::ChunkByLength; + } else { + self.raw_header.push(b); + let digit = match b { + b'0'..=b'9' => (b - b'0') as usize, + b'a'..=b'f' => (b - b'a' + 10) as usize, + b'A'..=b'F' => (b - b'A' + 10) as usize, + _ => 0, + }; + self.declared_length = (self.declared_length << 4) | digit; + } + } + RowState::ChunkByNewline => { + let Some(pos) = chunk[i..].iter().position(|&b| b == b'\n') else { + self.row_content.extend_from_slice(&chunk[i..]); + break; + }; + let end = i + pos; + self.row_content.extend_from_slice(&chunk[i..end]); + i = end + 1; // Skip '\n' + self.finalize_newline_row(&mut out); + } + RowState::ChunkByLength => { + let available = chunk.len() - i; + let take = available.min(self.remaining_length); + self.row_content.extend_from_slice(&chunk[i..i + take]); + i += take; + self.remaining_length -= take; + + if self.remaining_length == 0 { + self.finalize_length_row(&mut out); + } + } + } + } + + if is_last { + self.flush_partial_row(&mut out); + } + + Ok(out) + } + + fn reset(&mut self) { + self.reset_row(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn run_rewriter( + rewriter: &mut RscFlightUrlRewriter, + input: &[u8], + chunk_size: usize, + ) -> Vec { + let mut output = Vec::new(); + let mut pos = 0; + while pos < input.len() { + let end = (pos + chunk_size).min(input.len()); + let chunk = &input[pos..end]; + let rewritten = rewriter + .process_chunk(chunk, false) + .expect("should process chunk"); + output.extend_from_slice(&rewritten); + pos = end; + } + + let tail = rewriter + .process_chunk(&[], true) + .expect("should process final chunk"); + output.extend_from_slice(&tail); + output + } + + #[test] + fn rewrites_newline_rows() { + let input = b"0:[\"https://origin.example.com/page\"]\n"; + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input, 8); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + assert_eq!( + output_str, "0:[\"https://proxy.example.com/page\"]\n", + "Output should rewrite URLs in newline rows" + ); + } + + #[test] + fn rewrites_t_rows_and_updates_length() { + let t_content = r#"{"url":"https://origin.example.com/page"}"#; + let json_row = "2:[\"ok\"]\n"; + let input = format!("1:T{:x},{}{}", t_content.len(), t_content, json_row); + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input.as_bytes(), 7); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; + let expected = format!( + "1:T{:x},{}{}", + rewritten_t_content.len(), + rewritten_t_content, + json_row + ); + + assert_eq!( + output_str, expected, + "Output should update T row lengths after rewriting" + ); + } + + #[test] + fn handles_t_row_header_and_body_split_across_chunks() { + let t_content = r#"{"url":"https://origin.example.com/page"}"#; + let input = format!("1:T{:x},{}", t_content.len(), t_content); + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + // Split such that the header ends before the comma and content begins in a later chunk. + let output = run_rewriter(&mut rewriter, input.as_bytes(), 3); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; + let expected = format!("1:T{:x},{}", rewritten_t_content.len(), rewritten_t_content,); + + assert_eq!( + output_str, expected, + "Rewriter should handle T rows split across chunks" + ); + } +} diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 25149d9..9c689ff 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -450,8 +450,8 @@ mod tests { assert_eq!(raw_nextjs["enabled"], json!(false)); assert_eq!( raw_nextjs["rewrite_attributes"], - json!(["href", "link", "url"]), - "Next.js rewrite attributes should default to href/link/url" + json!(["href", "link", "siteBaseUrl", "siteProductionDomain", "url"]), + "Next.js rewrite attributes should include href/link/siteBaseUrl/siteProductionDomain/url for RSC navigation" ); assert!(!settings.synthetic.counter_store.is_empty()); diff --git a/crates/common/src/streaming_processor.rs b/crates/common/src/streaming_processor.rs index 88d3a64..eb5d608 100644 --- a/crates/common/src/streaming_processor.rs +++ b/crates/common/src/streaming_processor.rs @@ -92,12 +92,17 @@ impl StreamingPipeline

{ ) { (Compression::None, Compression::None) => self.process_uncompressed(input, output), (Compression::Gzip, Compression::Gzip) => self.process_gzip_to_gzip(input, output), + (Compression::Gzip, Compression::None) => self.process_gzip_to_none(input, output), (Compression::Deflate, Compression::Deflate) => { self.process_deflate_to_deflate(input, output) } + (Compression::Deflate, Compression::None) => { + self.process_deflate_to_none(input, output) + } (Compression::Brotli, Compression::Brotli) => { self.process_brotli_to_brotli(input, output) } + (Compression::Brotli, Compression::None) => self.process_brotli_to_none(input, output), _ => Err(Report::new(TrustedServerError::Proxy { message: "Unsupported compression transformation".to_string(), })), @@ -206,6 +211,48 @@ impl StreamingPipeline

{ Ok(()) } + /// Process gzip compressed input to uncompressed output (decompression only) + fn process_gzip_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use flate2::read::GzDecoder; + + // Decompress input + let mut decoder = GzDecoder::new(input); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress gzip".to_string(), + })?; + + log::info!( + "[Gzip->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Gzip->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Process deflate compressed stream fn process_deflate_to_deflate( &mut self, @@ -222,6 +269,48 @@ impl StreamingPipeline

{ self.process_through_compression(decoder, encoder) } + /// Process deflate compressed input to uncompressed output (decompression only) + fn process_deflate_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use flate2::read::ZlibDecoder; + + // Decompress input + let mut decoder = ZlibDecoder::new(input); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress deflate".to_string(), + })?; + + log::info!( + "[Deflate->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Deflate->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Process brotli compressed stream fn process_brotli_to_brotli( &mut self, @@ -243,6 +332,48 @@ impl StreamingPipeline

{ self.process_through_compression(decoder, encoder) } + /// Process brotli compressed input to uncompressed output (decompression only) + fn process_brotli_to_none( + &mut self, + input: R, + mut output: W, + ) -> Result<(), Report> { + use brotli::Decompressor; + + // Decompress input + let mut decoder = Decompressor::new(input, 4096); + let mut decompressed = Vec::new(); + decoder + .read_to_end(&mut decompressed) + .change_context(TrustedServerError::Proxy { + message: "Failed to decompress brotli".to_string(), + })?; + + log::info!( + "[Brotli->None] Decompressed size: {} bytes", + decompressed.len() + ); + + // Process the decompressed content + let processed = self + .processor + .process_chunk(&decompressed, true) + .change_context(TrustedServerError::Proxy { + message: "Failed to process content".to_string(), + })?; + + log::info!("[Brotli->None] Processed size: {} bytes", processed.len()); + + // Write uncompressed output + output + .write_all(&processed) + .change_context(TrustedServerError::Proxy { + message: "Failed to write output".to_string(), + })?; + + Ok(()) + } + /// Generic processing through compression layers fn process_through_compression( &mut self, diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md new file mode 100644 index 0000000..9dfb8aa --- /dev/null +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -0,0 +1,768 @@ +# RSC Hydration URL Rewriting: Technical Findings + +## Problem Statement + +When proxying Next.js App Router sites, URL rewriting in RSC (React Server Components) payloads caused React hydration to fail. The symptom was 0 React fiber nodes after page load, indicating complete hydration failure. + +## Background: How Next.js Delivers RSC Data + +Next.js App Router uses React Server Components with a streaming "flight" protocol. RSC data is delivered to the browser via inline ` + + +``` + +The `[1, "..."]` calls contain the actual RSC payload as a JavaScript string. + +For client-side navigations, Next.js fetches Flight directly (no ` + + + +``` + +This happens because Next.js streams RSC data as it becomes available. The T-chunk header in script 10 declares 928 bytes (0x928 = 2344 decimal), but those bytes are delivered in script 11. + +### Real-World Example + +Analysis of a Next.js App Router site revealed the following cross-script pattern: + +``` +Script 59 (index 58): +- T-chunk header at position 1370: "436:T68f," +- Declares 0x68f = 1679 bytes of content +- Content starts but script ends before all bytes are delivered + +Script 60 (index 59): +- Contains continuation of T-chunk content +- Includes 5 URLs pointing to the origin host that need rewriting +- URLs at byte positions within the T-chunk span +``` + +When the Rust implementation processed each script independently: +- Script 59: T-chunk header found, but `content_end = header_end` (0 bytes in THIS script) +- Script 60: Content processed, but no T-chunk header to update + +Result: T-chunk length remained at 0x68f while actual content changed size after URL rewriting. + +## Discovery 2: Combining Push Calls Breaks Hydration + +```javascript +// Original: 221 push calls -> 683 fibers (works) +// Combined into 1 push call: 0 fibers (broken!) +``` + +Even with identical content, consolidating all RSC into a single push call broke hydration. Next.js processes each push call incrementally, and the structure matters. + +## Discovery 3: Per-Script Streaming Processing Cannot Fix Cross-Script T-Chunks + +The streaming HTML processor (`lol_html`) processes scripts one at a time: + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ HTML Stream │ +│ │ +│ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ Process A Process B Process C │ +│ (isolated) (isolated) (isolated) │ +│ │ +│ Cannot share state between script processing! │ +└─────────────────────────────────────────────────────────────────┘ +``` + +This is a fundamental limitation: when script A declares a T-chunk that continues in script B, the streaming processor cannot: +1. Track that script A's T-chunk is incomplete +2. Update script A's header after processing script B's URLs + +--- + +## The Solution: Two-Phase Processing + +### Phase 1: Streaming HTML Processing (per-script) + +The streaming processor handles scripts that are self-contained: +- Extracts RSC payload from `self.__next_f.push([1, '...'])` +- Finds T-chunks within the single script +- Rewrites URLs and recalculates lengths +- Works correctly for ~95% of scripts + +### Phase 2: Post-Processing (cross-script) + +After streaming completes, a post-processor handles cross-script T-chunks: +1. **Finds all RSC push scripts** in the complete HTML +2. **Combines their payloads** with markers +3. **Processes T-chunks across the combined content**, skipping markers when counting bytes +4. **Rewrites URLs and recalculates lengths** for the combined content +5. **Splits back on markers** to get individual rewritten payloads +6. **Rebuilds the HTML** with rewritten scripts + +### Marker-Based Cross-Script Processing + +#### Step 1: Combine Scripts with Markers + +```rust +const RSC_MARKER: &str = "\x00SPLIT\x00"; + +// Combine all payloads +let mut combined = payloads[0].to_string(); +for payload in &payloads[1..] { + combined.push_str(RSC_MARKER); + combined.push_str(payload); +} +// Result: "11:null\n1a:T928,\x00SPLIT\x00...2344 bytes..." +``` + +The marker `\x00SPLIT\x00` is chosen because: +- Contains null byte (`\x00`) which cannot appear in valid JSON/RSC content +- Easily identifiable for splitting +- Won't be confused with any escape sequence + +#### Step 2: Find T-Chunks Across Combined Content + +```rust +fn find_tchunks_with_markers(content: &str) -> Vec { + let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); + + for each match: + // Parse header: id, hex_length + // Consume declared bytes, SKIPPING markers + let (content_end, _) = consume_unescaped_bytes_skip_markers( + content, header_end, declared_length + ); +} +``` + +The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. + +#### Step 3: Rewrite URLs and Recalculate Lengths + +```rust +for chunk in &chunks { + // Extract T-chunk content (may contain markers) + let chunk_content = &combined[chunk.header_end..chunk.content_end]; + + // Rewrite URLs (preserves markers) + let rewritten_content = rewrite_rsc_url_string(chunk_content, ...); + + // Calculate new byte length (excluding markers) + let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); + let new_length_hex = format!("{:x}", new_length); + + // Write new T-chunk header and content + result.push_str(&chunk.id); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); +} +``` + +#### Step 4: Split Back on Markers + +```rust +// Split on markers to get individual payloads +result.split(RSC_MARKER).map(|s| s.to_string()).collect() +``` + +Each resulting payload corresponds to one original script, but with: +- URLs rewritten +- T-chunk lengths correctly recalculated across script boundaries + +--- + +## Integration Hook Architecture + +The post-processing is implemented as an integration hook, allowing other integrations to also perform HTML post-processing. + +### Trait Definition + +```rust +/// Context for HTML post-processors. +pub struct IntegrationHtmlContext<'a> { + pub request_host: &'a str, + pub request_scheme: &'a str, + pub origin_host: &'a str, +} + +/// Trait for integration-provided HTML post-processors. +/// These run after streaming HTML processing to handle cases that require +/// access to the complete HTML (e.g., cross-script RSC T-chunks). +pub trait IntegrationHtmlPostProcessor: Send + Sync { + /// Identifier for logging/diagnostics. + fn integration_id(&self) -> &'static str; + + /// Post-process complete HTML content. + fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String; +} +``` + +### Registration + +```rust +// In nextjs.rs +pub fn register(settings: &Settings) -> Option { + let config = build(settings)?; + + let structured = Arc::new(NextJsScriptRewriter::new(config.clone(), NextJsRewriteMode::Structured)); + let streamed = Arc::new(NextJsScriptRewriter::new(config.clone(), NextJsRewriteMode::Streamed)); + let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); + + Some( + IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) + .with_script_rewriter(structured) + .with_script_rewriter(streamed) + .with_html_post_processor(post_processor) // <-- Post-processor hook + .build(), + ) +} +``` + +### Execution in Publisher + +```rust +// In publisher.rs - process_response_streaming() + +// Phase 1: Streaming HTML processing +let mut pipeline = StreamingPipeline::new(config, processor); +pipeline.process(body, &mut output)?; + +// Phase 2: Post-processing via integration hooks +let post_processors = params.integration_registry.html_post_processors(); +if !post_processors.is_empty() { + if let Ok(html) = std::str::from_utf8(&output) { + let ctx = IntegrationHtmlContext { + request_host: params.request_host, + request_scheme: params.request_scheme, + origin_host: params.origin_host, + }; + let mut processed = html.to_string(); + for processor in post_processors { + processed = processor.post_process(&processed, &ctx); + } + output = processed.into_bytes(); + } +} +``` + +--- + +## Byte Length Calculation Algorithm + +To correctly calculate unescaped byte length: + +```rust +fn calculate_unescaped_byte_length(s: &str) -> usize { + let bytes = s.as_bytes(); + let mut result = 0; + let mut i = 0; + + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + let esc = bytes[i + 1]; + + // Simple escape sequences: \n, \r, \t, \b, \f, \v, \", \', \\, \/ + if matches!(esc, b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/') { + result += 1; + i += 2; + continue; + } + + // \xHH - hex escape (1 byte) + if esc == b'x' && i + 3 < bytes.len() { + result += 1; + i += 4; + continue; + } + + // \uHHHH - unicode escape + if esc == b'u' && i + 5 < bytes.len() { + let hex = &s[i + 2..i + 6]; + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + // Check for surrogate pair + if is_high_surrogate(code_unit) && has_low_surrogate_at(s, i + 6) { + result += 4; // Surrogate pair = 4 UTF-8 bytes + i += 12; + continue; + } + // Single unicode escape - calculate UTF-8 byte length + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + result += c.len_utf8(); + i += 6; + continue; + } + } + } + + // Regular character - count its UTF-8 byte length + if bytes[i] < 0x80 { + result += 1; + i += 1; + } else { + let c = s[i..].chars().next().unwrap_or('\u{FFFD}'); + result += c.len_utf8(); + i += c.len_utf8(); + } + } + + result +} +``` + +### Marker-Aware Variant + +```rust +fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { + let without_markers = s.replace(RSC_MARKER, ""); + calculate_unescaped_byte_length(&without_markers) +} +``` + +--- + +## URL Rewriting Patterns + +The solution handles multiple URL formats in RSC content: + +| Pattern | Example | In RSC String | +|---------|---------|---------------| +| Full HTTPS | `https://host/path` | `https://host/path` | +| Full HTTP | `http://host/path` | `http://host/path` | +| Protocol-relative | `//host/path` | `//host/path` | +| JSON-escaped slashes | `//host/path` | `\\/\\/host/path` | +| Double-escaped | `\\/\\/host` | `\\\\/\\\\/host` | +| Quad-escaped | `\\\\/\\\\/host` | `\\\\\\\\//host` | + +### Regex Pattern + +```rust +let pattern = Regex::new(&format!( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, + escaped_origin +)).unwrap(); +``` + +This pattern handles: +- Optional scheme (`https?`)? +- Optional colon (`:`)? +- Multiple escape levels for slashes +- The escaped origin hostname + +--- + +## Complete Processing Flow + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ HTML Response from Origin │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 1: Streaming HTML Processing │ +│ │ +│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ +│ │ Process Script 1 │ │ Process Script 2 │ │ Process Script N │ │ +│ │ │ │ │ │ │ │ +│ │ - Extract payload│ │ - Extract payload│ │ - Extract payload│ │ +│ │ - Find T-chunks │ │ - Find T-chunks │ │ - Find T-chunks │ │ +│ │ - Rewrite URLs │ │ - Rewrite URLs │ │ - Rewrite URLs │ │ +│ │ - Update lengths │ │ - Update lengths │ │ - Update lengths │ │ +│ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ +│ │ +│ Works for self-contained T-chunks, but cross-script T-chunks may have │ +│ incorrect lengths at this point. │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ PHASE 2: HTML Post-Processing │ +│ (Integration Hook: NextJsHtmlPostProcessor) │ +│ │ +│ 1. Find all RSC push scripts in complete HTML │ +│ │ +│ 2. Extract payloads and combine with markers: │ +│ "payload1\x00SPLIT\x00payload2\x00SPLIT\x00payload3..." │ +│ │ +│ 3. Find T-chunks across combined content (markers don't count as bytes) │ +│ │ +│ 4. For each T-chunk: │ +│ - Extract content (may span markers) │ +│ - Rewrite URLs │ +│ - Calculate new byte length (excluding markers) │ +│ - Write new header: ID:T, │ +│ │ +│ 5. Split on markers to get individual payloads │ +│ │ +│ 6. Rebuild HTML with corrected scripts │ +└─────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────────┐ +│ Final HTML Response to Client │ +│ │ +│ - All URLs rewritten to proxy host │ +│ - All T-chunk lengths correctly reflect content after URL rewriting │ +│ - Script structure preserved (same number of push calls) │ +│ - React hydration succeeds │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Test Results + +| Test Case | Result | +|-----------|--------| +| T-chunk length shrinks (longer origin → shorter proxy) | Pass | +| T-chunk length grows (shorter origin → longer proxy) | Pass | +| Multiple T-chunks in same content | Pass | +| Escape sequences: `\n`, `\r`, `\t`, `\\`, `\"` | Pass | +| Unicode escapes: `\uHHHH` | Pass | +| Surrogate pairs: `\uD800\uDC00` | Pass | +| Hex escapes: `\xHH` | Pass | +| Various URL patterns (escaped slashes, etc.) | Pass | +| Cross-script T-chunk (header in script N, content in N+1) | Pass | +| Cross-script with multiple URLs in continuation | Pass | +| Non-T-chunk content preserved | Pass | +| HTML structure preserved after post-processing | Pass | + +### Comparison: JS v7 vs JS v8 vs Rust + +| Implementation | Approach | Fiber Count | Result | +|----------------|----------|-------------|--------| +| JS v7 | Per-script T-chunk rewriting | 0 | FAIL | +| JS v8 | Marker-based cross-script | 683 | PASS | +| Rust (final) | Two-phase with post-processor | 683 | PASS | + +### Playwright Browser Testing (December 2024) + +Automated testing with Playwright across Chrome and Firefox verified the implementation: + +**Test Setup:** +- Fetched live HTML from a Next.js App Router site +- Applied RSC URL rewriting via the Rust post-processor +- Served rewritten HTML locally to isolate from bot detection + +**Results (both Chrome and Firefox):** + +| Metric | Value | +|--------|-------| +| Hydration errors detected | 0 | +| Console errors (hydration-related) | 0 | +| Total links in page | 120 | +| Links rewritten to proxy | 120 | +| Links still pointing to origin | 0 | +| RSC push scripts present | Yes | +| `self.__next_f` entries | 223 | +| `__next` root element | Present | + +**Key Observations:** +1. **No hydration mismatch**: React successfully hydrated without any "Text content does not match" or "Hydration failed" errors +2. **Complete URL rewriting**: All 120 navigation links correctly point to the proxy host +3. **RSC data preserved**: All 223 RSC Flight entries present in `self.__next_f` array +4. **Cross-browser compatibility**: Identical behavior in Chrome (Chromium) and Firefox + +--- + +## Decompression Pipeline for Post-Processing + +The post-processor requires access to uncompressed HTML. Since origin responses are typically gzip or brotli compressed, the streaming pipeline was extended to support decompression-only mode. + +### The Problem + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Original Flow (without post-processing): │ +│ │ +│ Gzip In → Decompress → Process HTML → Recompress → Gzip Out │ +│ │ +│ With post-processing, we need uncompressed output: │ +│ │ +│ Gzip In → Decompress → Process HTML → ??? → Post-Process │ +│ │ +│ If we recompress, post-processor gets garbage (compressed bytes)│ +└─────────────────────────────────────────────────────────────────┘ +``` + +### Solution: Decompression-Only Pipeline Modes + +Added new pipeline transformation modes that decompress without recompressing: +- `process_gzip_to_none()` at [streaming_processor.rs:215](crates/common/src/streaming_processor.rs#L215) +- `process_deflate_to_none()` at [streaming_processor.rs:273](crates/common/src/streaming_processor.rs#L273) +- `process_brotli_to_none()` at [streaming_processor.rs:336](crates/common/src/streaming_processor.rs#L336) + +### Publisher Flow with Post-Processing + +The post-processing flow in publisher.rs: +1. Get post-processors from the integration registry +2. If post-processors exist, output uncompressed HTML (decompression-only mode) +3. Run streaming HTML processing +4. Apply each post-processor to the uncompressed HTML +5. Recompress once at the end to match original Content-Encoding + +**Implementation:** Post-processing logic at [publisher.rs:203](crates/common/src/publisher.rs#L203) + +### Benefits + +1. **Single compression pass**: Avoids decompress → recompress → decompress → recompress cycle +2. **Valid UTF-8 for post-processor**: Post-processor receives actual HTML, not compressed bytes +3. **Preserves original compression**: Final output matches original Content-Encoding + +--- + +## Implementation Files + +| File | Purpose | +|------|---------| +| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | +| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/publisher.rs` | Post-processor invocation, decompression flow | +| `crates/common/src/streaming_processor.rs` | Decompression-only pipeline modes | + +### Key Functions in nextjs.rs + +| Function | Line | Purpose | +|----------|------|---------| +| `extract_rsc_push_payload` | [232](crates/common/src/integrations/nextjs.rs#L232) | Extract string from `self.__next_f.push([1, '...'])` | +| `calculate_unescaped_byte_length` | [609](crates/common/src/integrations/nextjs.rs#L609) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [686](crates/common/src/integrations/nextjs.rs#L686) | Advance through string consuming N bytes | +| `find_tchunks` | [767](crates/common/src/integrations/nextjs.rs#L767) | Find T-chunks in single script | +| `rewrite_rsc_url_string` | [803](crates/common/src/integrations/nextjs.rs#L803) | URL rewriting with escape handling | +| `rewrite_rsc_tchunks` | [833](crates/common/src/integrations/nextjs.rs#L833) | Single-script T-chunk processing | +| `consume_unescaped_bytes_skip_markers` | [910](crates/common/src/integrations/nextjs.rs#L910) | Advance through string, skipping markers | +| `calculate_unescaped_byte_length_skip_markers` | [991](crates/common/src/integrations/nextjs.rs#L991) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [1005](crates/common/src/integrations/nextjs.rs#L1005) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [1056](crates/common/src/integrations/nextjs.rs#L1056) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [1165](crates/common/src/integrations/nextjs.rs#L1165) | Find all RSC scripts in HTML | +| `post_process_rsc_html` | [1245](crates/common/src/integrations/nextjs.rs#L1245) | Complete HTML post-processing | + +--- + +## Limitations + +### Very Long Proxy URLs + +If the proxy URL is significantly longer than the original, T-chunk content grows substantially. This is handled correctly (the hex length is recalculated), but it may affect: +- Response size +- Streaming behavior if scripts become much larger + +### Performance Considerations + +The post-processing phase requires: +1. Parsing complete HTML to find scripts (O(n) string scan) +2. Combining payloads (memory allocation) +3. Regex matching for T-chunks +4. String rebuilding + +For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. + +### Edge Cases Not Handled + +- Malformed RSC content (missing closing quotes, invalid hex) +- Nested script tags (shouldn't occur in valid HTML) +- Non-UTF8 encoded pages (requires UTF-8) + +--- + +## Deconstruction and Reconstruction Logic + +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html()` at [nextjs.rs:1245](crates/common/src/integrations/nextjs.rs#L1245). + +### Step 1: Find RSC Push Scripts + +Find all `self.__next_f.push([1, "..."])` scripts in the HTML and extract their payloads. + +**Implementation:** `find_rsc_push_scripts()` at [nextjs.rs:1165](crates/common/src/integrations/nextjs.rs#L1165) + +### Step 2: Combine Payloads with Markers + +Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. + +**Implementation:** Marker constant at [nextjs.rs:906](crates/common/src/integrations/nextjs.rs#L906), combining logic in `rewrite_rsc_scripts_combined()` at [nextjs.rs:1056](crates/common/src/integrations/nextjs.rs#L1056) + +### Step 3: Find T-Chunks Across Combined Content + +Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. + +**Implementation:** `find_tchunks_with_markers()` at [nextjs.rs:1005](crates/common/src/integrations/nextjs.rs#L1005), using `consume_unescaped_bytes_skip_markers()` at [nextjs.rs:910](crates/common/src/integrations/nextjs.rs#L910) + +### Step 4: Rewrite URLs in T-Chunk Content + +Rewrite all URL patterns in the T-chunk content: +- `https://origin.example.com/path` → `http://proxy.example.com/path` +- `//origin.example.com/path` → `//proxy.example.com/path` +- `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) +- `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) + +**Implementation:** `rewrite_rsc_url_string()` at [nextjs.rs:803](crates/common/src/integrations/nextjs.rs#L803) + +### Step 5: Recalculate T-Chunk Length + +Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. + +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [nextjs.rs:991](crates/common/src/integrations/nextjs.rs#L991) + +### Step 6: Split Back on Markers + +Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. + +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [nextjs.rs:1056](crates/common/src/integrations/nextjs.rs#L1056) + +### Step 7: Reconstruct HTML + +Replace each original script with its rewritten version in the HTML. + +**Implementation:** Part of `post_process_rsc_html()` at [nextjs.rs:1245](crates/common/src/integrations/nextjs.rs#L1245) + +### Visual Example + +``` +BEFORE (2 scripts, T-chunk spans both): +┌──────────────────────────────────────────────────────────────────┐ +│ Script 1: self.__next_f.push([1,"11:null\n1a:T68f,"]) │ +│ └─ T-chunk header: 1a:T68f (1679 bytes declared) │ +├──────────────────────────────────────────────────────────────────┤ +│ Script 2: self.__next_f.push([1,"{\"url\":\"https://origin...."])│ +│ └─ T-chunk content continues here (1679 bytes total) │ +└──────────────────────────────────────────────────────────────────┘ + +COMBINED (with marker): +"11:null\n1a:T68f,\x00SPLIT\x00{\"url\":\"https://origin.example.com/...\"}" + ^^^^^^^^^^ marker (not counted in byte length) + +AFTER URL REWRITE: +"11:null\n1a:T652,\x00SPLIT\x00{\"url\":\"http://proxy.example.com/...\"}" + ^^^ new hex length (shorter URL = smaller length) + +SPLIT BACK: +┌──────────────────────────────────────────────────────────────────┐ +│ Script 1: self.__next_f.push([1,"11:null\n1a:T652,"]) │ +│ └─ Updated T-chunk header with correct length │ +├──────────────────────────────────────────────────────────────────┤ +│ Script 2: self.__next_f.push([1,"{\"url\":\"http://proxy.exa..."])│ +│ └─ Rewritten URLs in content │ +└──────────────────────────────────────────────────────────────────┘ +``` + +--- + +## Comparison: Old vs New Approach + +| Aspect | Old (Whitespace Padding) | New (T-Chunk Length Recalculation) | +|--------|--------------------------|-------------------------------------| +| T-chunk handling | Broken - lengths not updated | Correct - lengths recalculated | +| URL length change | Limited to shorter URLs | Any length change supported | +| Escape sequences | Not properly counted | Fully supported | +| Cross-script T-chunks | Not handled | Handled via post-processing | +| Implementation | Simple regex replace | Full T-chunk parsing + post-processing | +| Architecture | Hardcoded in processor | Integration hook pattern | +| Extensibility | None | Other integrations can add post-processors | + +--- + +## Conclusion + +RSC hydration requires **correct T-chunk byte lengths**. The solution involves two phases: + +### Phase 1: Streaming (per-script) +- Process each script as it arrives +- Handle self-contained T-chunks +- ~95% of T-chunks are handled here + +### Phase 2: Post-Processing (cross-script) +- After streaming completes +- Combine scripts with markers +- Recalculate T-chunk lengths across boundaries +- Handles the remaining ~5% edge cases + +The key insights are: +1. **T-chunk lengths must match content**: The RSC parser uses declared lengths to navigate +2. **T-chunks can span scripts**: Next.js streaming splits content arbitrarily +3. **Markers enable cross-script processing**: Combine, process, split back +4. **Integration hooks enable extensibility**: Other integrations can add post-processors + +--- + +## References + +- React Flight Protocol: Internal React implementation for RSC streaming +- Next.js App Router: https://nextjs.org/docs/app +- lol_html: https://github.com/nicksrandall/lol-html (streaming HTML rewriter) +- Implementation: `crates/common/src/integrations/nextjs.rs` From 807f8451a0848f3ed54a6dc642cd458e135e885e Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:26:36 -0800 Subject: [PATCH 02/11] Rewrite additional NextJS attributes --- trusted-server.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/trusted-server.toml b/trusted-server.toml index 6da5e63..caf7e65 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -44,7 +44,7 @@ debug = false [integrations.nextjs] enabled = false -rewrite_attributes = ["href", "link", "url"] +rewrite_attributes = ["href", "link", "siteBaseUrl", "siteProductionDomain", "url"] [integrations.testlight] endpoint = "https://testlight.example/openrtb2/auction" From 1fde992b96216d4b09ff8f76dd2184a4b104a4f3 Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:56:51 -0800 Subject: [PATCH 03/11] Refactor integration for post processing --- crates/common/src/html_processor.rs | 80 +++- crates/common/src/integrations/nextjs.rs | 57 ++- crates/common/src/integrations/registry.rs | 13 +- crates/common/src/publisher.rs | 112 +---- docs/RSC_HYDRATION_FINDINGS.md | 511 +++++++-------------- docs/integration_guide.md | 22 +- 6 files changed, 301 insertions(+), 494 deletions(-) diff --git a/crates/common/src/html_processor.rs b/crates/common/src/html_processor.rs index a0ac143..cff0ccd 100644 --- a/crates/common/src/html_processor.rs +++ b/crates/common/src/html_processor.rs @@ -2,18 +2,84 @@ //! //! This module provides a StreamProcessor implementation for HTML content. use std::cell::Cell; +use std::io; use std::rc::Rc; +use std::sync::Arc; use lol_html::{element, html_content::ContentType, text, Settings as RewriterSettings}; use crate::integrations::{ - AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationRegistry, - IntegrationScriptContext, ScriptRewriteAction, + AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationHtmlContext, + IntegrationHtmlPostProcessor, IntegrationRegistry, IntegrationScriptContext, + ScriptRewriteAction, }; use crate::settings::Settings; use crate::streaming_processor::{HtmlRewriterAdapter, StreamProcessor}; use crate::tsjs; +struct HtmlWithPostProcessing { + inner: HtmlRewriterAdapter, + post_processors: Vec>, + origin_host: String, + request_host: String, + request_scheme: String, +} + +impl StreamProcessor for HtmlWithPostProcessing { + fn process_chunk(&mut self, chunk: &[u8], is_last: bool) -> Result, io::Error> { + let output = self.inner.process_chunk(chunk, is_last)?; + if !is_last || output.is_empty() || self.post_processors.is_empty() { + return Ok(output); + } + + let Ok(output_str) = std::str::from_utf8(&output) else { + return Ok(output); + }; + + let ctx = IntegrationHtmlContext { + request_host: &self.request_host, + request_scheme: &self.request_scheme, + origin_host: &self.origin_host, + }; + + // Preflight to avoid allocating a `String` unless at least one post-processor wants to run. + if !self + .post_processors + .iter() + .any(|p| p.should_process(output_str, &ctx)) + { + return Ok(output); + } + + let mut html = String::from_utf8(output).map_err(|e| { + io::Error::other(format!( + "HTML post-processing expected valid UTF-8 output: {e}" + )) + })?; + + let mut changed = false; + for processor in &self.post_processors { + if processor.should_process(&html, &ctx) { + changed |= processor.post_process(&mut html, &ctx); + } + } + + if changed { + log::info!( + "HTML post-processing complete: origin_host={}, output_len={}", + self.origin_host, + html.len() + ); + } + + Ok(html.into_bytes()) + } + + fn reset(&mut self) { + self.inner.reset(); + } +} + /// Configuration for HTML processing #[derive(Clone)] pub struct HtmlProcessorConfig { @@ -43,6 +109,8 @@ impl HtmlProcessorConfig { /// Create an HTML processor with URL replacement and optional Prebid injection pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcessor { + let post_processors = config.integrations.html_post_processors(); + // Simplified URL patterns structure - stores only core data and generates variants on-demand struct UrlPatterns { origin_host: String, @@ -343,7 +411,13 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso ..RewriterSettings::default() }; - HtmlRewriterAdapter::new(rewriter_settings) + HtmlWithPostProcessing { + inner: HtmlRewriterAdapter::new(rewriter_settings), + post_processors, + origin_host: config.origin_host, + request_host: config.request_host, + request_scheme: config.request_scheme, + } } #[cfg(test)] diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs index 5527022..29522cc 100644 --- a/crates/common/src/integrations/nextjs.rs +++ b/crates/common/src/integrations/nextjs.rs @@ -95,43 +95,40 @@ impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { NEXTJS_INTEGRATION_ID } - fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String { - log::info!( - "NextJs post-processor called: enabled={}, rewrite_attributes={:?}, html_len={}, origin={}, proxy={}://{}", - self.config.enabled, - self.config.rewrite_attributes, - html.len(), - ctx.origin_host, - ctx.request_scheme, - ctx.request_host - ); - + fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { if !self.config.enabled || self.config.rewrite_attributes.is_empty() { - log::info!("NextJs post-processor skipped (disabled or no attributes)"); - return html.to_string(); + return false; + } + + // Only Next.js App Router pages will contain `__next_f` pushes. + // Also require an origin host hit to avoid running on already-rewritten pages. + html.contains("__next_f.push") && html.contains(ctx.origin_host) + } + + fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { + if !self.should_process(html, ctx) { + return false; } - // Count origin URLs before let origin_before = html.matches(ctx.origin_host).count(); log::info!( - "NextJs post-processor: {} origin URLs before rewrite", - origin_before + "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", + html.len(), + origin_before, + ctx.origin_host, + ctx.request_scheme, + ctx.request_host ); let result = post_process_rsc_html(html, ctx.origin_host, ctx.request_host, ctx.request_scheme); - // Count after - let origin_after = result.matches(ctx.origin_host).count(); - let proxy_after = result.matches(ctx.request_host).count(); - log::info!( - "NextJs post-processor complete: input_len={}, output_len={}, origin_remaining={}, proxy_urls={}", - html.len(), - result.len(), - origin_after, - proxy_after - ); - result + if result == *html { + return false; + } + + *html = result; + true } } @@ -291,14 +288,14 @@ impl IntegrationScriptRewriter for NextJsScriptRewriter { match self.mode { NextJsRewriteMode::Structured => self.rewrite_structured(content, ctx), NextJsRewriteMode::Streamed => { - // RSC push scripts (self.__next_f.push) are handled by the post-processor + // RSC push scripts (__next_f.push) are handled by the post-processor // because T-chunks can span multiple scripts and require combined processing. // Only handle non-RSC scripts here. - if content.contains("self.__next_f.push") { + if content.contains("__next_f.push") { return ScriptRewriteAction::keep(); } // For other __next_f scripts (like initialization), use simple URL rewriting - if content.contains("self.__next_f") { + if content.contains("__next_f") { return self.rewrite_streamed(content, ctx); } ScriptRewriteAction::keep() diff --git a/crates/common/src/integrations/registry.rs b/crates/common/src/integrations/registry.rs index a6cb984..b029d6b 100644 --- a/crates/common/src/integrations/registry.rs +++ b/crates/common/src/integrations/registry.rs @@ -264,10 +264,19 @@ pub trait IntegrationHtmlPostProcessor: Send + Sync { /// Identifier for logging/diagnostics. fn integration_id(&self) -> &'static str; + /// Fast preflight check to decide whether post-processing should run for this document. + /// + /// Implementations should keep this cheap (e.g., a substring check) because it may run on + /// every HTML response when the integration is enabled. + fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { + let _ = (html, ctx); + true + } + /// Post-process complete HTML content. /// This is called after streaming HTML processing with the complete HTML. - /// Return the modified HTML or the original if no changes needed. - fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String; + /// Implementations should mutate `html` in-place and return `true` when changes were made. + fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool; } /// Registration payload returned by integration builders. diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index d0ce6e8..cd14514 100644 --- a/crates/common/src/publisher.rs +++ b/crates/common/src/publisher.rs @@ -1,7 +1,6 @@ use error_stack::{Report, ResultExt}; use fastly::http::{header, StatusCode}; use fastly::{Body, Request, Response}; -use std::io::Write; use crate::backend::ensure_backend_from_url; use crate::http_util::serve_static_with_etag; @@ -9,67 +8,13 @@ use crate::http_util::serve_static_with_etag; use crate::constants::{HEADER_SYNTHETIC_TRUSTED_SERVER, HEADER_X_COMPRESS_HINT}; use crate::cookies::create_synthetic_cookie; use crate::error::TrustedServerError; -use crate::integrations::{IntegrationHtmlContext, IntegrationRegistry}; +use crate::integrations::IntegrationRegistry; use crate::rsc_flight::RscFlightUrlRewriter; use crate::settings::Settings; use crate::streaming_processor::{Compression, PipelineConfig, StreamProcessor, StreamingPipeline}; use crate::streaming_replacer::create_url_replacer; use crate::synthetic::get_or_generate_synthetic_id; -/// Compress data using the specified compression algorithm -fn compress_data( - data: &[u8], - compression: Compression, -) -> Result, Report> { - match compression { - Compression::None => Ok(data.to_vec()), - Compression::Gzip => { - use flate2::write::GzEncoder; - use flate2::Compression as GzCompression; - let mut encoder = GzEncoder::new(Vec::new(), GzCompression::default()); - encoder - .write_all(data) - .change_context(TrustedServerError::Proxy { - message: "Failed to gzip compress data".to_string(), - })?; - encoder.finish().change_context(TrustedServerError::Proxy { - message: "Failed to finish gzip compression".to_string(), - }) - } - Compression::Deflate => { - use flate2::write::ZlibEncoder; - use flate2::Compression as ZlibCompression; - let mut encoder = ZlibEncoder::new(Vec::new(), ZlibCompression::default()); - encoder - .write_all(data) - .change_context(TrustedServerError::Proxy { - message: "Failed to deflate compress data".to_string(), - })?; - encoder.finish().change_context(TrustedServerError::Proxy { - message: "Failed to finish deflate compression".to_string(), - }) - } - Compression::Brotli => { - use brotli::enc::writer::CompressorWriter; - use brotli::enc::BrotliEncoderParams; - let params = BrotliEncoderParams { - quality: 4, // Balance speed and compression - ..Default::default() - }; - let mut output = Vec::new(); - { - let mut writer = CompressorWriter::with_params(&mut output, 4096, ¶ms); - writer - .write_all(data) - .change_context(TrustedServerError::Proxy { - message: "Failed to brotli compress data".to_string(), - })?; - } - Ok(output) - } - } -} - /// Detects the request scheme (HTTP or HTTPS) using Fastly SDK methods and headers. /// /// Tries multiple methods in order of reliability: @@ -199,67 +144,14 @@ fn process_response_streaming( params.integration_registry, )?; - // Check if we have post-processors that need uncompressed HTML - let post_processors = params.integration_registry.html_post_processors(); - let needs_post_processing = !post_processors.is_empty(); - - // If we have post-processors, output uncompressed HTML so they can work with it, - // then compress only once at the end. This avoids double decompression/compression. - let output_compression = if needs_post_processing { - Compression::None - } else { - compression - }; - let config = PipelineConfig { input_compression: compression, - output_compression, + output_compression: compression, chunk_size: 8192, }; let mut pipeline = StreamingPipeline::new(config, processor); pipeline.process(body, &mut output)?; - - // Post-process HTML through registered integration post-processors. - // This handles cross-script T-chunks for RSC and other integration-specific - // processing that requires the complete HTML document. - log::info!( - "HTML post-processors: count={}, output_len={}, needs_post_processing={}", - post_processors.len(), - output.len(), - needs_post_processing - ); - if needs_post_processing { - // Output is already uncompressed, convert to string for post-processing - if let Ok(html) = std::str::from_utf8(&output) { - log::info!( - "NextJs post-processor called with {} bytes of HTML", - html.len() - ); - let ctx = IntegrationHtmlContext { - request_host: params.request_host, - request_scheme: params.request_scheme, - origin_host: params.origin_host, - }; - let mut processed = html.to_string(); - for processor in post_processors { - processed = processor.post_process(&processed, &ctx); - } - - // Now compress if original content was compressed - if compression != Compression::None { - output = compress_data(processed.as_bytes(), compression)?; - } else { - output = processed.into_bytes(); - } - } else { - log::warn!("HTML post-processing skipped: content is not valid UTF-8"); - // If not valid UTF-8, recompress the output as-is - if compression != Compression::None { - output = compress_data(&output, compression)?; - } - } - } } else if is_rsc_flight { // RSC Flight responses are length-prefixed (T rows). A naive string replacement will // corrupt the stream by changing byte lengths without updating the prefixes. diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md index 9dfb8aa..94796da 100644 --- a/docs/RSC_HYDRATION_FINDINGS.md +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -9,9 +9,15 @@ When proxying Next.js App Router sites, URL rewriting in RSC (React Server Compo Next.js App Router uses React Server Components with a streaming "flight" protocol. RSC data is delivered to the browser via inline ` - - + + + ``` The `[1, "..."]` calls contain the actual RSC payload as a JavaScript string. @@ -22,16 +28,16 @@ For client-side navigations, Next.js fetches Flight directly (no ` + - + ``` This happens because Next.js streams RSC data as it becomes available. The T-chunk header in script 10 declares 928 bytes (0x928 = 2344 decimal), but those bytes are delivered in script 11. @@ -114,6 +124,7 @@ Script 60 (index 59): ``` When the Rust implementation processed each script independently: + - Script 59: T-chunk header found, but `content_end = header_end` (0 bytes in THIS script) - Script 60: Content processed, but no T-chunk header to update @@ -147,6 +158,7 @@ The streaming HTML processor (`lol_html`) processes scripts one at a time: ``` This is a fundamental limitation: when script A declares a T-chunk that continues in script B, the streaming processor cannot: + 1. Track that script A's T-chunk is incomplete 2. Update script A's header after processing script B's URLs @@ -154,17 +166,18 @@ This is a fundamental limitation: when script A declares a T-chunk that continue ## The Solution: Two-Phase Processing -### Phase 1: Streaming HTML Processing (per-script) +### Phase 1: Streaming HTML Processing + +The HTML rewriter runs in a streaming pipeline (decompress → rewrite → recompress). During this phase we: -The streaming processor handles scripts that are self-contained: -- Extracts RSC payload from `self.__next_f.push([1, '...'])` -- Finds T-chunks within the single script -- Rewrites URLs and recalculates lengths -- Works correctly for ~95% of scripts +- Rewrite standard HTML attributes (`href`, `src`, `srcset`, etc.) +- Run integration script rewriters for self-contained payloads (e.g., Pages Router `__NEXT_DATA__`) +- Leave `self.__next_f.push([1,"..."])` scripts untouched because T-chunks can span script boundaries -### Phase 2: Post-Processing (cross-script) +### Phase 2: HTML Post-Processing (cross-script RSC) + +At end-of-document, a post-processor handles cross-script T-chunks: -After streaming completes, a post-processor handles cross-script T-chunks: 1. **Finds all RSC push scripts** in the complete HTML 2. **Combines their payloads** with markers 3. **Processes T-chunks across the combined content**, skipping markers when counting bytes @@ -172,75 +185,44 @@ After streaming completes, a post-processor handles cross-script T-chunks: 5. **Splits back on markers** to get individual rewritten payloads 6. **Rebuilds the HTML** with rewritten scripts +This phase is gated by a cheap `should_process` preflight so non‑Next.js pages do not pay the extra pass. + ### Marker-Based Cross-Script Processing #### Step 1: Combine Scripts with Markers -```rust -const RSC_MARKER: &str = "\x00SPLIT\x00"; - -// Combine all payloads -let mut combined = payloads[0].to_string(); -for payload in &payloads[1..] { - combined.push_str(RSC_MARKER); - combined.push_str(payload); -} -// Result: "11:null\n1a:T928,\x00SPLIT\x00...2344 bytes..." -``` +Concatenate all RSC push payload strings using a marker delimiter that cannot appear in valid JSON/RSC content. The marker `\x00SPLIT\x00` is chosen because: + - Contains null byte (`\x00`) which cannot appear in valid JSON/RSC content - Easily identifiable for splitting - Won't be confused with any escape sequence +**Implementation:** Marker constant at [nextjs.rs:903](crates/common/src/integrations/nextjs.rs#L903) and combine/split logic in [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) + #### Step 2: Find T-Chunks Across Combined Content -```rust -fn find_tchunks_with_markers(content: &str) -> Vec { - let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); - - for each match: - // Parse header: id, hex_length - // Consume declared bytes, SKIPPING markers - let (content_end, _) = consume_unescaped_bytes_skip_markers( - content, header_end, declared_length - ); -} -``` +Scan the combined stream for `ID:T,` headers, then consume exactly `hex_length` unescaped bytes to find the T-chunk boundary. The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. +**Implementation:** T-chunk discovery at [nextjs.rs:1002](crates/common/src/integrations/nextjs.rs#L1002) with marker-aware consumption in [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) + #### Step 3: Rewrite URLs and Recalculate Lengths -```rust -for chunk in &chunks { - // Extract T-chunk content (may contain markers) - let chunk_content = &combined[chunk.header_end..chunk.content_end]; - - // Rewrite URLs (preserves markers) - let rewritten_content = rewrite_rsc_url_string(chunk_content, ...); - - // Calculate new byte length (excluding markers) - let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); - let new_length_hex = format!("{:x}", new_length); - - // Write new T-chunk header and content - result.push_str(&chunk.id); - result.push_str(":T"); - result.push_str(&new_length_hex); - result.push(','); - result.push_str(&rewritten_content); -} -``` +For each `T` chunk: + +1. Rewrite URLs in the chunk content (preserving marker bytes) +2. Recalculate the unescaped byte length (excluding markers) +3. Rewrite the header to `ID:T,` #### Step 4: Split Back on Markers -```rust -// Split on markers to get individual payloads -result.split(RSC_MARKER).map(|s| s.to_string()).collect() -``` +Split the rewritten combined content by the marker to recover per-script payload strings. Each resulting payload corresponds to one original script, but with: + - URLs rewritten - T-chunk lengths correctly recalculated across script boundaries @@ -252,146 +234,25 @@ The post-processing is implemented as an integration hook, allowing other integr ### Trait Definition -```rust -/// Context for HTML post-processors. -pub struct IntegrationHtmlContext<'a> { - pub request_host: &'a str, - pub request_scheme: &'a str, - pub origin_host: &'a str, -} - -/// Trait for integration-provided HTML post-processors. -/// These run after streaming HTML processing to handle cases that require -/// access to the complete HTML (e.g., cross-script RSC T-chunks). -pub trait IntegrationHtmlPostProcessor: Send + Sync { - /// Identifier for logging/diagnostics. - fn integration_id(&self) -> &'static str; - - /// Post-process complete HTML content. - fn post_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> String; -} -``` +**Implementation:** Context at [registry.rs:254](crates/common/src/integrations/registry.rs#L254) and trait at [registry.rs:263](crates/common/src/integrations/registry.rs#L263) ### Registration -```rust -// In nextjs.rs -pub fn register(settings: &Settings) -> Option { - let config = build(settings)?; - - let structured = Arc::new(NextJsScriptRewriter::new(config.clone(), NextJsRewriteMode::Structured)); - let streamed = Arc::new(NextJsScriptRewriter::new(config.clone(), NextJsRewriteMode::Streamed)); - let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); - - Some( - IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) - .with_script_rewriter(structured) - .with_script_rewriter(streamed) - .with_html_post_processor(post_processor) // <-- Post-processor hook - .build(), - ) -} -``` +**Implementation:** Next.js registers its HTML post-processor in [nextjs.rs:41](crates/common/src/integrations/nextjs.rs#L41) -### Execution in Publisher - -```rust -// In publisher.rs - process_response_streaming() - -// Phase 1: Streaming HTML processing -let mut pipeline = StreamingPipeline::new(config, processor); -pipeline.process(body, &mut output)?; - -// Phase 2: Post-processing via integration hooks -let post_processors = params.integration_registry.html_post_processors(); -if !post_processors.is_empty() { - if let Ok(html) = std::str::from_utf8(&output) { - let ctx = IntegrationHtmlContext { - request_host: params.request_host, - request_scheme: params.request_scheme, - origin_host: params.origin_host, - }; - let mut processed = html.to_string(); - for processor in post_processors { - processed = processor.post_process(&processed, &ctx); - } - output = processed.into_bytes(); - } -} -``` +### Execution in HTML Processor + +**Implementation:** End-of-document post-processing wrapper at [html_processor.rs:20](crates/common/src/html_processor.rs#L20) --- ## Byte Length Calculation Algorithm -To correctly calculate unescaped byte length: - -```rust -fn calculate_unescaped_byte_length(s: &str) -> usize { - let bytes = s.as_bytes(); - let mut result = 0; - let mut i = 0; - - while i < bytes.len() { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - let esc = bytes[i + 1]; - - // Simple escape sequences: \n, \r, \t, \b, \f, \v, \", \', \\, \/ - if matches!(esc, b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/') { - result += 1; - i += 2; - continue; - } - - // \xHH - hex escape (1 byte) - if esc == b'x' && i + 3 < bytes.len() { - result += 1; - i += 4; - continue; - } - - // \uHHHH - unicode escape - if esc == b'u' && i + 5 < bytes.len() { - let hex = &s[i + 2..i + 6]; - if let Ok(code_unit) = u16::from_str_radix(hex, 16) { - // Check for surrogate pair - if is_high_surrogate(code_unit) && has_low_surrogate_at(s, i + 6) { - result += 4; // Surrogate pair = 4 UTF-8 bytes - i += 12; - continue; - } - // Single unicode escape - calculate UTF-8 byte length - let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); - result += c.len_utf8(); - i += 6; - continue; - } - } - } - - // Regular character - count its UTF-8 byte length - if bytes[i] < 0x80 { - result += 1; - i += 1; - } else { - let c = s[i..].chars().next().unwrap_or('\u{FFFD}'); - result += c.len_utf8(); - i += c.len_utf8(); - } - } - - result -} -``` +`T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: -### Marker-Aware Variant - -```rust -fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { - let without_markers = s.replace(RSC_MARKER, ""); - calculate_unescaped_byte_length(&without_markers) -} -``` +- Counting unescaped bytes while accounting for `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [nextjs.rs:606](crates/common/src/integrations/nextjs.rs#L606) +- Consuming exactly *N unescaped bytes* to locate the end of a declared `T` chunk: [nextjs.rs:683](crates/common/src/integrations/nextjs.rs#L683) +- Marker-aware variants for cross-script processing (skip `RSC_MARKER` during counting/consumption): [nextjs.rs:988](crates/common/src/integrations/nextjs.rs#L988) and [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) --- @@ -399,25 +260,21 @@ fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { The solution handles multiple URL formats in RSC content: -| Pattern | Example | In RSC String | -|---------|---------|---------------| -| Full HTTPS | `https://host/path` | `https://host/path` | -| Full HTTP | `http://host/path` | `http://host/path` | -| Protocol-relative | `//host/path` | `//host/path` | -| JSON-escaped slashes | `//host/path` | `\\/\\/host/path` | -| Double-escaped | `\\/\\/host` | `\\\\/\\\\/host` | -| Quad-escaped | `\\\\/\\\\/host` | `\\\\\\\\//host` | +| Pattern | Example | In RSC String | +| -------------------- | ------------------- | ------------------- | +| Full HTTPS | `https://host/path` | `https://host/path` | +| Full HTTP | `http://host/path` | `http://host/path` | +| Protocol-relative | `//host/path` | `//host/path` | +| JSON-escaped slashes | `//host/path` | `\\/\\/host/path` | +| Double-escaped | `\\/\\/host` | `\\\\/\\\\/host` | +| Quad-escaped | `\\\\/\\\\/host` | `\\\\\\\\//host` | ### Regex Pattern -```rust -let pattern = Regex::new(&format!( - r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, - escaped_origin -)).unwrap(); -``` +**Implementation:** Regex-based rewriting in [nextjs.rs:800](crates/common/src/integrations/nextjs.rs#L800) This pattern handles: + - Optional scheme (`https?`)? - Optional colon (`:`)? - Multiple escape levels for slashes @@ -487,52 +344,54 @@ This pattern handles: ## Test Results -| Test Case | Result | -|-----------|--------| -| T-chunk length shrinks (longer origin → shorter proxy) | Pass | -| T-chunk length grows (shorter origin → longer proxy) | Pass | -| Multiple T-chunks in same content | Pass | -| Escape sequences: `\n`, `\r`, `\t`, `\\`, `\"` | Pass | -| Unicode escapes: `\uHHHH` | Pass | -| Surrogate pairs: `\uD800\uDC00` | Pass | -| Hex escapes: `\xHH` | Pass | -| Various URL patterns (escaped slashes, etc.) | Pass | -| Cross-script T-chunk (header in script N, content in N+1) | Pass | -| Cross-script with multiple URLs in continuation | Pass | -| Non-T-chunk content preserved | Pass | -| HTML structure preserved after post-processing | Pass | +| Test Case | Result | +| --------------------------------------------------------- | ------ | +| T-chunk length shrinks (longer origin → shorter proxy) | Pass | +| T-chunk length grows (shorter origin → longer proxy) | Pass | +| Multiple T-chunks in same content | Pass | +| Escape sequences: `\n`, `\r`, `\t`, `\\`, `\"` | Pass | +| Unicode escapes: `\uHHHH` | Pass | +| Surrogate pairs: `\uD800\uDC00` | Pass | +| Hex escapes: `\xHH` | Pass | +| Various URL patterns (escaped slashes, etc.) | Pass | +| Cross-script T-chunk (header in script N, content in N+1) | Pass | +| Cross-script with multiple URLs in continuation | Pass | +| Non-T-chunk content preserved | Pass | +| HTML structure preserved after post-processing | Pass | ### Comparison: JS v7 vs JS v8 vs Rust -| Implementation | Approach | Fiber Count | Result | -|----------------|----------|-------------|--------| -| JS v7 | Per-script T-chunk rewriting | 0 | FAIL | -| JS v8 | Marker-based cross-script | 683 | PASS | -| Rust (final) | Two-phase with post-processor | 683 | PASS | +| Implementation | Approach | Fiber Count | Result | +| -------------- | ----------------------------- | ----------- | ------ | +| JS v7 | Per-script T-chunk rewriting | 0 | FAIL | +| JS v8 | Marker-based cross-script | 683 | PASS | +| Rust (final) | Two-phase with post-processor | 683 | PASS | ### Playwright Browser Testing (December 2024) Automated testing with Playwright across Chrome and Firefox verified the implementation: **Test Setup:** + - Fetched live HTML from a Next.js App Router site - Applied RSC URL rewriting via the Rust post-processor - Served rewritten HTML locally to isolate from bot detection **Results (both Chrome and Firefox):** -| Metric | Value | -|--------|-------| -| Hydration errors detected | 0 | -| Console errors (hydration-related) | 0 | -| Total links in page | 120 | -| Links rewritten to proxy | 120 | -| Links still pointing to origin | 0 | -| RSC push scripts present | Yes | -| `self.__next_f` entries | 223 | -| `__next` root element | Present | +| Metric | Value | +| ---------------------------------- | ------- | +| Hydration errors detected | 0 | +| Console errors (hydration-related) | 0 | +| Total links in page | 120 | +| Links rewritten to proxy | 120 | +| Links still pointing to origin | 0 | +| RSC push scripts present | Yes | +| `self.__next_f` entries | 223 | +| `__next` root element | Present | **Key Observations:** + 1. **No hydration mismatch**: React successfully hydrated without any "Text content does not match" or "Hydration failed" errors 2. **Complete URL rewriting**: All 120 navigation links correctly point to the proxy host 3. **RSC data preserved**: All 223 RSC Flight entries present in `self.__next_f` array @@ -540,78 +399,49 @@ Automated testing with Playwright across Chrome and Firefox verified the impleme --- -## Decompression Pipeline for Post-Processing - -The post-processor requires access to uncompressed HTML. Since origin responses are typically gzip or brotli compressed, the streaming pipeline was extended to support decompression-only mode. - -### The Problem - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Original Flow (without post-processing): │ -│ │ -│ Gzip In → Decompress → Process HTML → Recompress → Gzip Out │ -│ │ -│ With post-processing, we need uncompressed output: │ -│ │ -│ Gzip In → Decompress → Process HTML → ??? → Post-Process │ -│ │ -│ If we recompress, post-processor gets garbage (compressed bytes)│ -└─────────────────────────────────────────────────────────────────┘ -``` - -### Solution: Decompression-Only Pipeline Modes - -Added new pipeline transformation modes that decompress without recompressing: -- `process_gzip_to_none()` at [streaming_processor.rs:215](crates/common/src/streaming_processor.rs#L215) -- `process_deflate_to_none()` at [streaming_processor.rs:273](crates/common/src/streaming_processor.rs#L273) -- `process_brotli_to_none()` at [streaming_processor.rs:336](crates/common/src/streaming_processor.rs#L336) +## Compression Pipeline with Post-Processing -### Publisher Flow with Post-Processing +Post-processing requires access to uncompressed UTF‑8 HTML, but the trusted server still preserves the origin `Content-Encoding` on the wire. -The post-processing flow in publisher.rs: -1. Get post-processors from the integration registry -2. If post-processors exist, output uncompressed HTML (decompression-only mode) -3. Run streaming HTML processing -4. Apply each post-processor to the uncompressed HTML -5. Recompress once at the end to match original Content-Encoding +End-to-end flow: -**Implementation:** Post-processing logic at [publisher.rs:203](crates/common/src/publisher.rs#L203) +1. `StreamingPipeline` decompresses the origin body based on `Content-Encoding` +2. The HTML processor runs `lol_html` rewriting and (optionally) integration post-processors on the complete HTML +3. `StreamingPipeline` recompresses to the original encoding -### Benefits +Because post-processing runs inside the HTML processor (before recompression), `publisher.rs` does not need to special-case compression for integrations. -1. **Single compression pass**: Avoids decompress → recompress → decompress → recompress cycle -2. **Valid UTF-8 for post-processor**: Post-processor receives actual HTML, not compressed bytes -3. **Preserves original compression**: Final output matches original Content-Encoding +**Implementation:** Post-processing entry point at [html_processor.rs:20](crates/common/src/html_processor.rs#L20) --- ## Implementation Files -| File | Purpose | -|------|---------| -| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | -| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | -| `crates/common/src/integrations/mod.rs` | Module exports | -| `crates/common/src/publisher.rs` | Post-processor invocation, decompression flow | -| `crates/common/src/streaming_processor.rs` | Decompression-only pipeline modes | +| File | Purpose | +| -------------------------------------------- | --------------------------------------------- | +| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | +| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | +| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | +| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | ### Key Functions in nextjs.rs -| Function | Line | Purpose | -|----------|------|---------| -| `extract_rsc_push_payload` | [232](crates/common/src/integrations/nextjs.rs#L232) | Extract string from `self.__next_f.push([1, '...'])` | -| `calculate_unescaped_byte_length` | [609](crates/common/src/integrations/nextjs.rs#L609) | Count unescaped bytes with escape handling | -| `consume_unescaped_bytes` | [686](crates/common/src/integrations/nextjs.rs#L686) | Advance through string consuming N bytes | -| `find_tchunks` | [767](crates/common/src/integrations/nextjs.rs#L767) | Find T-chunks in single script | -| `rewrite_rsc_url_string` | [803](crates/common/src/integrations/nextjs.rs#L803) | URL rewriting with escape handling | -| `rewrite_rsc_tchunks` | [833](crates/common/src/integrations/nextjs.rs#L833) | Single-script T-chunk processing | -| `consume_unescaped_bytes_skip_markers` | [910](crates/common/src/integrations/nextjs.rs#L910) | Advance through string, skipping markers | -| `calculate_unescaped_byte_length_skip_markers` | [991](crates/common/src/integrations/nextjs.rs#L991) | Count unescaped bytes, excluding markers | -| `find_tchunks_with_markers` | [1005](crates/common/src/integrations/nextjs.rs#L1005) | Find T-chunks in marker-combined content | -| `rewrite_rsc_scripts_combined` | [1056](crates/common/src/integrations/nextjs.rs#L1056) | Cross-script T-chunk processing | -| `find_rsc_push_scripts` | [1165](crates/common/src/integrations/nextjs.rs#L1165) | Find all RSC scripts in HTML | -| `post_process_rsc_html` | [1245](crates/common/src/integrations/nextjs.rs#L1245) | Complete HTML post-processing | +| Function | Line | Purpose | +| ---------------------------------------------- | ------------------------------------------------------ | ---------------------------------------------------- | +| `extract_rsc_push_payload` | [229](crates/common/src/integrations/nextjs.rs#L229) | Extract string from `self.__next_f.push([1, '...'])` | +| `calculate_unescaped_byte_length` | [606](crates/common/src/integrations/nextjs.rs#L606) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [683](crates/common/src/integrations/nextjs.rs#L683) | Advance through string consuming N bytes | +| `find_tchunks` | [764](crates/common/src/integrations/nextjs.rs#L764) | Find T-chunks in single script | +| `rewrite_rsc_url_string` | [800](crates/common/src/integrations/nextjs.rs#L800) | URL rewriting with escape handling | +| `rewrite_rsc_tchunks` | [830](crates/common/src/integrations/nextjs.rs#L830) | Single-script T-chunk processing | +| `consume_unescaped_bytes_skip_markers` | [907](crates/common/src/integrations/nextjs.rs#L907) | Advance through string, skipping markers | +| `calculate_unescaped_byte_length_skip_markers` | [988](crates/common/src/integrations/nextjs.rs#L988) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [1002](crates/common/src/integrations/nextjs.rs#L1002) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [1053](crates/common/src/integrations/nextjs.rs#L1053) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [1162](crates/common/src/integrations/nextjs.rs#L1162) | Find all RSC scripts in HTML | +| `post_process_rsc_html` | [1242](crates/common/src/integrations/nextjs.rs#L1242) | Complete HTML post-processing | --- @@ -620,12 +450,14 @@ The post-processing flow in publisher.rs: ### Very Long Proxy URLs If the proxy URL is significantly longer than the original, T-chunk content grows substantially. This is handled correctly (the hex length is recalculated), but it may affect: + - Response size - Streaming behavior if scripts become much larger ### Performance Considerations The post-processing phase requires: + 1. Parsing complete HTML to find scripts (O(n) string scan) 2. Combining payloads (memory allocation) 3. Regex matching for T-chunks @@ -643,53 +475,54 @@ For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. ## Deconstruction and Reconstruction Logic -The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html()` at [nextjs.rs:1245](crates/common/src/integrations/nextjs.rs#L1245). +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html()` at [nextjs.rs:1242](crates/common/src/integrations/nextjs.rs#L1242). ### Step 1: Find RSC Push Scripts Find all `self.__next_f.push([1, "..."])` scripts in the HTML and extract their payloads. -**Implementation:** `find_rsc_push_scripts()` at [nextjs.rs:1165](crates/common/src/integrations/nextjs.rs#L1165) +**Implementation:** `find_rsc_push_scripts()` at [nextjs.rs:1162](crates/common/src/integrations/nextjs.rs#L1162) ### Step 2: Combine Payloads with Markers Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. -**Implementation:** Marker constant at [nextjs.rs:906](crates/common/src/integrations/nextjs.rs#L906), combining logic in `rewrite_rsc_scripts_combined()` at [nextjs.rs:1056](crates/common/src/integrations/nextjs.rs#L1056) +**Implementation:** Marker constant at [nextjs.rs:903](crates/common/src/integrations/nextjs.rs#L903), combining logic in `rewrite_rsc_scripts_combined()` at [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) ### Step 3: Find T-Chunks Across Combined Content Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. -**Implementation:** `find_tchunks_with_markers()` at [nextjs.rs:1005](crates/common/src/integrations/nextjs.rs#L1005), using `consume_unescaped_bytes_skip_markers()` at [nextjs.rs:910](crates/common/src/integrations/nextjs.rs#L910) +**Implementation:** `find_tchunks_with_markers()` at [nextjs.rs:1002](crates/common/src/integrations/nextjs.rs#L1002), using `consume_unescaped_bytes_skip_markers()` at [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) ### Step 4: Rewrite URLs in T-Chunk Content Rewrite all URL patterns in the T-chunk content: + - `https://origin.example.com/path` → `http://proxy.example.com/path` - `//origin.example.com/path` → `//proxy.example.com/path` - `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) - `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) -**Implementation:** `rewrite_rsc_url_string()` at [nextjs.rs:803](crates/common/src/integrations/nextjs.rs#L803) +**Implementation:** `rewrite_rsc_url_string()` at [nextjs.rs:800](crates/common/src/integrations/nextjs.rs#L800) ### Step 5: Recalculate T-Chunk Length Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. -**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [nextjs.rs:991](crates/common/src/integrations/nextjs.rs#L991) +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [nextjs.rs:988](crates/common/src/integrations/nextjs.rs#L988) ### Step 6: Split Back on Markers Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. -**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [nextjs.rs:1056](crates/common/src/integrations/nextjs.rs#L1056) +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) ### Step 7: Reconstruct HTML Replace each original script with its rewritten version in the HTML. -**Implementation:** Part of `post_process_rsc_html()` at [nextjs.rs:1245](crates/common/src/integrations/nextjs.rs#L1245) +**Implementation:** Part of `post_process_rsc_html()` at [nextjs.rs:1242](crates/common/src/integrations/nextjs.rs#L1242) ### Visual Example @@ -725,34 +558,36 @@ SPLIT BACK: ## Comparison: Old vs New Approach -| Aspect | Old (Whitespace Padding) | New (T-Chunk Length Recalculation) | -|--------|--------------------------|-------------------------------------| -| T-chunk handling | Broken - lengths not updated | Correct - lengths recalculated | -| URL length change | Limited to shorter URLs | Any length change supported | -| Escape sequences | Not properly counted | Fully supported | -| Cross-script T-chunks | Not handled | Handled via post-processing | -| Implementation | Simple regex replace | Full T-chunk parsing + post-processing | -| Architecture | Hardcoded in processor | Integration hook pattern | -| Extensibility | None | Other integrations can add post-processors | +| Aspect | Old (Whitespace Padding) | New (T-Chunk Length Recalculation) | +| --------------------- | ---------------------------- | ------------------------------------------ | +| T-chunk handling | Broken - lengths not updated | Correct - lengths recalculated | +| URL length change | Limited to shorter URLs | Any length change supported | +| Escape sequences | Not properly counted | Fully supported | +| Cross-script T-chunks | Not handled | Handled via post-processing | +| Implementation | Simple regex replace | Full T-chunk parsing + post-processing | +| Architecture | Hardcoded in processor | Integration hook pattern | +| Extensibility | None | Other integrations can add post-processors | --- ## Conclusion -RSC hydration requires **correct T-chunk byte lengths**. The solution involves two phases: +RSC hydration requires **correct T-chunk byte lengths**. The trusted server solves this with two stages: + +### Stage 1: Streaming HTML rewrite -### Phase 1: Streaming (per-script) -- Process each script as it arrives -- Handle self-contained T-chunks -- ~95% of T-chunks are handled here +- Run `lol_html` rewriting (attributes + integration script rewriters) +- Skip `__next_f.push` payload scripts (handled in stage 2) -### Phase 2: Post-Processing (cross-script) -- After streaming completes +### Stage 2: End-of-document post-processing (cross-script) + +- After streaming completes for the full HTML document - Combine scripts with markers - Recalculate T-chunk lengths across boundaries -- Handles the remaining ~5% edge cases +- Rewrite URLs in RSC payloads safely across script boundaries The key insights are: + 1. **T-chunk lengths must match content**: The RSC parser uses declared lengths to navigate 2. **T-chunks can span scripts**: Next.js streaming splits content arbitrarily 3. **Markers enable cross-script processing**: Combine, process, split back @@ -762,7 +597,7 @@ The key insights are: ## References -- React Flight Protocol: Internal React implementation for RSC streaming +- React Flight Protocol: Internal React implementation for RSC streaming: https://github.com/vercel/next.js/tree/v14.2.35 - Next.js App Router: https://nextjs.org/docs/app - lol_html: https://github.com/nicksrandall/lol-html (streaming HTML rewriter) - Implementation: `crates/common/src/integrations/nextjs.rs` diff --git a/docs/integration_guide.md b/docs/integration_guide.md index f90b7e8..f615827 100644 --- a/docs/integration_guide.md +++ b/docs/integration_guide.md @@ -133,19 +133,19 @@ impl IntegrationProxy for MyIntegration { } ``` -**Recommended:** Use the provided helper methods to automatically namespace your routes under -`/integrations/{integration_name()}/`. Available helpers: `get()`, `post()`, `put()`, `delete()`, -and `patch()`. This lets you define routes with just their relative paths (e.g., `self.post("/auction")` +**Recommended:** Use the provided helper methods to automatically namespace your routes under +`/integrations/{integration_name()}/`. Available helpers: `get()`, `post()`, `put()`, `delete()`, +and `patch()`. This lets you define routes with just their relative paths (e.g., `self.post("/auction")` becomes `"/integrations/my_integration/auction"`). You can also define routes manually using -`IntegrationEndpoint::get()` / `IntegrationEndpoint::post()` / etc. for backwards compatibility or +`IntegrationEndpoint::get()` / `IntegrationEndpoint::post()` / etc. for backwards compatibility or special cases. -Routes are matched verbatim in `crates/fastly/src/main.rs`, so stick to stable paths and +Routes are matched verbatim in `crates/fastly/src/main.rs`, so stick to stable paths and register whichever HTTP methods you need. **New integrations should namespace their routes under -`/integrations/{INTEGRATION_NAME}/`** using the helper methods (`self.get()`, `self.post()`, -`self.put()`, `self.delete()`, `self.patch()`) for consistency, but you can define routes manually +`/integrations/{INTEGRATION_NAME}/`** using the helper methods (`self.get()`, `self.post()`, +`self.put()`, `self.delete()`, `self.patch()`) for consistency, but you can define routes manually if needed (e.g., for backwards compatibility). -The shared context already injects Trusted Server logging, headers, +The shared context already injects Trusted Server logging, headers, and error handling; the handler only needs to deserialize the request, call the upstream endpoint, and stamp integration-specific headers. @@ -295,9 +295,9 @@ time. Two built-in integrations demonstrate how the framework pieces fit together: -| Integration | Purpose | Key files | -| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | -| `testlight` | Sample partner stub showing request proxying, attribute rewrites, and asset injection. | `crates/common/src/integrations/testlight.rs`, `crates/js/lib/src/integrations/testlight.ts` | +| Integration | Purpose | Key files | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | +| `testlight` | Sample partner stub showing request proxying, attribute rewrites, and asset injection. | `crates/common/src/integrations/testlight.rs`, `crates/js/lib/src/integrations/testlight.ts` | | `prebid` | Production Prebid Server bridge that owns `/first-party/ad` & `/third-party/ad`, injects synthetic IDs, rewrites creatives/notification URLs, and removes publisher-supplied Prebid scripts because the shim already ships in the unified TSJS build. | `crates/common/src/integrations/prebid.rs`, `crates/js/lib/src/ext/prebidjs.ts` | ### Example: Prebid integration From 66867273f864ba9260b334d83c6b1cf4c2c7f44e Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:18:30 -0800 Subject: [PATCH 04/11] Fixed secondary link parsing after first link was clicked --- crates/common/src/integrations/nextjs.rs | 39 +++-------------- crates/common/src/rsc_flight.rs | 54 ++++++++++++++++++++++++ crates/common/src/streaming_replacer.rs | 27 ++++++++++++ 3 files changed, 88 insertions(+), 32 deletions(-) diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs index 29522cc..3daff34 100644 --- a/crates/common/src/integrations/nextjs.rs +++ b/crates/common/src/integrations/nextjs.rs @@ -1476,8 +1476,8 @@ mod tests { #[test] fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { // RSC payloads (self.__next_f.push) are rewritten via post-processing. - // The streaming phase skips RSC push scripts, and the post-processor handles them - // to correctly handle cross-script T-chunks. + // The streaming phase skips RSC push scripts, and the HTML post-processor handles them + // at end-of-document to correctly handle cross-script T-chunks. let html = r#" "#; @@ -1508,16 +1508,9 @@ mod tests { .process(Cursor::new(html.as_bytes()), &mut output) .unwrap(); - // Apply post-processing (this is what handles RSC push scripts) - let processed_str = String::from_utf8_lossy(&output); - let final_html = post_process_rsc_html( - &processed_str, - "origin.example.com", - "test.example.com", - "https", - ); + let final_html = String::from_utf8_lossy(&output); - // RSC payloads should be rewritten via post-processing + // RSC payloads should be rewritten via end-of-document post-processing assert!( final_html.contains("test.example.com"), "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", @@ -1558,16 +1551,9 @@ mod tests { .process(Cursor::new(html.as_bytes()), &mut output) .unwrap(); - // Apply post-processing (this is what handles RSC push scripts) - let processed_str = String::from_utf8_lossy(&output); - let final_html = post_process_rsc_html( - &processed_str, - "origin.example.com", - "test.example.com", - "https", - ); + let final_html = String::from_utf8_lossy(&output); - // RSC payloads should be rewritten via post-processing + // RSC payloads should be rewritten via end-of-document post-processing assert!( final_html.contains("test.example.com"), "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", @@ -1621,18 +1607,7 @@ mod tests { pipeline .process(Cursor::new(html.as_bytes()), &mut output) .unwrap(); - - // Apply post-processing (this is what handles RSC push scripts) - let processed_str = String::from_utf8_lossy(&output); - let final_html = post_process_rsc_html( - &processed_str, - "origin.example.com", - "test.example.com", - "https", - ); - - println!("=== Final HTML ==="); - println!("{}", final_html); + let final_html = String::from_utf8_lossy(&output); // RSC payloads should be rewritten via post-processing assert!( diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs index 82590e3..a320a7b 100644 --- a/crates/common/src/rsc_flight.rs +++ b/crates/common/src/rsc_flight.rs @@ -47,6 +47,11 @@ impl RscFlightUrlRewriter { request_host: &str, request_scheme: &str, ) -> Self { + // Normalize because some configs include a trailing slash (e.g. `https://origin/`). + // If we keep the trailing slash, replacing `origin_url` inside `origin_url + "/path"` + // would drop the delimiter and yield `https://proxyhostpath`. + let origin_url = origin_url.trim_end_matches('/'); + let request_url = format!("{request_scheme}://{request_host}"); let origin_protocol_relative = format!("//{origin_host}"); let request_protocol_relative = format!("//{request_host}"); @@ -302,6 +307,25 @@ mod tests { ); } + #[test] + fn rewrites_newline_rows_with_trailing_slash_origin_url() { + let input = b"0:[\"https://origin.example.com/page\"]\n"; + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com/", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input, 8); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + assert_eq!( + output_str, "0:[\"https://proxy.example.com/page\"]\n", + "Output should rewrite URLs without dropping the path slash" + ); + } + #[test] fn rewrites_t_rows_and_updates_length() { let t_content = r#"{"url":"https://origin.example.com/page"}"#; @@ -332,6 +356,36 @@ mod tests { ); } + #[test] + fn rewrites_t_rows_with_trailing_slash_origin_url() { + let t_content = r#"{"url":"https://origin.example.com/page"}"#; + let json_row = "2:[\"ok\"]\n"; + let input = format!("1:T{:x},{}{}", t_content.len(), t_content, json_row); + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com/", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input.as_bytes(), 7); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; + let expected = format!( + "1:T{:x},{}{}", + rewritten_t_content.len(), + rewritten_t_content, + json_row + ); + + assert_eq!( + output_str, expected, + "Output should update T row lengths after rewriting without dropping the path slash" + ); + } + #[test] fn handles_t_row_header_and_body_split_across_chunks() { let t_content = r#"{"url":"https://origin.example.com/page"}"#; diff --git a/crates/common/src/streaming_replacer.rs b/crates/common/src/streaming_replacer.rs index 9b975c5..ea5b9c5 100644 --- a/crates/common/src/streaming_replacer.rs +++ b/crates/common/src/streaming_replacer.rs @@ -156,6 +156,10 @@ pub fn create_url_replacer( request_host: &str, request_scheme: &str, ) -> StreamingReplacer { + // Normalize because some configs include a trailing slash (e.g. `https://origin/`). + // If we keep the trailing slash, replacing `origin_url` inside `origin_url + "/path"` + // would drop the delimiter and yield `https://proxyhostpath`. + let origin_url = origin_url.trim_end_matches('/'); let request_url = format!("{}://{}", request_scheme, request_host); let mut replacements = vec![ @@ -364,6 +368,29 @@ mod tests { assert!(result.contains("//test.example.com/script.js")); } + #[test] + fn test_url_replacer_handles_trailing_slash_origin_url() { + let mut replacer = create_url_replacer( + "origin.example.com", + "https://origin.example.com/", + "test.example.com", + "https", + ); + + let content = r#"Visit https://origin.example.com/news for more info"#; + let processed = replacer.process_chunk(content.as_bytes(), true); + let result = String::from_utf8(processed).expect("should be valid UTF-8"); + + assert!( + result.contains("https://test.example.com/news"), + "URL should keep the slash between host and path. Got: {result}" + ); + assert!( + !result.contains("https://test.example.comnews"), + "URL should not lose the slash between host and path. Got: {result}" + ); + } + #[test] fn test_process_chunk_utf8_boundary() { let mut replacer = From cfce366c59e75a96882e2449e2dbd8f93396e9a1 Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 18:12:48 -0800 Subject: [PATCH 05/11] Refactored --- crates/common/src/integrations/nextjs.rs | 515 ++++++++++------------- docs/RSC_HYDRATION_FINDINGS.md | 53 +-- 2 files changed, 253 insertions(+), 315 deletions(-) diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs index 3daff34..6c82c58 100644 --- a/crates/common/src/integrations/nextjs.rs +++ b/crates/common/src/integrations/nextjs.rs @@ -1,5 +1,6 @@ use std::sync::Arc; +use once_cell::sync::Lazy; use regex::{escape, Regex}; use serde::{Deserialize, Serialize}; use validator::Validate; @@ -12,6 +13,32 @@ use crate::settings::{IntegrationConfig, Settings}; const NEXTJS_INTEGRATION_ID: &str = "nextjs"; +// ============================================================================= +// Cached Regex Patterns +// ============================================================================= + +/// T-chunk header pattern: hex_id:Thex_length, +static TCHUNK_PATTERN: Lazy = + Lazy::new(|| Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").expect("valid T-chunk regex")); + +/// RSC push payload pattern for extraction +static RSC_PUSH_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).expect("valid RSC push regex") +}); + +/// RSC push script pattern for HTML post-processing +static RSC_SCRIPT_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r#"]*>\s*self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#) + .expect("valid RSC script regex") +}); + +/// RSC script ending pattern +static RSC_SCRIPT_ENDING: Lazy = + Lazy::new(|| Regex::new(r#"^\s*\]\s*\)\s*;?\s*"#).expect("valid RSC ending regex")); + +/// Marker used to track script boundaries when combining RSC content +const RSC_MARKER: &str = "\x00SPLIT\x00"; + #[derive(Debug, Clone, Deserialize, Serialize, Validate)] pub struct NextJsIntegrationConfig { #[serde(default = "default_enabled")] @@ -106,20 +133,21 @@ impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { } fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { - if !self.should_process(html, ctx) { - return false; + // Note: should_process is already called by the HtmlWithPostProcessing wrapper, + // so we skip the redundant check here. + + if log::log_enabled!(log::Level::Debug) { + let origin_before = html.matches(ctx.origin_host).count(); + log::debug!( + "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", + html.len(), + origin_before, + ctx.origin_host, + ctx.request_scheme, + ctx.request_host + ); } - let origin_before = html.matches(ctx.origin_host).count(); - log::info!( - "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", - html.len(), - origin_before, - ctx.origin_host, - ctx.request_scheme, - ctx.request_host - ); - let result = post_process_rsc_html(html, ctx.origin_host, ctx.request_host, ctx.request_scheme); @@ -227,11 +255,7 @@ impl NextJsScriptRewriter { /// Returns (payload_content, quote_char, start_pos, end_pos) /// Handles various whitespace patterns in the push call. fn extract_rsc_push_payload(content: &str) -> Option<(&str, char, usize, usize)> { - // Match pattern: self.__next_f.push([ followed by whitespace, then 1, then whitespace, then quote - // Use regex to be more flexible with whitespace - let pattern = Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).ok()?; - - let cap = pattern.captures(content)?; + let cap = RSC_PUSH_PATTERN.captures(content)?; let quote_match = cap.get(1)?; let quote = quote_match.as_str().chars().next()?; let content_start = quote_match.end(); @@ -601,53 +625,126 @@ impl UrlRewriter { // T-chunks, the header script won't have URLs to rewrite (just the header), // and the content script will be rewritten with correct byte counting. -/// Calculate the unescaped byte length of a JS string with escape sequences. -/// This accounts for \n, \r, \t, \\, \", \xHH, \uHHHH, and surrogate pairs. -fn calculate_unescaped_byte_length(s: &str) -> usize { - let bytes = s.as_bytes(); - let mut result = 0; - let mut i = 0; +// ============================================================================= +// Escape Sequence Parsing +// ============================================================================= +// +// JS escape sequences are parsed by a shared iterator to avoid code duplication. +// The iterator yields (source_len, unescaped_byte_count) for each logical unit. + +/// A single parsed element from a JS string +struct EscapeElement { + /// Number of unescaped bytes this represents + byte_count: usize, +} + +/// Iterator over escape sequences in a JS string. +/// Yields (source_len, unescaped_byte_count) for each element. +struct EscapeSequenceIter<'a> { + bytes: &'a [u8], + str_ref: &'a str, + pos: usize, + skip_marker: Option<&'a [u8]>, +} + +impl<'a> EscapeSequenceIter<'a> { + fn new(s: &'a str) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: None, + } + } + + fn with_marker(s: &'a str, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: Some(marker), + } + } + + fn from_position(s: &'a str, start: usize) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: None, + } + } + + fn from_position_with_marker(s: &'a str, start: usize, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: Some(marker), + } + } + + /// Current position in the source string + fn position(&self) -> usize { + self.pos + } +} + +impl Iterator for EscapeSequenceIter<'_> { + type Item = EscapeElement; + + fn next(&mut self) -> Option { + if self.pos >= self.bytes.len() { + return None; + } + + // Check for marker to skip + if let Some(marker) = self.skip_marker { + if self.pos + marker.len() <= self.bytes.len() + && &self.bytes[self.pos..self.pos + marker.len()] == marker + { + self.pos += marker.len(); + return Some(EscapeElement { byte_count: 0 }); // Markers don't count + } + } - while i < bytes.len() { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - let esc = bytes[i + 1]; + // Check for escape sequence + if self.bytes[self.pos] == b'\\' && self.pos + 1 < self.bytes.len() { + let esc = self.bytes[self.pos + 1]; // Simple escape sequences: \n, \r, \t, \b, \f, \v, \", \', \\, \/ if matches!( esc, b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' ) { - result += 1; - i += 2; - continue; + self.pos += 2; + return Some(EscapeElement { byte_count: 1 }); } // \xHH - hex escape (1 byte) - if esc == b'x' && i + 3 < bytes.len() { - result += 1; - i += 4; - continue; + if esc == b'x' && self.pos + 3 < self.bytes.len() { + self.pos += 4; + return Some(EscapeElement { byte_count: 1 }); } // \uHHHH - unicode escape - if esc == b'u' && i + 5 < bytes.len() { - let hex = &s[i + 2..i + 6]; + if esc == b'u' && self.pos + 5 < self.bytes.len() { + let hex = &self.str_ref[self.pos + 2..self.pos + 6]; if hex.chars().all(|c| c.is_ascii_hexdigit()) { if let Ok(code_unit) = u16::from_str_radix(hex, 16) { // Check for surrogate pair if (0xD800..=0xDBFF).contains(&code_unit) - && i + 11 < bytes.len() - && bytes[i + 6] == b'\\' - && bytes[i + 7] == b'u' + && self.pos + 11 < self.bytes.len() + && self.bytes[self.pos + 6] == b'\\' + && self.bytes[self.pos + 7] == b'u' { - let hex2 = &s[i + 8..i + 12]; + let hex2 = &self.str_ref[self.pos + 8..self.pos + 12]; if hex2.chars().all(|c| c.is_ascii_hexdigit()) { if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { if (0xDC00..=0xDFFF).contains(&code_unit2) { // Full surrogate pair = 4 UTF-8 bytes - result += 4; - i += 12; - continue; + self.pos += 12; + return Some(EscapeElement { byte_count: 4 }); } } } @@ -655,96 +752,48 @@ fn calculate_unescaped_byte_length(s: &str) -> usize { // Single unicode escape - calculate UTF-8 byte length let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); - result += c.len_utf8(); - i += 6; - continue; + self.pos += 6; + return Some(EscapeElement { + byte_count: c.len_utf8(), + }); } } } } // Regular character - count its UTF-8 byte length - // For ASCII, this is 1 byte - if bytes[i] < 0x80 { - result += 1; - i += 1; + if self.bytes[self.pos] < 0x80 { + self.pos += 1; + Some(EscapeElement { byte_count: 1 }) } else { // Multi-byte UTF-8 character - let c = s[i..].chars().next().unwrap_or('\u{FFFD}'); - result += c.len_utf8(); - i += c.len_utf8(); + let c = self.str_ref[self.pos..].chars().next().unwrap_or('\u{FFFD}'); + let len = c.len_utf8(); + self.pos += len; + Some(EscapeElement { byte_count: len }) } } +} - result +/// Calculate the unescaped byte length of a JS string with escape sequences. +/// This accounts for \n, \r, \t, \\, \", \xHH, \uHHHH, and surrogate pairs. +fn calculate_unescaped_byte_length(s: &str) -> usize { + EscapeSequenceIter::new(s).map(|e| e.byte_count).sum() } /// Consume a specified number of unescaped bytes from a JS string, returning the end position. fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usize, usize) { - let bytes = s.as_bytes(); + let mut iter = EscapeSequenceIter::from_position(s, start_pos); let mut consumed = 0; - let mut pos = start_pos; - - while pos < bytes.len() && consumed < byte_count { - if bytes[pos] == b'\\' && pos + 1 < bytes.len() { - let esc = bytes[pos + 1]; - - if matches!( - esc, - b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' - ) { - consumed += 1; - pos += 2; - continue; - } - - if esc == b'x' && pos + 3 < bytes.len() { - consumed += 1; - pos += 4; - continue; - } - - if esc == b'u' && pos + 5 < bytes.len() { - let hex = &s[pos + 2..pos + 6]; - if hex.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit) = u16::from_str_radix(hex, 16) { - if (0xD800..=0xDBFF).contains(&code_unit) - && pos + 11 < bytes.len() - && bytes[pos + 6] == b'\\' - && bytes[pos + 7] == b'u' - { - let hex2 = &s[pos + 8..pos + 12]; - if hex2.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { - if (0xDC00..=0xDFFF).contains(&code_unit2) { - consumed += 4; - pos += 12; - continue; - } - } - } - } - - let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); - consumed += c.len_utf8(); - pos += 6; - continue; - } - } - } - } - if bytes[pos] < 0x80 { - consumed += 1; - pos += 1; - } else { - let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); - consumed += c.len_utf8(); - pos += c.len_utf8(); + while consumed < byte_count { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, } } - (pos, consumed) + (iter.position(), consumed) } /// Information about a T-chunk found in the combined RSC content @@ -759,16 +808,18 @@ struct TChunkInfo { content_end: usize, } -/// Find all T-chunks in the combined RSC content. -/// T-chunks have format: ID:T, -fn find_tchunks(content: &str) -> Vec { - // Match pattern: hex_id:Thex_length, - let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); +/// Find all T-chunks in content, optionally skipping markers. +fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { let mut chunks = Vec::new(); let mut search_pos = 0; + let marker = if skip_markers { + Some(RSC_MARKER.as_bytes()) + } else { + None + }; while search_pos < content.len() { - if let Some(cap) = pattern.captures(&content[search_pos..]) { + if let Some(cap) = TCHUNK_PATTERN.captures(&content[search_pos..]) { let m = cap.get(0).unwrap(); let match_start = search_pos + m.start(); let header_end = search_pos + m.end(); @@ -777,8 +828,22 @@ fn find_tchunks(content: &str) -> Vec { let length_hex = cap.get(2).unwrap().as_str(); let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); - // Consume the declared number of unescaped bytes, skipping markers - let (content_end, _) = consume_unescaped_bytes(content, header_end, declared_length); + // Consume bytes using the appropriate iterator + let content_end = if let Some(marker_bytes) = marker { + let mut iter = + EscapeSequenceIter::from_position_with_marker(content, header_end, marker_bytes); + let mut consumed = 0; + while consumed < declared_length { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, + } + } + iter.position() + } else { + let (pos, _) = consume_unescaped_bytes(content, header_end, declared_length); + pos + }; chunks.push(TChunkInfo { id, @@ -796,6 +861,11 @@ fn find_tchunks(content: &str) -> Vec { chunks } +/// Find all T-chunks in RSC content (no markers). +fn find_tchunks(content: &str) -> Vec { + find_tchunks_impl(content, false) +} + /// Rewrite URLs in a string, handling various URL formats in RSC content. fn rewrite_rsc_url_string( s: &str, @@ -899,139 +969,16 @@ fn rewrite_rsc_tchunks( // 4. Split back on markers // -/// Marker used to track script boundaries when combining RSC content -const RSC_MARKER: &str = "\x00SPLIT\x00"; - -/// Consume unescaped bytes, skipping RSC markers. -/// Returns (end_position, bytes_consumed) -fn consume_unescaped_bytes_skip_markers( - s: &str, - start_pos: usize, - byte_count: usize, -) -> (usize, usize) { - let bytes = s.as_bytes(); - let mut consumed = 0; - let mut pos = start_pos; - let marker_bytes = RSC_MARKER.as_bytes(); - - while pos < bytes.len() && consumed < byte_count { - // Check for marker - skip it without counting bytes - if pos + marker_bytes.len() <= bytes.len() - && &bytes[pos..pos + marker_bytes.len()] == marker_bytes - { - pos += marker_bytes.len(); - continue; - } - - if bytes[pos] == b'\\' && pos + 1 < bytes.len() { - let esc = bytes[pos + 1]; - - if matches!( - esc, - b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' - ) { - consumed += 1; - pos += 2; - continue; - } - - if esc == b'x' && pos + 3 < bytes.len() { - consumed += 1; - pos += 4; - continue; - } - - if esc == b'u' && pos + 5 < bytes.len() { - let hex = &s[pos + 2..pos + 6]; - if hex.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit) = u16::from_str_radix(hex, 16) { - if (0xD800..=0xDBFF).contains(&code_unit) - && pos + 11 < bytes.len() - && bytes[pos + 6] == b'\\' - && bytes[pos + 7] == b'u' - { - let hex2 = &s[pos + 8..pos + 12]; - if hex2.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { - if (0xDC00..=0xDFFF).contains(&code_unit2) { - consumed += 4; - pos += 12; - continue; - } - } - } - } - - let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); - consumed += c.len_utf8(); - pos += 6; - continue; - } - } - } - } - - if bytes[pos] < 0x80 { - consumed += 1; - pos += 1; - } else { - let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); - consumed += c.len_utf8(); - pos += c.len_utf8(); - } - } - - (pos, consumed) -} - /// Calculate unescaped byte length excluding RSC markers. fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { - let without_markers = s.replace(RSC_MARKER, ""); - calculate_unescaped_byte_length(&without_markers) -} - -/// Information about a T-chunk in marker-combined content -struct MarkedTChunkInfo { - id: String, - match_start: usize, - header_end: usize, - content_end: usize, + EscapeSequenceIter::with_marker(s, RSC_MARKER.as_bytes()) + .map(|e| e.byte_count) + .sum() } /// Find T-chunks in marker-combined RSC content. -fn find_tchunks_with_markers(content: &str) -> Vec { - let pattern = Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").unwrap(); - let mut chunks = Vec::new(); - let mut search_pos = 0; - - while search_pos < content.len() { - if let Some(cap) = pattern.captures(&content[search_pos..]) { - let m = cap.get(0).unwrap(); - let match_start = search_pos + m.start(); - let header_end = search_pos + m.end(); - - let id = cap.get(1).unwrap().as_str().to_string(); - let length_hex = cap.get(2).unwrap().as_str(); - let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); - - // Consume bytes, skipping markers - let (content_end, _) = - consume_unescaped_bytes_skip_markers(content, header_end, declared_length); - - chunks.push(MarkedTChunkInfo { - id, - match_start, - header_end, - content_end, - }); - - search_pos = content_end; - } else { - break; - } - } - - chunks +fn find_tchunks_with_markers(content: &str) -> Vec { + find_tchunks_impl(content, true) } /// Process multiple RSC script payloads together, handling cross-script T-chunks. @@ -1161,16 +1108,10 @@ struct RscPushScript { /// ``` fn find_rsc_push_scripts(html: &str) -> Vec { let mut scripts = Vec::new(); - // Match "#).unwrap(); - let mut search_pos = 0; while search_pos < html.len() { - let Some(cap) = pattern.captures(&html[search_pos..]) else { + let Some(cap) = RSC_SCRIPT_PATTERN.captures(&html[search_pos..]) else { break; }; @@ -1199,7 +1140,7 @@ fn find_rsc_push_scripts(html: &str) -> Vec { // After the closing quote, look for ]) with optional whitespace let after_quote = &html[i + 1..]; - let Some(ending_match) = ending_pattern.find(after_quote) else { + let Some(ending_match) = RSC_SCRIPT_ENDING.find(after_quote) else { search_pos = payload_start; continue; }; @@ -1247,51 +1188,47 @@ pub fn post_process_rsc_html( ) -> String { let scripts = find_rsc_push_scripts(html); - log::info!( - "post_process_rsc_html: found {} RSC push scripts, origin={}, proxy={}://{}", - scripts.len(), - origin_host, - request_scheme, - request_host - ); - if scripts.is_empty() { - log::info!("post_process_rsc_html: no RSC scripts found, returning unchanged"); return html.to_string(); } // Extract payloads let payloads: Vec<&str> = scripts.iter().map(|s| s.payload.as_str()).collect(); - // Count origin URLs before rewriting - let origin_count_before: usize = payloads - .iter() - .map(|p| p.matches(origin_host).count()) - .sum(); - log::info!( - "post_process_rsc_html: {} occurrences of '{}' in payloads before rewriting", - origin_count_before, - origin_host - ); + if log::log_enabled!(log::Level::Debug) { + let origin_count_before: usize = payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + log::debug!( + "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", + scripts.len(), + origin_count_before, + origin_host, + request_scheme, + request_host + ); + } // Process all scripts together let rewritten_payloads = rewrite_rsc_scripts_combined(&payloads, origin_host, request_host, request_scheme); - // Count origin URLs after rewriting - let origin_count_after: usize = rewritten_payloads - .iter() - .map(|p| p.matches(origin_host).count()) - .sum(); - let proxy_count: usize = rewritten_payloads - .iter() - .map(|p| p.matches(request_host).count()) - .sum(); - log::info!( - "post_process_rsc_html: after rewriting - {} origin URLs remaining, {} proxy URLs", - origin_count_after, - proxy_count - ); + if log::log_enabled!(log::Level::Debug) { + let origin_count_after: usize = rewritten_payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + let proxy_count: usize = rewritten_payloads + .iter() + .map(|p| p.matches(request_host).count()) + .sum(); + log::debug!( + "post_process_rsc_html: after rewriting - {} origin URLs remaining, {} proxy URLs", + origin_count_after, + proxy_count + ); + } // Replace payload contents in-place (apply replacements in reverse order to keep indices valid). let mut result = html.to_string(); diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md index 94796da..8b54a86 100644 --- a/docs/RSC_HYDRATION_FINDINGS.md +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -199,7 +199,7 @@ The marker `\x00SPLIT\x00` is chosen because: - Easily identifiable for splitting - Won't be confused with any escape sequence -**Implementation:** Marker constant at [nextjs.rs:903](crates/common/src/integrations/nextjs.rs#L903) and combine/split logic in [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) +**Implementation:** Marker constant at [nextjs.rs:40](crates/common/src/integrations/nextjs.rs#L40) and combine/split logic in [nextjs.rs:1000](crates/common/src/integrations/nextjs.rs#L1000) #### Step 2: Find T-Chunks Across Combined Content @@ -207,7 +207,7 @@ Scan the combined stream for `ID:T,` headers, then consume exactly ` The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. -**Implementation:** T-chunk discovery at [nextjs.rs:1002](crates/common/src/integrations/nextjs.rs#L1002) with marker-aware consumption in [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) +**Implementation:** T-chunk discovery at [nextjs.rs:980](crates/common/src/integrations/nextjs.rs#L980) with marker-aware escape sequence iterator at [nextjs.rs:643](crates/common/src/integrations/nextjs.rs#L643) #### Step 3: Rewrite URLs and Recalculate Lengths @@ -250,9 +250,10 @@ The post-processing is implemented as an integration hook, allowing other integr `T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: -- Counting unescaped bytes while accounting for `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [nextjs.rs:606](crates/common/src/integrations/nextjs.rs#L606) -- Consuming exactly *N unescaped bytes* to locate the end of a declared `T` chunk: [nextjs.rs:683](crates/common/src/integrations/nextjs.rs#L683) -- Marker-aware variants for cross-script processing (skip `RSC_MARKER` during counting/consumption): [nextjs.rs:988](crates/common/src/integrations/nextjs.rs#L988) and [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) +- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [nextjs.rs:643](crates/common/src/integrations/nextjs.rs#L643) +- Counting unescaped bytes: [nextjs.rs:780](crates/common/src/integrations/nextjs.rs#L780) +- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [nextjs.rs:785](crates/common/src/integrations/nextjs.rs#L785) +- Marker-aware byte length calculation for cross-script processing: [nextjs.rs:973](crates/common/src/integrations/nextjs.rs#L973) --- @@ -271,7 +272,7 @@ The solution handles multiple URL formats in RSC content: ### Regex Pattern -**Implementation:** Regex-based rewriting in [nextjs.rs:800](crates/common/src/integrations/nextjs.rs#L800) +**Implementation:** Regex-based rewriting in [nextjs.rs:870](crates/common/src/integrations/nextjs.rs#L870) This pattern handles: @@ -417,31 +418,31 @@ Because post-processing runs inside the HTML processor (before recompression), ` ## Implementation Files -| File | Purpose | -| -------------------------------------------- | --------------------------------------------- | -| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | -| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | -| `crates/common/src/integrations/mod.rs` | Module exports | -| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | -| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | -| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | +| File | Purpose | +| -------------------------------------------- | ------------------------------------------- | +| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | +| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | +| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | +| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | ### Key Functions in nextjs.rs | Function | Line | Purpose | | ---------------------------------------------- | ------------------------------------------------------ | ---------------------------------------------------- | -| `extract_rsc_push_payload` | [229](crates/common/src/integrations/nextjs.rs#L229) | Extract string from `self.__next_f.push([1, '...'])` | -| `calculate_unescaped_byte_length` | [606](crates/common/src/integrations/nextjs.rs#L606) | Count unescaped bytes with escape handling | -| `consume_unescaped_bytes` | [683](crates/common/src/integrations/nextjs.rs#L683) | Advance through string consuming N bytes | -| `find_tchunks` | [764](crates/common/src/integrations/nextjs.rs#L764) | Find T-chunks in single script | -| `rewrite_rsc_url_string` | [800](crates/common/src/integrations/nextjs.rs#L800) | URL rewriting with escape handling | -| `rewrite_rsc_tchunks` | [830](crates/common/src/integrations/nextjs.rs#L830) | Single-script T-chunk processing | -| `consume_unescaped_bytes_skip_markers` | [907](crates/common/src/integrations/nextjs.rs#L907) | Advance through string, skipping markers | -| `calculate_unescaped_byte_length_skip_markers` | [988](crates/common/src/integrations/nextjs.rs#L988) | Count unescaped bytes, excluding markers | -| `find_tchunks_with_markers` | [1002](crates/common/src/integrations/nextjs.rs#L1002) | Find T-chunks in marker-combined content | -| `rewrite_rsc_scripts_combined` | [1053](crates/common/src/integrations/nextjs.rs#L1053) | Cross-script T-chunk processing | -| `find_rsc_push_scripts` | [1162](crates/common/src/integrations/nextjs.rs#L1162) | Find all RSC scripts in HTML | -| `post_process_rsc_html` | [1242](crates/common/src/integrations/nextjs.rs#L1242) | Complete HTML post-processing | +| `extract_rsc_push_payload` | [257](crates/common/src/integrations/nextjs.rs#L257) | Extract string from `self.__next_f.push([1, '...'])` | +| `EscapeSequenceIter` | [643](crates/common/src/integrations/nextjs.rs#L643) | Shared iterator for escape sequence parsing | +| `calculate_unescaped_byte_length` | [780](crates/common/src/integrations/nextjs.rs#L780) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [785](crates/common/src/integrations/nextjs.rs#L785) | Advance through string consuming N bytes | +| `find_tchunks` | [865](crates/common/src/integrations/nextjs.rs#L865) | Find T-chunks in single script | +| `rewrite_rsc_url_string` | [870](crates/common/src/integrations/nextjs.rs#L870) | URL rewriting with escape handling | +| `rewrite_rsc_tchunks` | [900](crates/common/src/integrations/nextjs.rs#L900) | Single-script T-chunk processing | +| `calculate_unescaped_byte_length_skip_markers` | [973](crates/common/src/integrations/nextjs.rs#L973) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [980](crates/common/src/integrations/nextjs.rs#L980) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [1000](crates/common/src/integrations/nextjs.rs#L1000) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [1109](crates/common/src/integrations/nextjs.rs#L1109) | Find all RSC scripts in HTML | +| `post_process_rsc_html` | [1183](crates/common/src/integrations/nextjs.rs#L1183) | Complete HTML post-processing | --- From 373c15475526b06b4fd7df29407a56ee249bfb3d Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 18:20:59 -0800 Subject: [PATCH 06/11] Fixed formatting --- crates/common/src/integrations/nextjs.rs | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs index 6c82c58..c5c839c 100644 --- a/crates/common/src/integrations/nextjs.rs +++ b/crates/common/src/integrations/nextjs.rs @@ -767,7 +767,10 @@ impl Iterator for EscapeSequenceIter<'_> { Some(EscapeElement { byte_count: 1 }) } else { // Multi-byte UTF-8 character - let c = self.str_ref[self.pos..].chars().next().unwrap_or('\u{FFFD}'); + let c = self.str_ref[self.pos..] + .chars() + .next() + .unwrap_or('\u{FFFD}'); let len = c.len_utf8(); self.pos += len; Some(EscapeElement { byte_count: len }) @@ -830,8 +833,11 @@ fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { // Consume bytes using the appropriate iterator let content_end = if let Some(marker_bytes) = marker { - let mut iter = - EscapeSequenceIter::from_position_with_marker(content, header_end, marker_bytes); + let mut iter = EscapeSequenceIter::from_position_with_marker( + content, + header_end, + marker_bytes, + ); let mut consumed = 0; while consumed < declared_length { match iter.next() { From aeea7dc2e49033464591e1f36849d5eaf16a2eac Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 20:49:57 -0800 Subject: [PATCH 07/11] Another refactor --- crates/common/Cargo.toml | 1 + crates/common/src/html_processor.rs | 62 +- crates/common/src/integrations/nextjs.rs | 2453 ----------------- .../nextjs/fixtures/inlined-data-escaped.html | 7 + .../nextjs/fixtures/inlined-data-nonce.html | 8 + .../integrations/nextjs/html_post_process.rs | 333 +++ crates/common/src/integrations/nextjs/mod.rs | 345 +++ crates/common/src/integrations/nextjs/rsc.rs | 594 ++++ .../integrations/nextjs/script_rewriter.rs | 613 ++++ crates/common/src/publisher.rs | 26 +- crates/common/src/rsc_flight.rs | 54 - crates/common/src/settings.rs | 56 +- crates/common/src/streaming_replacer.rs | 27 - docs/RSC_HYDRATION_FINDINGS.md | 90 +- 14 files changed, 2057 insertions(+), 2612 deletions(-) delete mode 100644 crates/common/src/integrations/nextjs.rs create mode 100644 crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html create mode 100644 crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html create mode 100644 crates/common/src/integrations/nextjs/html_post_process.rs create mode 100644 crates/common/src/integrations/nextjs/mod.rs create mode 100644 crates/common/src/integrations/nextjs/rsc.rs create mode 100644 crates/common/src/integrations/nextjs/script_rewriter.rs diff --git a/crates/common/Cargo.toml b/crates/common/Cargo.toml index bbb8a2a..ab3058c 100644 --- a/crates/common/Cargo.toml +++ b/crates/common/Cargo.toml @@ -51,6 +51,7 @@ config = { workspace = true } derive_more = { workspace = true } error-stack = { workspace = true } http = { workspace = true } +log = { workspace = true } regex = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/common/src/html_processor.rs b/crates/common/src/html_processor.rs index cff0ccd..392fb0d 100644 --- a/crates/common/src/html_processor.rs +++ b/crates/common/src/html_processor.rs @@ -65,7 +65,7 @@ impl StreamProcessor for HtmlWithPostProcessing { } if changed { - log::info!( + log::debug!( "HTML post-processing complete: origin_host={}, output_len={}", self.origin_host, html.len() @@ -138,6 +138,37 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso fn protocol_relative_replacement(&self) -> String { format!("//{}", self.request_host) } + + fn rewrite_url_value(&self, value: &str) -> Option { + if !value.contains(&self.origin_host) { + return None; + } + + let https_origin = self.https_origin(); + let http_origin = self.http_origin(); + let protocol_relative_origin = self.protocol_relative_origin(); + let replacement_url = self.replacement_url(); + let protocol_relative_replacement = self.protocol_relative_replacement(); + + let mut rewritten = value + .replace(&https_origin, &replacement_url) + .replace(&http_origin, &replacement_url) + .replace(&protocol_relative_origin, &protocol_relative_replacement); + + if rewritten.starts_with(&self.origin_host) { + let suffix = &rewritten[self.origin_host.len()..]; + let boundary_ok = suffix.is_empty() + || matches!( + suffix.as_bytes().first(), + Some(b'/') | Some(b'?') | Some(b'#') + ); + if boundary_ok { + rewritten = format!("{}{}", self.request_host, suffix); + } + } + + (rewritten != value).then_some(rewritten) + } } let patterns = Rc::new(UrlPatterns { @@ -170,11 +201,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut href) = el.get_attribute("href") { let original_href = href.clone(); - let new_href = href - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_href != href { - href = new_href; + if let Some(rewritten) = patterns.rewrite_url_value(&href) { + href = rewritten; } match integrations.rewrite_attribute( @@ -211,11 +239,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut src) = el.get_attribute("src") { let original_src = src.clone(); - let new_src = src - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_src != src { - src = new_src; + if let Some(rewritten) = patterns.rewrite_url_value(&src) { + src = rewritten; } match integrations.rewrite_attribute( "src", @@ -251,11 +276,8 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso move |el| { if let Some(mut action) = el.get_attribute("action") { let original_action = action.clone(); - let new_action = action - .replace(&patterns.https_origin(), &patterns.replacement_url()) - .replace(&patterns.http_origin(), &patterns.replacement_url()); - if new_action != action { - action = new_action; + if let Some(rewritten) = patterns.rewrite_url_value(&action) { + action = rewritten; } match integrations.rewrite_attribute( @@ -510,8 +532,12 @@ mod tests { let html = r#" Link + Proto + Bare +

+ "#; let mut output = Vec::new(); @@ -521,8 +547,12 @@ mod tests { let result = String::from_utf8(output).unwrap(); assert!(result.contains(r#"href="https://test.example.com/page""#)); + assert!(result.contains(r#"href="//test.example.com/proto""#)); + assert!(result.contains(r#"href="test.example.com/bare""#)); assert!(result.contains(r#"src="https://test.example.com/image.jpg""#)); + assert!(result.contains(r#"src="//test.example.com/image2.jpg""#)); assert!(result.contains(r#"action="https://test.example.com/submit""#)); + assert!(result.contains(r#"action="//test.example.com/submit2""#)); assert!(!result.contains("origin.example.com")); } diff --git a/crates/common/src/integrations/nextjs.rs b/crates/common/src/integrations/nextjs.rs deleted file mode 100644 index c5c839c..0000000 --- a/crates/common/src/integrations/nextjs.rs +++ /dev/null @@ -1,2453 +0,0 @@ -use std::sync::Arc; - -use once_cell::sync::Lazy; -use regex::{escape, Regex}; -use serde::{Deserialize, Serialize}; -use validator::Validate; - -use crate::integrations::{ - IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationRegistration, - IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, -}; -use crate::settings::{IntegrationConfig, Settings}; - -const NEXTJS_INTEGRATION_ID: &str = "nextjs"; - -// ============================================================================= -// Cached Regex Patterns -// ============================================================================= - -/// T-chunk header pattern: hex_id:Thex_length, -static TCHUNK_PATTERN: Lazy = - Lazy::new(|| Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").expect("valid T-chunk regex")); - -/// RSC push payload pattern for extraction -static RSC_PUSH_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).expect("valid RSC push regex") -}); - -/// RSC push script pattern for HTML post-processing -static RSC_SCRIPT_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"]*>\s*self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#) - .expect("valid RSC script regex") -}); - -/// RSC script ending pattern -static RSC_SCRIPT_ENDING: Lazy = - Lazy::new(|| Regex::new(r#"^\s*\]\s*\)\s*;?\s*"#).expect("valid RSC ending regex")); - -/// Marker used to track script boundaries when combining RSC content -const RSC_MARKER: &str = "\x00SPLIT\x00"; - -#[derive(Debug, Clone, Deserialize, Serialize, Validate)] -pub struct NextJsIntegrationConfig { - #[serde(default = "default_enabled")] - pub enabled: bool, - #[serde( - default = "default_rewrite_attributes", - deserialize_with = "crate::settings::vec_from_seq_or_map" - )] - #[validate(length(min = 1))] - pub rewrite_attributes: Vec, -} - -impl IntegrationConfig for NextJsIntegrationConfig { - fn is_enabled(&self) -> bool { - self.enabled - } -} - -fn default_enabled() -> bool { - false -} - -fn default_rewrite_attributes() -> Vec { - vec!["href".to_string(), "link".to_string(), "url".to_string()] -} - -pub fn register(settings: &Settings) -> Option { - let config = match build(settings) { - Some(config) => { - log::info!( - "NextJS integration registered: enabled={}, rewrite_attributes={:?}", - config.enabled, - config.rewrite_attributes - ); - config - } - None => { - log::info!("NextJS integration not registered (disabled or missing config)"); - return None; - } - }; - - // Register both structured (Pages Router __NEXT_DATA__) and streamed (App Router RSC) - // rewriters. RSC payloads require LENGTH-PRESERVING URL replacement to avoid breaking - // React hydration - the RSC format uses byte positions for record boundaries. - let structured = Arc::new(NextJsScriptRewriter::new( - config.clone(), - NextJsRewriteMode::Structured, - )); - - let streamed = Arc::new(NextJsScriptRewriter::new( - config.clone(), - NextJsRewriteMode::Streamed, - )); - - // Register post-processor for cross-script RSC T-chunks - let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); - - Some( - IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) - .with_script_rewriter(structured) - .with_script_rewriter(streamed) - .with_html_post_processor(post_processor) - .build(), - ) -} - -/// Post-processor for handling cross-script RSC T-chunks. -struct NextJsHtmlPostProcessor { - config: Arc, -} - -impl NextJsHtmlPostProcessor { - fn new(config: Arc) -> Self { - Self { config } - } -} - -impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { - fn integration_id(&self) -> &'static str { - NEXTJS_INTEGRATION_ID - } - - fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { - if !self.config.enabled || self.config.rewrite_attributes.is_empty() { - return false; - } - - // Only Next.js App Router pages will contain `__next_f` pushes. - // Also require an origin host hit to avoid running on already-rewritten pages. - html.contains("__next_f.push") && html.contains(ctx.origin_host) - } - - fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { - // Note: should_process is already called by the HtmlWithPostProcessing wrapper, - // so we skip the redundant check here. - - if log::log_enabled!(log::Level::Debug) { - let origin_before = html.matches(ctx.origin_host).count(); - log::debug!( - "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", - html.len(), - origin_before, - ctx.origin_host, - ctx.request_scheme, - ctx.request_host - ); - } - - let result = - post_process_rsc_html(html, ctx.origin_host, ctx.request_host, ctx.request_scheme); - - if result == *html { - return false; - } - - *html = result; - true - } -} - -fn build(settings: &Settings) -> Option> { - let config = settings - .integration_config::(NEXTJS_INTEGRATION_ID) - .ok() - .flatten()?; - Some(Arc::new(config)) -} - -#[derive(Clone, Copy)] -enum NextJsRewriteMode { - Structured, - Streamed, -} - -struct NextJsScriptRewriter { - config: Arc, - mode: NextJsRewriteMode, -} - -impl NextJsScriptRewriter { - fn new(config: Arc, mode: NextJsRewriteMode) -> Self { - Self { config, mode } - } - - fn rewrite_structured( - &self, - content: &str, - ctx: &IntegrationScriptContext<'_>, - ) -> ScriptRewriteAction { - // For structured mode (__NEXT_DATA__), use simple URL replacement - if let Some(rewritten) = rewrite_nextjs_values( - content, - ctx.origin_host, - ctx.request_host, - ctx.request_scheme, - &self.config.rewrite_attributes, - false, // No length preservation needed for structured data - ) { - ScriptRewriteAction::replace(rewritten) - } else { - ScriptRewriteAction::keep() - } - } - - fn rewrite_streamed( - &self, - content: &str, - ctx: &IntegrationScriptContext<'_>, - ) -> ScriptRewriteAction { - // For streamed RSC payloads, we need T-chunk aware rewriting. - // This handles the case where T-chunk lengths need to be recalculated - // after URL rewriting. - // - // Try to extract the RSC payload from self.__next_f.push([1, '...']) - if let Some((payload, quote, start, end)) = extract_rsc_push_payload(content) { - let rewritten_payload = rewrite_rsc_tchunks( - payload, - ctx.origin_host, - ctx.request_host, - ctx.request_scheme, - ); - - if rewritten_payload != payload { - // Reconstruct the script with rewritten payload - let mut result = String::with_capacity(content.len()); - result.push_str(&content[..start]); - result.push(quote); - result.push_str(&rewritten_payload); - result.push(quote); - result.push_str(&content[end + 1..]); - return ScriptRewriteAction::replace(result); - } - } - - // Fallback: use simple URL rewriting for the entire content - // This handles non-standard RSC formats or other script patterns - let rewritten = rewrite_rsc_url_string( - content, - ctx.origin_host, - ctx.request_host, - ctx.request_scheme, - ); - - if rewritten != content { - return ScriptRewriteAction::replace(rewritten); - } - - ScriptRewriteAction::keep() - } -} - -/// Extract RSC payload from a self.__next_f.push([1, '...']) call -/// Returns (payload_content, quote_char, start_pos, end_pos) -/// Handles various whitespace patterns in the push call. -fn extract_rsc_push_payload(content: &str) -> Option<(&str, char, usize, usize)> { - let cap = RSC_PUSH_PATTERN.captures(content)?; - let quote_match = cap.get(1)?; - let quote = quote_match.as_str().chars().next()?; - let content_start = quote_match.end(); - - // Find matching closing quote - let search_from = &content[content_start..]; - let mut pos = 0; - let mut escape = false; - - for c in search_from.chars() { - if escape { - escape = false; - pos += c.len_utf8(); - continue; - } - if c == '\\' { - escape = true; - pos += 1; - continue; - } - if c == quote { - // Found closing quote - let content_end = content_start + pos; - return Some(( - &content[content_start..content_end], - quote, - content_start - 1, // Include opening quote position - content_end, // Position of closing quote - )); - } - pos += c.len_utf8(); - } - - None -} - -impl IntegrationScriptRewriter for NextJsScriptRewriter { - fn integration_id(&self) -> &'static str { - NEXTJS_INTEGRATION_ID - } - - fn selector(&self) -> &'static str { - match self.mode { - NextJsRewriteMode::Structured => "script#__NEXT_DATA__", - NextJsRewriteMode::Streamed => "script", - } - } - - fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { - if self.config.rewrite_attributes.is_empty() { - return ScriptRewriteAction::keep(); - } - - match self.mode { - NextJsRewriteMode::Structured => self.rewrite_structured(content, ctx), - NextJsRewriteMode::Streamed => { - // RSC push scripts (__next_f.push) are handled by the post-processor - // because T-chunks can span multiple scripts and require combined processing. - // Only handle non-RSC scripts here. - if content.contains("__next_f.push") { - return ScriptRewriteAction::keep(); - } - // For other __next_f scripts (like initialization), use simple URL rewriting - if content.contains("__next_f") { - return self.rewrite_streamed(content, ctx); - } - ScriptRewriteAction::keep() - } - } - } -} - -fn rewrite_nextjs_values( - content: &str, - origin_host: &str, - request_host: &str, - request_scheme: &str, - attributes: &[String], - preserve_length: bool, -) -> Option { - if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { - return None; - } - - // Build the rewriter context with regex patterns - // For RSC payloads (preserve_length=true), we must maintain exact byte positions - // to avoid breaking React hydration. - let rewriter = UrlRewriter::new( - origin_host, - request_host, - request_scheme, - attributes, - preserve_length, - ); - - // Use pure regex-based rewriting - no AST parsing needed - // The rewrite_embedded method handles all URL patterns with proper whitespace padding - rewriter.rewrite_embedded(content) -} - -/// Helper struct to hold URL rewriting configuration -struct UrlRewriter { - origin_host: String, - request_host: String, - request_scheme: String, - /// Regex patterns for embedded JSON in strings with URL scheme (e.g., \"href\":\"https://origin\") - embedded_patterns: Vec, - /// Regex patterns for bare hostname values (e.g., \"siteProductionDomain\":\"www.example.com\") - bare_host_patterns: Vec, - /// Whether to preserve URL length by padding (for RSC payloads) - preserve_length: bool, -} - -impl UrlRewriter { - fn new( - origin_host: &str, - request_host: &str, - request_scheme: &str, - attributes: &[String], - preserve_length: bool, - ) -> Self { - let escaped_origin = escape(origin_host); - - // Build patterns for embedded JSON strings with various escape levels - // Pattern 1: URLs with scheme (https://origin, http://origin, //origin) - // Also capture optional path and closing quote to add whitespace padding after - let embedded_patterns = attributes - .iter() - .map(|attr| { - let escaped_attr = escape(attr); - // Capture: prefix, scheme, path (optional), closing quote - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, - attr = escaped_attr, - origin = escaped_origin, - ); - Regex::new(&pattern).expect("valid Next.js rewrite regex") - }) - .collect(); - - // Pattern 2: Bare hostname without scheme (e.g., "siteProductionDomain":"www.example.com") - // This matches attribute:"hostname" where hostname is exactly the origin (no path) - let bare_host_patterns = attributes - .iter() - .map(|attr| { - let escaped_attr = escape(attr); - // Match attr":"origin" where origin is followed by end quote (no path) - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*"){origin}(?P\\*")"#, - attr = escaped_attr, - origin = escaped_origin, - ); - Regex::new(&pattern).expect("valid Next.js bare host rewrite regex") - }) - .collect(); - - Self { - origin_host: origin_host.to_string(), - request_host: request_host.to_string(), - request_scheme: request_scheme.to_string(), - embedded_patterns, - bare_host_patterns, - preserve_length, - } - } - - /// Rewrite a URL value string, returning (new_url, padding) if modified. - /// The padding is whitespace to add after the closing quote to preserve byte positions. - /// Uses the request scheme (http/https) for the rewritten URL. - #[cfg(test)] - fn rewrite_url_value(&self, url: &str) -> Option<(String, String)> { - let original_len = url.len(); - - // Check for https:// or http:// URLs - // Use the request scheme for the rewritten URL (e.g., http for localhost) - let new_url = if let Some(rest) = url.strip_prefix("https://") { - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - Some(format!( - "{}://{}{}", - self.request_scheme, self.request_host, path - )) - } else { - None - } - } else if let Some(rest) = url.strip_prefix("http://") { - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - Some(format!( - "{}://{}{}", - self.request_scheme, self.request_host, path - )) - } else { - None - } - } else if let Some(rest) = url.strip_prefix("//") { - // Protocol-relative URL - use request scheme - if rest.starts_with(&self.origin_host) { - let path = &rest[self.origin_host.len()..]; - Some(format!( - "{}://{}{}", - self.request_scheme, self.request_host, path - )) - } else { - None - } - } else if url == self.origin_host { - // Bare hostname without scheme (e.g., "siteProductionDomain":"www.example.com") - Some(self.request_host.clone()) - } else if url.starts_with(&self.origin_host) { - // Hostname with path but no scheme (e.g., "www.example.com/path") - let path = &url[self.origin_host.len()..]; - Some(format!("{}{}", self.request_host, path)) - } else { - None - }; - - // Calculate whitespace padding if length preservation is enabled - new_url.map(|url| { - let padding = if self.preserve_length { - Self::calculate_padding(url.len(), original_len) - } else { - String::new() - }; - (url, padding) - }) - } - - /// Calculate the whitespace padding needed after a URL replacement. - /// Returns empty string if no padding needed (URL is same length or longer). - /// - /// For RSC hydration, we add spaces AFTER the closing quote to preserve - /// byte positions in the JSON stream. This is preferred over URL path padding - /// because it keeps URLs clean and works for all URL types. - #[cfg(test)] - fn calculate_padding(new_url_len: usize, original_len: usize) -> String { - if new_url_len >= original_len { - String::new() - } else { - " ".repeat(original_len - new_url_len) - } - } - - /// Rewrite embedded JSON patterns in a string (for streamed payloads) - fn rewrite_embedded(&self, input: &str) -> Option { - let mut result = input.to_string(); - let mut changed = false; - - // First pass: URLs with scheme (https://, http://, //) - for regex in &self.embedded_patterns { - let origin_host = &self.origin_host; - let request_host = &self.request_host; - let request_scheme = &self.request_scheme; - let preserve_length = self.preserve_length; - - let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { - let prefix = &caps["prefix"]; - let scheme = &caps["scheme"]; - let path = &caps["path"]; - let quote = &caps["quote"]; - - // Calculate original URL length (scheme + origin_host + path) - let original_url_len = scheme.len() + origin_host.len() + path.len(); - - // Build replacement URL using the request scheme (e.g., http for localhost) - let new_url = format!("{}://{}{}", request_scheme, request_host, path); - - // Calculate whitespace padding if needed - let padding = if preserve_length && new_url.len() < original_url_len { - " ".repeat(original_url_len - new_url.len()) - } else { - String::new() - }; - - // Return: prefix + new_url + quote + padding (spaces after closing quote) - format!("{}{}{}{}", prefix, new_url, quote, padding) - }); - if next_value != result { - changed = true; - result = next_value.into_owned(); - } - } - - // Second pass: Bare hostnames without scheme (e.g., "siteProductionDomain":"www.example.com") - for regex in &self.bare_host_patterns { - let origin_host = &self.origin_host; - let request_host = &self.request_host; - let preserve_length = self.preserve_length; - - let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { - let prefix = &caps["prefix"]; - let suffix = &caps["suffix"]; - - // Calculate padding for bare hostnames - let padding = if preserve_length && request_host.len() < origin_host.len() { - " ".repeat(origin_host.len() - request_host.len()) - } else { - String::new() - }; - - format!("{}{}{}{}", prefix, request_host, suffix, padding) - }); - if next_value != result { - changed = true; - result = next_value.into_owned(); - } - } - - changed.then_some(result) - } -} - -// ============================================================================= -// RSC (React Server Components) T-Chunk Rewriter -// ============================================================================= -// -// Next.js App Router uses React Server Components (RSC) with a streaming flight -// protocol. RSC data is delivered via inline scripts calling `self.__next_f.push()`. -// -// ## RSC Flight Protocol Format -// -// RSC records are separated by `\n` (literal backslash-n in JS strings). -// Each record has format: `ID:DATA` where ID is a hex string (e.g., "1a", "443"). -// -// Record types include: -// - T-chunks (text): `ID:T,` - The most important for rewriting -// - JSON arrays: `ID:[...]` -// - JSON objects: `ID:{...}` -// - Module imports: `ID:I[...]` -// - Head links: `ID:HL[...]` -// - References: `ID:$ref` -// - Strings: `ID:"..."` -// - Null: `ID:null` -// -// ## T-Chunk Format Details -// -// T-chunks contain text data with an explicit byte length: -// ``` -// 1a:T29,{"url":"https://origin.example.com/path"} -// ``` -// - `1a` = chunk ID (hex) -// - `T` = text chunk marker -// - `29` = content length in hex (0x29 = 41 bytes UNESCAPED) -// - `,` = separator -// - Content follows, exactly 41 unescaped bytes -// -// The hex_length is the UNESCAPED byte count - escape sequences like `\n` count -// as 1 byte, `\uHHHH` counts as the UTF-8 byte length of the character, etc. -// -// ## Why T-Chunk Length Matters -// -// React's RSC parser uses byte offsets to navigate the stream. If we rewrite -// URLs without updating T-chunk lengths, the parser reads wrong byte ranges, -// corrupting the data and breaking hydration. -// -// Example: Changing `origin.example.com` (18 chars) to `proxy.io` (8 chars) -// shrinks content by 10 bytes. The T-chunk header must be updated from -// `T29,` to `T1f,` (41 -> 31 bytes). -// -// ## Cross-Script T-Chunks -// -// T-chunks CAN span multiple push scripts: -// - Script 10: `11:null\n1a:T928,` (header only, declares 928 bytes) -// - Script 11: `...actual content...` (the 928 bytes of content) -// -// Our per-script processing handles most cases correctly. For cross-script -// T-chunks, the header script won't have URLs to rewrite (just the header), -// and the content script will be rewritten with correct byte counting. - -// ============================================================================= -// Escape Sequence Parsing -// ============================================================================= -// -// JS escape sequences are parsed by a shared iterator to avoid code duplication. -// The iterator yields (source_len, unescaped_byte_count) for each logical unit. - -/// A single parsed element from a JS string -struct EscapeElement { - /// Number of unescaped bytes this represents - byte_count: usize, -} - -/// Iterator over escape sequences in a JS string. -/// Yields (source_len, unescaped_byte_count) for each element. -struct EscapeSequenceIter<'a> { - bytes: &'a [u8], - str_ref: &'a str, - pos: usize, - skip_marker: Option<&'a [u8]>, -} - -impl<'a> EscapeSequenceIter<'a> { - fn new(s: &'a str) -> Self { - Self { - bytes: s.as_bytes(), - str_ref: s, - pos: 0, - skip_marker: None, - } - } - - fn with_marker(s: &'a str, marker: &'a [u8]) -> Self { - Self { - bytes: s.as_bytes(), - str_ref: s, - pos: 0, - skip_marker: Some(marker), - } - } - - fn from_position(s: &'a str, start: usize) -> Self { - Self { - bytes: s.as_bytes(), - str_ref: s, - pos: start, - skip_marker: None, - } - } - - fn from_position_with_marker(s: &'a str, start: usize, marker: &'a [u8]) -> Self { - Self { - bytes: s.as_bytes(), - str_ref: s, - pos: start, - skip_marker: Some(marker), - } - } - - /// Current position in the source string - fn position(&self) -> usize { - self.pos - } -} - -impl Iterator for EscapeSequenceIter<'_> { - type Item = EscapeElement; - - fn next(&mut self) -> Option { - if self.pos >= self.bytes.len() { - return None; - } - - // Check for marker to skip - if let Some(marker) = self.skip_marker { - if self.pos + marker.len() <= self.bytes.len() - && &self.bytes[self.pos..self.pos + marker.len()] == marker - { - self.pos += marker.len(); - return Some(EscapeElement { byte_count: 0 }); // Markers don't count - } - } - - // Check for escape sequence - if self.bytes[self.pos] == b'\\' && self.pos + 1 < self.bytes.len() { - let esc = self.bytes[self.pos + 1]; - - // Simple escape sequences: \n, \r, \t, \b, \f, \v, \", \', \\, \/ - if matches!( - esc, - b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' - ) { - self.pos += 2; - return Some(EscapeElement { byte_count: 1 }); - } - - // \xHH - hex escape (1 byte) - if esc == b'x' && self.pos + 3 < self.bytes.len() { - self.pos += 4; - return Some(EscapeElement { byte_count: 1 }); - } - - // \uHHHH - unicode escape - if esc == b'u' && self.pos + 5 < self.bytes.len() { - let hex = &self.str_ref[self.pos + 2..self.pos + 6]; - if hex.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit) = u16::from_str_radix(hex, 16) { - // Check for surrogate pair - if (0xD800..=0xDBFF).contains(&code_unit) - && self.pos + 11 < self.bytes.len() - && self.bytes[self.pos + 6] == b'\\' - && self.bytes[self.pos + 7] == b'u' - { - let hex2 = &self.str_ref[self.pos + 8..self.pos + 12]; - if hex2.chars().all(|c| c.is_ascii_hexdigit()) { - if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { - if (0xDC00..=0xDFFF).contains(&code_unit2) { - // Full surrogate pair = 4 UTF-8 bytes - self.pos += 12; - return Some(EscapeElement { byte_count: 4 }); - } - } - } - } - - // Single unicode escape - calculate UTF-8 byte length - let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); - self.pos += 6; - return Some(EscapeElement { - byte_count: c.len_utf8(), - }); - } - } - } - } - - // Regular character - count its UTF-8 byte length - if self.bytes[self.pos] < 0x80 { - self.pos += 1; - Some(EscapeElement { byte_count: 1 }) - } else { - // Multi-byte UTF-8 character - let c = self.str_ref[self.pos..] - .chars() - .next() - .unwrap_or('\u{FFFD}'); - let len = c.len_utf8(); - self.pos += len; - Some(EscapeElement { byte_count: len }) - } - } -} - -/// Calculate the unescaped byte length of a JS string with escape sequences. -/// This accounts for \n, \r, \t, \\, \", \xHH, \uHHHH, and surrogate pairs. -fn calculate_unescaped_byte_length(s: &str) -> usize { - EscapeSequenceIter::new(s).map(|e| e.byte_count).sum() -} - -/// Consume a specified number of unescaped bytes from a JS string, returning the end position. -fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usize, usize) { - let mut iter = EscapeSequenceIter::from_position(s, start_pos); - let mut consumed = 0; - - while consumed < byte_count { - match iter.next() { - Some(elem) => consumed += elem.byte_count, - None => break, - } - } - - (iter.position(), consumed) -} - -/// Information about a T-chunk found in the combined RSC content -struct TChunkInfo { - /// The chunk ID (hex string like "1a", "443") - id: String, - /// Position where the T-chunk header starts (e.g., position of "1a:T...") - match_start: usize, - /// Position right after the comma (where content begins) - header_end: usize, - /// Position where the content ends - content_end: usize, -} - -/// Find all T-chunks in content, optionally skipping markers. -fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { - let mut chunks = Vec::new(); - let mut search_pos = 0; - let marker = if skip_markers { - Some(RSC_MARKER.as_bytes()) - } else { - None - }; - - while search_pos < content.len() { - if let Some(cap) = TCHUNK_PATTERN.captures(&content[search_pos..]) { - let m = cap.get(0).unwrap(); - let match_start = search_pos + m.start(); - let header_end = search_pos + m.end(); - - let id = cap.get(1).unwrap().as_str().to_string(); - let length_hex = cap.get(2).unwrap().as_str(); - let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); - - // Consume bytes using the appropriate iterator - let content_end = if let Some(marker_bytes) = marker { - let mut iter = EscapeSequenceIter::from_position_with_marker( - content, - header_end, - marker_bytes, - ); - let mut consumed = 0; - while consumed < declared_length { - match iter.next() { - Some(elem) => consumed += elem.byte_count, - None => break, - } - } - iter.position() - } else { - let (pos, _) = consume_unescaped_bytes(content, header_end, declared_length); - pos - }; - - chunks.push(TChunkInfo { - id, - match_start, - header_end, - content_end, - }); - - search_pos = content_end; - } else { - break; - } - } - - chunks -} - -/// Find all T-chunks in RSC content (no markers). -fn find_tchunks(content: &str) -> Vec { - find_tchunks_impl(content, false) -} - -/// Rewrite URLs in a string, handling various URL formats in RSC content. -fn rewrite_rsc_url_string( - s: &str, - origin_host: &str, - request_host: &str, - request_scheme: &str, -) -> String { - let escaped_origin = escape(origin_host); - - // Match various URL patterns: - // - https://host or http://host - // - //host (protocol-relative) - // - \/\/host (escaped slashes in JSON) - // - \\\/\\\/host (double-escaped) - // - \\\\/\\\\/host (quad-escaped) - let pattern = Regex::new(&format!( - r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, - escaped_origin - )) - .unwrap(); - - pattern - .replace_all(s, |caps: ®ex::Captures<'_>| { - let slashes = caps.get(3).map_or("//", |m| m.as_str()); - format!("{}:{}{}", request_scheme, slashes, request_host) - }) - .into_owned() -} - -/// Rewrite T-chunks in RSC content, updating lengths after URL rewriting. -/// This works for single scripts where T-chunks don't span script boundaries. -fn rewrite_rsc_tchunks( - content: &str, - origin_host: &str, - request_host: &str, - request_scheme: &str, -) -> String { - let chunks = find_tchunks(content); - - if chunks.is_empty() { - // No T-chunks, just rewrite URLs in the whole content - return rewrite_rsc_url_string(content, origin_host, request_host, request_scheme); - } - - let mut result = String::with_capacity(content.len()); - let mut last_end = 0; - - for chunk in &chunks { - // Content before this T-chunk (rewrite URLs) - let before = &content[last_end..chunk.match_start]; - result.push_str(&rewrite_rsc_url_string( - before, - origin_host, - request_host, - request_scheme, - )); - - // Extract and rewrite T-chunk content - let chunk_content = &content[chunk.header_end..chunk.content_end]; - let rewritten_content = - rewrite_rsc_url_string(chunk_content, origin_host, request_host, request_scheme); - - // Calculate new byte length - let new_length = calculate_unescaped_byte_length(&rewritten_content); - let new_length_hex = format!("{:x}", new_length); - - // Write new T-chunk header and content - result.push_str(&chunk.id); - result.push_str(":T"); - result.push_str(&new_length_hex); - result.push(','); - result.push_str(&rewritten_content); - - last_end = chunk.content_end; - } - - // Remaining content after last T-chunk - let remaining = &content[last_end..]; - result.push_str(&rewrite_rsc_url_string( - remaining, - origin_host, - request_host, - request_scheme, - )); - - result -} - -// ============================================================================= -// Cross-Script RSC Processing -// ============================================================================= -// -// T-chunks can span multiple push scripts. For example: -// - Script 10: "11:null\n1a:T928," (header declares 928 bytes, but script ends) -// - Script 11: "...actual 928 bytes of content..." -// -// To handle this correctly, we must process all scripts together: -// 1. Combine scripts with markers -// 2. Find T-chunks across the combined content (skip markers when counting bytes) -// 3. Rewrite URLs and recalculate lengths -// 4. Split back on markers -// - -/// Calculate unescaped byte length excluding RSC markers. -fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { - EscapeSequenceIter::with_marker(s, RSC_MARKER.as_bytes()) - .map(|e| e.byte_count) - .sum() -} - -/// Find T-chunks in marker-combined RSC content. -fn find_tchunks_with_markers(content: &str) -> Vec { - find_tchunks_impl(content, true) -} - -/// Process multiple RSC script payloads together, handling cross-script T-chunks. -/// -/// This function: -/// 1. Combines all payloads with markers -/// 2. Finds T-chunks across the combined content -/// 3. Rewrites URLs and recalculates T-chunk lengths -/// 4. Splits back on markers to return individual rewritten payloads -/// -/// # Arguments -/// * `payloads` - The string content from each `self.__next_f.push([1, '...'])` call -/// * `origin_host` - The origin host to replace -/// * `request_host` - The request host to use in replacements -/// * `request_scheme` - The scheme (http/https) to use in replacements -/// -/// # Returns -/// A vector of rewritten payloads in the same order as input -pub fn rewrite_rsc_scripts_combined( - payloads: &[&str], - origin_host: &str, - request_host: &str, - request_scheme: &str, -) -> Vec { - if payloads.is_empty() { - return Vec::new(); - } - - if payloads.len() == 1 { - // Single script - use simple approach - return vec![rewrite_rsc_tchunks( - payloads[0], - origin_host, - request_host, - request_scheme, - )]; - } - - // Combine payloads with markers - let mut combined = payloads[0].to_string(); - for payload in &payloads[1..] { - combined.push_str(RSC_MARKER); - combined.push_str(payload); - } - - // Find T-chunks in combined content - let chunks = find_tchunks_with_markers(&combined); - - if chunks.is_empty() { - // No T-chunks - just rewrite URLs in each payload - return payloads - .iter() - .map(|p| rewrite_rsc_url_string(p, origin_host, request_host, request_scheme)) - .collect(); - } - - // Build rewritten combined content - let mut result = String::with_capacity(combined.len()); - let mut last_end = 0; - - for chunk in &chunks { - // Content before this T-chunk (rewrite URLs, preserve markers) - let before = &combined[last_end..chunk.match_start]; - result.push_str(&rewrite_rsc_url_string( - before, - origin_host, - request_host, - request_scheme, - )); - - // Extract T-chunk content (may contain markers) - let chunk_content = &combined[chunk.header_end..chunk.content_end]; - - // Rewrite URLs (preserves markers) - let rewritten_content = - rewrite_rsc_url_string(chunk_content, origin_host, request_host, request_scheme); - - // Calculate new byte length (excluding markers) - let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); - let new_length_hex = format!("{:x}", new_length); - - // Write new T-chunk header and content - result.push_str(&chunk.id); - result.push_str(":T"); - result.push_str(&new_length_hex); - result.push(','); - result.push_str(&rewritten_content); - - last_end = chunk.content_end; - } - - // Remaining content after last T-chunk - let remaining = &combined[last_end..]; - result.push_str(&rewrite_rsc_url_string( - remaining, - origin_host, - request_host, - request_scheme, - )); - - // Split back on markers - result.split(RSC_MARKER).map(|s| s.to_string()).collect() -} - -/// Information about an RSC push script in HTML -struct RscPushScript { - /// Start position of the payload content (inside the quotes). - payload_start: usize, - /// End position of the payload content (inside the quotes). - payload_end: usize, - /// The payload content (inside the quotes) - payload: String, -} - -/// Find all RSC push scripts in HTML content. -/// Returns scripts in order of appearance. -/// -/// Handles both minified format: `` -/// and prettified format with whitespace: -/// ```html -/// -/// ``` -fn find_rsc_push_scripts(html: &str) -> Vec { - let mut scripts = Vec::new(); - let mut search_pos = 0; - - while search_pos < html.len() { - let Some(cap) = RSC_SCRIPT_PATTERN.captures(&html[search_pos..]) else { - break; - }; - - let quote_match = cap.get(1).unwrap(); - let quote = quote_match.as_str().chars().next().unwrap(); - let payload_start = search_pos + quote_match.end(); - - // Find the closing quote (handling escapes) - let mut i = payload_start; - let bytes = html.as_bytes(); - while i < bytes.len() { - if bytes[i] == b'\\' { - i += 2; // Skip escape sequence - } else if bytes[i] == quote as u8 { - break; - } else { - i += 1; - } - } - - if i >= bytes.len() { - search_pos = payload_start; - continue; - } - - // After the closing quote, look for ]) with optional whitespace - let after_quote = &html[i + 1..]; - - let Some(ending_match) = RSC_SCRIPT_ENDING.find(after_quote) else { - search_pos = payload_start; - continue; - }; - - let payload = html[payload_start..i].to_string(); - let payload_end = i; - let script_end = i + 1 + ending_match.end(); - - scripts.push(RscPushScript { - payload_start, - payload_end, - payload, - }); - - search_pos = script_end; - } - - scripts -} - -/// Post-process complete HTML to handle cross-script RSC T-chunks. -/// -/// This function: -/// 1. Finds all RSC push scripts in the HTML -/// 2. Extracts their payloads -/// 3. Processes them together using the combined approach -/// 4. Rebuilds the HTML with rewritten scripts -/// -/// This should be called after streaming HTML processing to fix T-chunk lengths -/// that span multiple scripts. -/// -/// # Arguments -/// * `html` - The complete HTML content (must be valid UTF-8) -/// * `origin_host` - The origin host to replace -/// * `request_host` - The request host to use in replacements -/// * `request_scheme` - The scheme (http/https) to use in replacements -/// -/// # Returns -/// The HTML with RSC scripts rewritten to have correct T-chunk lengths -pub fn post_process_rsc_html( - html: &str, - origin_host: &str, - request_host: &str, - request_scheme: &str, -) -> String { - let scripts = find_rsc_push_scripts(html); - - if scripts.is_empty() { - return html.to_string(); - } - - // Extract payloads - let payloads: Vec<&str> = scripts.iter().map(|s| s.payload.as_str()).collect(); - - if log::log_enabled!(log::Level::Debug) { - let origin_count_before: usize = payloads - .iter() - .map(|p| p.matches(origin_host).count()) - .sum(); - log::debug!( - "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", - scripts.len(), - origin_count_before, - origin_host, - request_scheme, - request_host - ); - } - - // Process all scripts together - let rewritten_payloads = - rewrite_rsc_scripts_combined(&payloads, origin_host, request_host, request_scheme); - - if log::log_enabled!(log::Level::Debug) { - let origin_count_after: usize = rewritten_payloads - .iter() - .map(|p| p.matches(origin_host).count()) - .sum(); - let proxy_count: usize = rewritten_payloads - .iter() - .map(|p| p.matches(request_host).count()) - .sum(); - log::debug!( - "post_process_rsc_html: after rewriting - {} origin URLs remaining, {} proxy URLs", - origin_count_after, - proxy_count - ); - } - - // Replace payload contents in-place (apply replacements in reverse order to keep indices valid). - let mut result = html.to_string(); - for (i, script) in scripts.iter().enumerate().rev() { - result.replace_range( - script.payload_start..script.payload_end, - &rewritten_payloads[i], - ); - } - - result -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; - use crate::integrations::{IntegrationRegistry, IntegrationScriptContext, ScriptRewriteAction}; - use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; - use crate::test_support::tests::create_test_settings; - use serde_json::json; - use std::io::Cursor; - - fn test_config() -> Arc { - Arc::new(NextJsIntegrationConfig { - enabled: true, - rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], - }) - } - - fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { - IntegrationScriptContext { - selector, - request_host: "ts.example.com", - request_scheme: "https", - origin_host: "origin.example.com", - } - } - - #[test] - fn structured_rewriter_updates_next_data_payload() { - let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); - let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); - - match result { - ScriptRewriteAction::Replace(value) => { - // Note: URLs may have padding for length preservation - assert!(value.contains("ts.example.com") && value.contains("/reviews")); - assert!(value.contains("ts.example.com") && value.contains("/sign-in")); - assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); - assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); - } - _ => panic!("Expected rewrite to update payload"), - } - } - - #[test] - fn streamed_rewriter_skips_non_next_payloads() { - // The streamed rewriter skips RSC push scripts (self.__next_f.push) - // because these are handled by the post-processor for cross-script T-chunks. - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); - - // Non-Next.js scripts should be skipped - let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); - assert!(matches!(noop, ScriptRewriteAction::Keep)); - - // RSC push payloads should be skipped (handled by post-processor) - let payload = - r#"self.__next_f.push([1, "{\"href\":\"https://origin.example.com/app\"}"]);"#; - let result = rewriter.rewrite(payload, &ctx("script")); - assert!( - matches!(result, ScriptRewriteAction::Keep), - "Streamed rewriter should skip __next_f.push payloads (handled by post-processor)" - ); - - // Other __next_f scripts (like initialization) should still be processed - let init_script = r#"(self.__next_f = self.__next_f || []).push([0]); var url = "https://origin.example.com/api";"#; - let init_result = rewriter.rewrite(init_script, &ctx("script")); - // This might or might not be rewritten depending on content - just verify it runs - assert!( - matches!( - init_result, - ScriptRewriteAction::Keep | ScriptRewriteAction::Replace(_) - ), - "Streamed rewriter should handle non-push __next_f scripts" - ); - } - - #[test] - fn rewrite_helper_handles_protocol_relative_urls() { - let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; - let rewritten = rewrite_nextjs_values( - content, - "origin.example.com", - "ts.example.com", - "https", - &["link".into()], - false, // preserve_length=false for non-RSC content - ) - .expect("should rewrite protocol relative link"); - - // Note: URLs may have padding for length preservation - assert!(rewritten.contains("ts.example.com") && rewritten.contains("/image.png")); - } - - fn config_from_settings( - settings: &Settings, - registry: &IntegrationRegistry, - ) -> HtmlProcessorConfig { - HtmlProcessorConfig::from_settings( - settings, - registry, - "origin.example.com", - "test.example.com", - "https", - ) - } - - #[test] - fn html_processor_rewrites_nextjs_script_when_enabled() { - let html = r#" - - "#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let processed = String::from_utf8_lossy(&output); - - // Note: URLs may have padding characters for length preservation - assert!( - processed.contains("test.example.com") && processed.contains("/reviews"), - "should rewrite https Next.js href values to test.example.com" - ); - assert!( - processed.contains("test.example.com") && processed.contains("/sign-in"), - "should rewrite http Next.js href values to test.example.com" - ); - assert!( - processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), - "should leave other fields untouched" - ); - assert!( - processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), - "should not rewrite non-href keys" - ); - assert!( - !processed.contains("\"href\":\"https://origin.example.com/reviews\""), - "should remove origin https href" - ); - assert!( - !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), - "should remove origin http href" - ); - } - - #[test] - fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { - // RSC payloads (self.__next_f.push) are rewritten via post-processing. - // The streaming phase skips RSC push scripts, and the HTML post-processor handles them - // at end-of-document to correctly handle cross-script T-chunks. - let html = r#" - - "#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "link", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - - let final_html = String::from_utf8_lossy(&output); - - // RSC payloads should be rewritten via end-of-document post-processing - assert!( - final_html.contains("test.example.com"), - "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", - final_html - ); - } - - #[test] - fn html_processor_rewrites_rsc_stream_payload_with_chunked_input() { - // RSC payloads are rewritten via post-processing, even with chunked streaming input - let html = r#" - - "#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["href", "url"], - }), - ) - .expect("should update nextjs config"); - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 32, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - - let final_html = String::from_utf8_lossy(&output); - - // RSC payloads should be rewritten via end-of-document post-processing - assert!( - final_html.contains("test.example.com"), - "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", - final_html - ); - } - - #[test] - fn register_respects_enabled_flag() { - let settings = create_test_settings(); - let registration = register(&settings); - - assert!( - registration.is_none(), - "should skip registration when integration is disabled" - ); - } - - #[test] - fn html_processor_rewrites_rsc_payloads_with_length_preservation() { - // RSC payloads (self.__next_f.push) are rewritten via post-processing. - // This allows navigation to stay on proxy while correctly handling cross-script T-chunks. - - let html = r#" - -"#; - - let mut settings = create_test_settings(); - settings - .integrations - .insert_config( - "nextjs", - &json!({ - "enabled": true, - "rewrite_attributes": ["url"], - }), - ) - .expect("should update nextjs config"); - - let registry = IntegrationRegistry::new(&settings); - let config = config_from_settings(&settings, ®istry); - let processor = create_html_processor(config); - let pipeline_config = PipelineConfig { - input_compression: Compression::None, - output_compression: Compression::None, - chunk_size: 8192, - }; - let mut pipeline = StreamingPipeline::new(pipeline_config, processor); - - let mut output = Vec::new(); - pipeline - .process(Cursor::new(html.as_bytes()), &mut output) - .unwrap(); - let final_html = String::from_utf8_lossy(&output); - - // RSC payloads should be rewritten via post-processing - assert!( - final_html.contains("test.example.com"), - "RSC payload URLs should be rewritten to proxy host. Output: {}", - final_html - ); - - // Verify the RSC payload structure is preserved - assert!( - final_html.contains(r#""ID":879000"#), - "RSC payload ID should be preserved" - ); - assert!( - final_html.contains(r#""title":"Makes""#), - "RSC payload title should be preserved" - ); - assert!( - final_html.contains(r#""children":"$45a""#), - "RSC payload children reference should be preserved" - ); - - // Verify \n separators are preserved (crucial for RSC parsing) - assert!( - final_html.contains(r#"\n442:"#), - "RSC record separator \\n should be preserved. Output: {}", - final_html - ); - } - - #[test] - fn test_tchunk_length_recalculation() { - // Test that T-chunk lengths are correctly recalculated after URL rewriting. - // T-chunk format: ID:T, - // The hex_length is the UNESCAPED byte count of the content. - - // Original content: {"url":"https://origin.example.com/path"} = 41 bytes = 0x29 - // After rewriting: {"url":"https://test.example.com/path"} = 39 bytes = 0x27 - // (origin.example.com is 18 chars, test.example.com is 16 chars - shrinks by 2) - let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; - let result = - rewrite_rsc_tchunks(content, "origin.example.com", "test.example.com", "https"); - - assert!( - result.contains("test.example.com"), - "URL should be rewritten" - ); - assert!( - result.starts_with("1a:T27,"), - "T-chunk length should be updated from 29 (41) to 27 (39). Got: {}", - result - ); - } - - #[test] - fn test_tchunk_length_recalculation_with_length_increase() { - // Test that T-chunk lengths are correctly recalculated when URL length increases. - // Original: short.io (8 chars) -> test.example.com (16 chars) - grows by 8 - - // Content: {"url":"https://short.io/x"} = 28 bytes = 0x1c - // After: {"url":"https://test.example.com/x"} = 36 bytes = 0x24 - let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; - let result = rewrite_rsc_tchunks(content, "short.io", "test.example.com", "https"); - - assert!( - result.contains("test.example.com"), - "URL should be rewritten" - ); - assert!( - result.starts_with("1a:T24,"), - "T-chunk length should be updated from 1c (28) to 24 (36). Got: {}", - result - ); - } - - #[test] - fn test_calculate_unescaped_byte_length() { - // Test the unescaped byte length calculation - assert_eq!(calculate_unescaped_byte_length("hello"), 5); - assert_eq!(calculate_unescaped_byte_length(r#"\n"#), 1); // \n = 1 byte - assert_eq!(calculate_unescaped_byte_length(r#"\r\n"#), 2); // \r\n = 2 bytes - assert_eq!(calculate_unescaped_byte_length(r#"\""#), 1); // \" = 1 byte - assert_eq!(calculate_unescaped_byte_length(r#"\\"#), 1); // \\ = 1 byte - assert_eq!(calculate_unescaped_byte_length(r#"\x41"#), 1); // \x41 = 'A' = 1 byte - assert_eq!(calculate_unescaped_byte_length(r#"\u0041"#), 1); // \u0041 = 'A' = 1 byte - assert_eq!(calculate_unescaped_byte_length(r#"\u00e9"#), 2); // \u00e9 = 'é' = 2 UTF-8 bytes - } - - #[test] - fn test_multiple_tchunks() { - // Test content with multiple T-chunks - let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; - let result = rewrite_rsc_tchunks(content, "short.io", "test.example.com", "https"); - - // Both T-chunks should have updated lengths - assert!( - result.contains("test.example.com"), - "URLs should be rewritten" - ); - // Both chunks should have new length 0x24 (36 bytes) - let count = result.matches(":T24,").count(); - assert_eq!(count, 2, "Both T-chunks should have updated lengths"); - } - - #[test] - fn test_cross_script_tchunk_rewriting() { - // Test T-chunks that span multiple scripts. - // This is the key scenario that breaks per-script processing. - // - // Script 0: Contains a T-chunk header that declares more content than is in this script - // Script 1: Contains the rest of the T-chunk content, including URLs that need rewriting - - // T-chunk declares 64 bytes (0x40), but script 0 only has partial content - let script0 = r#"other:data\n1a:T40,partial content"#; - // Script 1 has the rest of the T-chunk content with a URL - let script1 = r#" with https://origin.example.com/page goes here"#; - - // Check the actual combined byte lengths - let combined_content = "partial content with https://origin.example.com/page goes here"; - let combined_len = calculate_unescaped_byte_length(combined_content); - println!( - "Combined T-chunk content length: {} bytes = 0x{:x}", - combined_len, combined_len - ); - - // Process using combined approach - let payloads: Vec<&str> = vec![script0, script1]; - let results = rewrite_rsc_scripts_combined( - &payloads, - "origin.example.com", - "test.example.com", - "https", - ); - - println!("Results[0]: {}", results[0]); - println!("Results[1]: {}", results[1]); - - assert_eq!(results.len(), 2, "Should return same number of scripts"); - - // The URL should be rewritten in script 1 - assert!( - results[1].contains("test.example.com"), - "URL in script 1 should be rewritten. Got: {}", - results[1] - ); - - // The T-chunk header in script 0 should have updated length - // Let's check what the new length actually is - let rewritten_content = "partial content with https://test.example.com/page goes here"; - let rewritten_len = calculate_unescaped_byte_length(rewritten_content); - println!( - "Rewritten T-chunk content length: {} bytes = 0x{:x}", - rewritten_len, rewritten_len - ); - - let expected_header = format!(":T{:x},", rewritten_len); - assert!( - results[0].contains(&expected_header), - "T-chunk length in script 0 should be updated to {}. Got: {}", - expected_header, - results[0] - ); - } - - #[test] - fn test_cross_script_preserves_non_tchunk_content() { - // Test that content outside T-chunks is still rewritten correctly - let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T40,partial"#; - let script1 = r#" content with https://origin.example.com/page end"#; - - let payloads: Vec<&str> = vec![script0, script1]; - let results = rewrite_rsc_scripts_combined( - &payloads, - "origin.example.com", - "test.example.com", - "https", - ); - - // URL outside T-chunk in script 0 should be rewritten - assert!( - results[0].contains("test.example.com/first"), - "URL outside T-chunk should be rewritten. Got: {}", - results[0] - ); - - // URL inside T-chunk (spanning scripts) should be rewritten - assert!( - results[1].contains("test.example.com/page"), - "URL inside cross-script T-chunk should be rewritten. Got: {}", - results[1] - ); - } - - #[test] - fn test_post_process_rsc_html() { - // Test the complete HTML post-processing function - let html = r#" - - -"#; - - let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); - - // The URL should be rewritten - assert!( - result.contains("test.example.com/page"), - "URL should be rewritten. Got: {}", - result - ); - - // The T-chunk length should be updated - assert!( - result.contains(":T3c,"), - "T-chunk length should be updated. Got: {}", - result - ); - - // HTML structure should be preserved - assert!(result.contains("") && result.contains("")); - assert!(result.contains("self.__next_f.push")); - } - - #[test] - fn test_post_process_rsc_html_with_prettified_format() { - // Test with prettified HTML format (newlines and whitespace between elements) - // This is the format Next.js uses when outputting non-minified HTML - let html = r#" - - -"#; - - let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); - - // Both URLs should be rewritten - assert!( - result.contains("test.example.com/news"), - "First URL should be rewritten. Got: {}", - result - ); - assert!( - result.contains("test.example.com/reviews"), - "Second URL should be rewritten. Got: {}", - result - ); - - // No origin URLs should remain - assert!( - !result.contains("origin.example.com"), - "No origin URLs should remain. Got: {}", - result - ); - - // HTML structure should be preserved - assert!(result.contains("") && result.contains("")); - assert!(result.contains("self.__next_f.push")); - } - - #[test] - fn test_post_process_html_with_html_href_in_tchunk() { - // Test that HTML href attributes inside T-chunks are rewritten - // This is the format where HTML markup is embedded in RSC T-chunk content - let html = r#" - -"#; - - let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); - - // The HTML href URL should be rewritten - assert!( - result.contains("test.example.com/about-us"), - "HTML href URL in T-chunk should be rewritten. Got: {}", - result - ); - - // No origin URLs should remain - assert!( - !result.contains("origin.example.com"), - "No origin URLs should remain. Got: {}", - result - ); - - // Verify T-chunk length was recalculated - // Original content: \u003cdiv\u003e\u003ca href="https://origin.example.com/about-us"\u003eAbout\u003c/a\u003e\u003c/div\u003e - // After rewrite, URL is shorter so T-chunk length should be smaller - assert!( - !result.contains(":T4d9,"), - "T-chunk length should have been recalculated (original was 4d9). Got: {}", - result - ); - } -} - -#[cfg(test)] -mod truncated_string_tests { - use super::*; - - #[test] - fn test_truncated_string_parsing() { - // This simulates a Next.js chunk that's been split mid-string - // With pure regex rewriting, truncated strings without closing quotes - // simply won't match, which is the desired behavior - let truncated = r#"self.__next_f.push([ - 1, - '430:I[6061,["749","static/chunks/16bf9003-553c36acd7d8a04b.js","4669","static/chun' -]);"#; - - // The regex pattern requires a closing quote after the URL, - // so truncated content without URLs won't be modified - let result = rewrite_nextjs_values( - truncated, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length=true for RSC payloads - ); - println!("Rewrite result: {:?}", result); - // Should return None since no matching URL patterns exist - assert!( - result.is_none(), - "Truncated content without URLs should not be modified" - ); - } - - #[test] - fn test_complete_string_with_url() { - // A complete Next.js chunk with a URL that should be rewritten - let complete = r#"self.__next_f.push([ - 1, - '{"url":"https://origin.example.com/path/to/resource"}' -]);"#; - - let result = rewrite_nextjs_values( - complete, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length=true for RSC payloads - ); - println!("Complete string rewrite: {:?}", result); - assert!(result.is_some()); - let rewritten = result.unwrap(); - // Note: URL may have padding for length preservation - assert!(rewritten.contains("proxy.example.com") && rewritten.contains("/path/to/resource")); - } - - #[test] - fn test_truncated_url_rewrite() { - // URL that starts in this chunk but continues in the next - // Like: "url":"https://origin.example.com/some/path?param=%20 - // where the closing quote is in the next chunk - let truncated_url = r#"self.__next_f.push([ - 1, - '\"url\":\"https://origin.example.com/rss?title=%20' -]);"#; - - println!("Input with truncated URL:"); - println!("{}", truncated_url); - - let result = rewrite_nextjs_values( - truncated_url, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length=true for RSC payloads - ); - println!("Truncated URL rewrite result: {:?}", result); - - // The regex pattern requires a closing quote after the URL path, - // so URLs without closing quotes won't be matched (preventing corruption) - // This is actually the desired behavior - incomplete URLs are left alone - assert!( - result.is_none(), - "Truncated URL without closing quote should not be modified" - ); - } - - #[test] - fn test_embedded_pattern_incomplete_url() { - // Test the regex directly with an incomplete URL - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - - // This string has an incomplete URL - it starts but doesn't close properly - // within the string boundaries - let incomplete = r#"\"url\":\"https://origin.example.com/rss?title=%20"#; - println!("Testing embedded pattern on incomplete URL:"); - println!("Input: {}", incomplete); - - let result = rewriter.rewrite_embedded(incomplete); - println!("Result: {:?}", result); - - // Now test with a complete URL - let complete = r#"\"url\":\"https://origin.example.com/complete\""#; - println!("\nTesting embedded pattern on complete URL:"); - println!("Input: {}", complete); - - let result = rewriter.rewrite_embedded(complete); - println!("Result: {:?}", result); - } - - #[test] - fn test_split_chunk_url_corruption() { - // This is the EXACT scenario that breaks React hydration! - // The URL is split across two Next.js chunks. - - // Chunk 1: Contains the start of the URL - // Note: In Next.js RSC, double quotes inside single-quoted strings are NOT escaped - let chunk1 = r#"self.__next_f.push([ - 1, - '336:{"url":"https://origin.example.com/.rss/feed/3d70fbb5-ef5e-44f3-a547-e60939496e82.xml?title=Latest%20Car%20News%3A%20Trucks%2C%20SUVs%2C%20EVs%2C%20Reviews%20%26%20' -]);"#; - - // Chunk 2: Contains the continuation of the URL - let chunk2 = r#"self.__next_f.push([ - 1, - 'Auto%20Trends"}\n337:{"url":"https://origin.example.com/complete"}' -]);"#; - - println!("=== Chunk 1 (truncated URL start) ==="); - println!("{}", chunk1); - - let result1 = rewrite_nextjs_values( - chunk1, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - println!("\nRewritten Chunk 1: {:?}", result1); - - // CRITICAL CHECK: The rewritten chunk should have the SAME quote escaping as the original - // If original has unescaped " inside ', the rewritten should too - if let Some(ref r1) = result1 { - println!("\n=== Quote escaping analysis ==="); - println!( - "Original has '336:{{\"url\":' (with backslash-quote): {}", - chunk1.contains(r#"\"url\""#) - ); - println!( - "Original has '336:{{\"url\":' (unescaped quote): {}", - chunk1.contains(r#"{"url":"#) - ); - println!("Rewritten has backslash-quote: {}", r1.contains(r#"\""#)); - println!( - "Rewritten has unescaped quote: {}", - r1.contains(r#"{"url":"#) - ); - - // The bug: original has unescaped ", but rewritten might have escaped \" - // This would change the JavaScript string content! - let original_has_backslash = chunk1.contains(r#"\""#); - let rewritten_has_backslash = r1.contains(r#"\""#); - - if !original_has_backslash && rewritten_has_backslash { - println!("\n!!! BUG DETECTED !!!"); - println!("The rewriter is ADDING backslash escapes that weren't in the original!"); - println!("This corrupts the JavaScript string content!"); - } - } - - println!("\n=== Chunk 2 (URL continuation) ==="); - println!("{}", chunk2); - - let result2 = rewrite_nextjs_values( - chunk2, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - println!("\nRewritten Chunk 2: {:?}", result2); - - // Let's verify the complete URL in chunk2 is rewritten - if let Some(ref rewritten2) = result2 { - assert!( - rewritten2.contains("proxy.example.com") && rewritten2.contains("/complete"), - "Complete URL in chunk2 should be rewritten to new host with /complete path" - ); - } - } - - #[test] - fn test_embedded_regex_pattern() { - // Test the regex pattern directly to understand what it matches - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - - // Test 1: Unescaped double quotes (as in single-quoted JS string) - let unescaped = r#"'336:{"url":"https://origin.example.com/path"}'"#; - println!("Test 1 - Unescaped quotes:"); - println!(" Input: {}", unescaped); - let result = rewriter.rewrite_embedded(unescaped); - println!(" Result: {:?}", result); - - // Test 2: Escaped double quotes (as in double-quoted JS string or JSON) - let escaped = r#"'336:{\"url\":\"https://origin.example.com/path\"}'"#; - println!("\nTest 2 - Escaped quotes:"); - println!(" Input: {}", escaped); - let result = rewriter.rewrite_embedded(escaped); - println!(" Result: {:?}", result); - - // Test 3: Double-escaped quotes (as in JSON string inside JS string) - let double_escaped = r#"'336:{\\"url\\":\\"https://origin.example.com/path\\"}'"#; - println!("\nTest 3 - Double-escaped quotes:"); - println!(" Input: {}", double_escaped); - let result = rewriter.rewrite_embedded(double_escaped); - println!(" Result: {:?}", result); - } - - #[test] - fn test_backslash_n_preservation() { - // Critical test: Check that \n (backslash-n) is preserved byte-for-byte - // This is crucial because RSC payloads use \n as a record separator - - // String with literal backslash-n (two bytes: 0x5C 0x6E) - let input = - r#"self.__next_f.push([1, 'foo\n{"url":"https://origin.example.com/test"}\nbar']);"#; - - // Verify input has literal backslash-n - let backslash_n_pos = input.find(r"\n").unwrap(); - assert_eq!( - &input.as_bytes()[backslash_n_pos..backslash_n_pos + 2], - [0x5C, 0x6E], // backslash, n - "Input should have literal backslash-n" - ); - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URL"); - - // Check the rewritten string still has literal backslash-n - let new_pos = rewritten.find(r"\n").unwrap(); - assert_eq!( - &rewritten.as_bytes()[new_pos..new_pos + 2], - [0x5C, 0x6E], - "Rewritten should preserve literal backslash-n" - ); - - // Count number of \n occurrences - let original_count = input.matches(r"\n").count(); - let rewritten_count = rewritten.matches(r"\n").count(); - assert_eq!( - original_count, rewritten_count, - "Number of \\n occurrences should be preserved" - ); - - println!("Input: {}", input); - println!("Rewritten: {}", rewritten); - println!( - "\\n count: original={}, rewritten={}", - original_count, rewritten_count - ); - } - - #[test] - fn test_url_rewriting_basic() { - // Test that URL rewriting works correctly while preserving the original scheme - let input = r#"self.__next_f.push([1, '{"url":"https://origin.example.com/news"}']);"#; - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", // request_scheme is now ignored - original scheme is preserved - &["url".into()], - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URL"); - - println!("Original: {}", input); - println!("Rewritten: {}", rewritten); - - // Verify the URL was rewritten correctly, preserving the original https scheme - // With length preservation, URLs may have padding like /./././ - assert!( - rewritten.contains("http://proxy.example.com") && rewritten.contains("/news"), - "URL should be rewritten to new host with path, preserving https scheme. Got: {}", - rewritten - ); - assert!( - !rewritten.contains("origin.example.com"), - "URL should not contain original host" - ); - } - - #[test] - fn test_url_rewriting_preserves_rsc_structure() { - // Test that RSC record structure is preserved after rewriting - let input = r#"self.__next_f.push([1, '443:{"url":"https://origin.example.com/path"}\n444:{"other":"data"}']);"#; - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", // request_scheme is now ignored - original scheme is preserved - &["url".into()], - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URL"); - - println!("Original: {}", input); - println!("Rewritten: {}", rewritten); - - // Verify URL was rewritten (preserving https scheme) - // With length preservation, URLs may have padding like /./././ - assert!( - rewritten.contains("http://proxy.example.com") && rewritten.contains("/path"), - "URL should be rewritten with preserved https scheme. Got: {}", - rewritten - ); - - // Verify record structure is intact - both records should still be parseable - assert!( - rewritten.contains(r#"\n444:"#), - "RSC record separator and next record ID must be preserved" - ); - assert!( - rewritten.contains(r#""other":"data""#), - "Subsequent record data must be preserved" - ); - } - - #[test] - fn test_nav_menu_rewrite() { - // Test a typical navigation menu payload - // This is the payload that contains the dropdown menu items - let input = r#"self.__next_f.push([ - 1, - '443:{"ID":878799,"title":"News","slug":"","post_parent":"0","guid":"pt000000000000000700000000000d68cf","menu_item_parent":"0","object_id":"category","url":"https://origin.example.com/news","target":"","attr_title":"","description":"","classes":"$444","menu_order":0,"post_type":"nav_menu_item","post_mime_type":"","object":"category","type":"taxonomy","type_label":"Category","menu_item_type":"taxonomy","hide_on_subnav":false,"children":"$445"}\n444:[""]\n445:[]' -]);"#; - - println!("=== Original Input ==="); - println!("{}", input); - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URL"); - - println!("\n=== Rewritten Output ==="); - println!("{}", rewritten); - - // Verify the URL was rewritten using request scheme (http) - // With length preservation, URL may have padding like /./././ - assert!( - rewritten.contains("http://proxy.example.com") && rewritten.contains("/news"), - "URL should be rewritten to new host with request scheme. Got: {}", - rewritten - ); - assert!( - !rewritten.contains("origin.example.com"), - "Original host should not remain" - ); - - // Verify RSC structure is preserved - assert!( - rewritten.contains(r#""ID":878799"#), - "Record ID should be preserved" - ); - assert!( - rewritten.contains(r#""title":"News""#), - "Title should be preserved" - ); - assert!( - rewritten.contains(r#""classes":"$444""#), - "$444 reference should be preserved" - ); - assert!( - rewritten.contains(r#""children":"$445""#), - "$445 reference should be preserved" - ); - assert!( - rewritten.contains(r#"\n444:[""]"#), - "Record 444 should be preserved" - ); - assert!( - rewritten.contains(r#"\n445:[]"#), - "Record 445 should be preserved" - ); - - // Critical: Verify the JavaScript is still valid - // The string must be properly quoted and escaped - assert!( - rewritten.starts_with("self.__next_f.push(["), - "Should start with valid JS" - ); - assert!(rewritten.ends_with("]);"), "Should end with valid JS"); - - // Check byte length difference - let orig_len = input.len(); - let new_len = rewritten.len(); - println!("\n=== Length Analysis ==="); - println!("Original length: {}", orig_len); - println!("Rewritten length: {}", new_len); - println!("Difference: {} bytes", (orig_len as i64) - (new_len as i64)); - } - - #[test] - fn test_site_base_url_rewrite() { - // Test that siteBaseUrl gets rewritten alongside url attributes - // This is critical for React navigation to work correctly - if siteBaseUrl - // doesn't match the rewritten URLs, React may treat links as external - let input = r#"self.__next_f.push([1, '{"siteBaseUrl":"https://origin.example.com","url":"https://origin.example.com/news"}']);"#; - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", // request_scheme is now ignored - original scheme is preserved - &["url".into(), "siteBaseUrl".into()], // Include siteBaseUrl - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URLs"); - - println!("Original: {}", input); - println!("Rewritten: {}", rewritten); - - // Both url and siteBaseUrl should be rewritten, preserving https scheme - // With length preservation, URLs may have padding - assert!( - rewritten.contains("http://proxy.example.com"), - "siteBaseUrl should be rewritten to match proxy host, preserving https. Got: {}", - rewritten - ); - assert!( - rewritten.contains("/news"), - "url path should be preserved. Got: {}", - rewritten - ); - assert!( - !rewritten.contains("origin.example.com"), - "Original host should not remain" - ); - } - - #[test] - fn test_site_production_domain_rewrite() { - // Test that siteProductionDomain (bare hostname without scheme) gets rewritten - // This is critical because Next.js uses this to determine if URLs are internal - let input = r#"self.__next_f.push([1, '{"siteProductionDomain":"origin.example.com","url":"https://origin.example.com/news"}']);"#; - - let result = rewrite_nextjs_values( - input, - "origin.example.com", - "proxy.example.com", - "http", // request_scheme is now ignored - original scheme is preserved - &["url".into(), "siteProductionDomain".into()], - true, // preserve_length for RSC payloads - ); - - let rewritten = result.expect("should rewrite URLs"); - - println!("Original: {}", input); - println!("Rewritten: {}", rewritten); - - // siteProductionDomain and URL should be rewritten, with possible length padding - assert!( - rewritten.contains("proxy.example.com"), - "siteProductionDomain should be rewritten to proxy host. Got: {}", - rewritten - ); - // URL should contain the path - assert!( - rewritten.contains("/news"), - "url path should be preserved. Got: {}", - rewritten - ); - assert!( - !rewritten.contains("origin.example.com"), - "Original host should not remain" - ); - } - - #[test] - fn test_calculate_padding() { - // Test whitespace padding calculation - // When new URL is shorter, we need spaces to compensate - let padding = UrlRewriter::calculate_padding(21, 24); - assert_eq!(padding.len(), 3, "Should need 3 spaces"); - assert_eq!(padding, " ", "Should be 3 spaces"); - - // No padding when lengths are equal - let padding = UrlRewriter::calculate_padding(24, 24); - assert_eq!(padding.len(), 0); - - // No padding when new URL is longer - let padding = UrlRewriter::calculate_padding(30, 24); - assert_eq!(padding.len(), 0); - } - - #[test] - fn test_whitespace_padding_rewrite() { - // Test that URL rewriting returns proper (url, padding) tuple - // Original: https://origin.example.com/news (31 chars) - // New URL: http://proxy.example.com/news (29 chars) - // Padding needed: 2 spaces - - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, // preserve_length - ); - - let original_url = "https://origin.example.com/news"; - let result = rewriter.rewrite_url_value(original_url); - - assert!(result.is_some(), "URL should be rewritten"); - let (new_url, padding) = result.unwrap(); - - // Check the URL is correctly rewritten - assert_eq!(new_url, "http://proxy.example.com/news"); - assert!(new_url.contains("proxy.example.com")); - assert!(new_url.contains("/news")); - - // Check padding compensates for length difference - let original_len = original_url.len(); // 33 - let new_len = new_url.len(); // 26 - assert_eq!( - padding.len(), - original_len - new_len, - "Padding should be {} spaces", - original_len - new_len - ); - assert_eq!(padding, " ", "Should be 2 spaces"); - - // Total length (url + padding) should match original - assert_eq!( - new_url.len() + padding.len(), - original_url.len(), - "URL + padding should equal original length" - ); - } - - #[test] - fn test_no_padding_when_disabled() { - // When preserve_length is false, no padding should be returned - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - false, // preserve_length disabled - ); - - let result = rewriter.rewrite_url_value("https://origin.example.com/news"); - assert!(result.is_some()); - let (new_url, padding) = result.unwrap(); - - assert_eq!(new_url, "http://proxy.example.com/news"); - assert_eq!(padding, "", "No padding when preserve_length is false"); - } -} diff --git a/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html b/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html new file mode 100644 index 0000000..81c213c --- /dev/null +++ b/crates/common/src/integrations/nextjs/fixtures/inlined-data-escaped.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html b/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html new file mode 100644 index 0000000..5ae7afa --- /dev/null +++ b/crates/common/src/integrations/nextjs/fixtures/inlined-data-nonce.html @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs new file mode 100644 index 0000000..f29f13f --- /dev/null +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -0,0 +1,333 @@ +use std::sync::Arc; + +use once_cell::sync::Lazy; +use regex::Regex; + +use crate::integrations::{IntegrationHtmlContext, IntegrationHtmlPostProcessor}; + +use super::rsc::rewrite_rsc_scripts_combined; +use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; + +/// RSC push script pattern for HTML post-processing. +static RSC_SCRIPT_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r#"]*>\s*self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#) + .expect("valid RSC script regex") +}); + +/// RSC script ending pattern. +static RSC_SCRIPT_ENDING: Lazy = + Lazy::new(|| Regex::new(r#"^\s*\]\s*\)\s*;?\s*"#).expect("valid RSC ending regex")); + +pub(crate) struct NextJsHtmlPostProcessor { + config: Arc, +} + +impl NextJsHtmlPostProcessor { + pub(crate) fn new(config: Arc) -> Self { + Self { config } + } +} + +impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { + fn integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { + if !self.config.enabled || self.config.rewrite_attributes.is_empty() { + return false; + } + + html.contains("__next_f.push") && html.contains(ctx.origin_host) + } + + fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { + if log::log_enabled!(log::Level::Debug) { + let origin_before = html.matches(ctx.origin_host).count(); + log::debug!( + "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", + html.len(), + origin_before, + ctx.origin_host, + ctx.request_scheme, + ctx.request_host + ); + } + + post_process_rsc_html_in_place(html, ctx.origin_host, ctx.request_host, ctx.request_scheme) + } +} + +#[derive(Debug, Clone, Copy)] +struct RscPushScriptRange { + payload_start: usize, + payload_end: usize, +} + +fn find_rsc_push_scripts(html: &str) -> Vec { + let mut scripts = Vec::new(); + let mut search_pos = 0; + + while search_pos < html.len() { + let Some(cap) = RSC_SCRIPT_PATTERN.captures(&html[search_pos..]) else { + break; + }; + + let quote_match = cap.get(1).expect("script regex should capture quote"); + let quote = quote_match + .as_str() + .chars() + .next() + .expect("quote should exist"); + let payload_start = search_pos + quote_match.end(); + + let mut i = payload_start; + let bytes = html.as_bytes(); + while i < bytes.len() { + if bytes[i] == b'\\' { + i += 2; + } else if bytes[i] == quote as u8 { + break; + } else { + i += 1; + } + } + + if i >= bytes.len() { + search_pos = payload_start; + continue; + } + + let after_quote = &html[i + 1..]; + let Some(ending_match) = RSC_SCRIPT_ENDING.find(after_quote) else { + search_pos = payload_start; + continue; + }; + + let payload_end = i; + let script_end = i + 1 + ending_match.end(); + + scripts.push(RscPushScriptRange { + payload_start, + payload_end, + }); + + search_pos = script_end; + } + + scripts +} + +pub fn post_process_rsc_html( + html: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> String { + let mut result = html.to_string(); + post_process_rsc_html_in_place(&mut result, origin_host, request_host, request_scheme); + result +} + +pub fn post_process_rsc_html_in_place( + html: &mut String, + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> bool { + let scripts = find_rsc_push_scripts(html.as_str()); + if scripts.is_empty() { + return false; + } + + let payloads: Vec<&str> = scripts + .iter() + .map(|s| &html[s.payload_start..s.payload_end]) + .collect(); + + if !payloads.iter().any(|p| p.contains(origin_host)) { + return false; + } + + if log::log_enabled!(log::Level::Debug) { + let origin_count_before: usize = payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + log::debug!( + "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", + payloads.len(), + origin_count_before, + origin_host, + request_scheme, + request_host + ); + } + + let rewritten_payloads = rewrite_rsc_scripts_combined( + payloads.as_slice(), + origin_host, + request_host, + request_scheme, + ); + + let mut changed = false; + for (i, original) in payloads.iter().enumerate() { + if rewritten_payloads[i] != *original { + changed = true; + break; + } + } + + if !changed { + return false; + } + + for (i, script) in scripts.iter().enumerate().rev() { + html.replace_range( + script.payload_start..script.payload_end, + &rewritten_payloads[i], + ); + } + + true +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn post_process_rsc_html_rewrites_cross_script_tchunks() { + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/page"), + "URL should be rewritten. Got: {}", + result + ); + assert!( + result.contains(":T3c,"), + "T-chunk length should be updated. Got: {}", + result + ); + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn post_process_rsc_html_handles_prettified_format() { + let html = r#" + + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/news"), + "First URL should be rewritten. Got: {}", + result + ); + assert!( + result.contains("test.example.com/reviews"), + "Second URL should be rewritten. Got: {}", + result + ); + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + assert!(result.contains("") && result.contains("")); + assert!(result.contains("self.__next_f.push")); + } + + #[test] + fn post_process_rewrites_html_href_inside_tchunk() { + let html = r#" + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + + assert!( + result.contains("test.example.com/about-us"), + "HTML href URL in T-chunk should be rewritten. Got: {}", + result + ); + assert!( + !result.contains("origin.example.com"), + "No origin URLs should remain. Got: {}", + result + ); + assert!( + !result.contains(":T4d9,"), + "T-chunk length should have been recalculated (original was 4d9). Got: {}", + result + ); + } + + #[test] + fn handles_nextjs_inlined_data_nonce_fixture() { + // Fixture mirrors Next.js `createInlinedDataReadableStream` output: + // `` + let html = include_str!("fixtures/inlined-data-nonce.html"); + let scripts = find_rsc_push_scripts(html); + assert_eq!(scripts.len(), 1, "Should find exactly one RSC data script"); + + let rewritten = + post_process_rsc_html(html, "origin.example.com", "proxy.example.com", "https"); + assert!( + rewritten.contains("https://proxy.example.com/news"), + "Fixture URL should be rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("https://origin.example.com/news"), + "Origin URL should be removed. Got: {rewritten}" + ); + } + + #[test] + fn handles_nextjs_inlined_data_html_escaping_fixture() { + // Fixture includes `\\u003c` escapes, matching Next.js `htmlEscapeJsonString` behavior. + let html = include_str!("fixtures/inlined-data-escaped.html"); + let scripts = find_rsc_push_scripts(html); + assert_eq!(scripts.len(), 1, "Should find exactly one RSC data script"); + + let rewritten = + post_process_rsc_html(html, "origin.example.com", "proxy.example.com", "https"); + assert!( + rewritten.contains("https://proxy.example.com/about"), + "Escaped fixture URL should be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#"\\u003ca href=\\\"https://proxy.example.com/about\\\""#), + "Escaped HTML should remain escaped and rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("https://origin.example.com/about"), + "Origin URL should be removed. Got: {rewritten}" + ); + } +} diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs new file mode 100644 index 0000000..b3fc41f --- /dev/null +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -0,0 +1,345 @@ +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; +use validator::Validate; + +use crate::integrations::IntegrationRegistration; +use crate::settings::{IntegrationConfig, Settings}; + +const NEXTJS_INTEGRATION_ID: &str = "nextjs"; + +mod html_post_process; +mod rsc; +mod script_rewriter; + +pub use html_post_process::{post_process_rsc_html, post_process_rsc_html_in_place}; +pub use rsc::rewrite_rsc_scripts_combined; + +use html_post_process::NextJsHtmlPostProcessor; +use script_rewriter::{NextJsRewriteMode, NextJsScriptRewriter}; + +#[derive(Debug, Clone, Deserialize, Serialize, Validate)] +pub struct NextJsIntegrationConfig { + #[serde(default = "default_enabled")] + pub enabled: bool, + #[serde( + default = "default_rewrite_attributes", + deserialize_with = "crate::settings::vec_from_seq_or_map" + )] + #[validate(length(min = 1))] + pub rewrite_attributes: Vec, +} + +impl IntegrationConfig for NextJsIntegrationConfig { + fn is_enabled(&self) -> bool { + self.enabled + } +} + +fn default_enabled() -> bool { + false +} + +fn default_rewrite_attributes() -> Vec { + vec!["href".to_string(), "link".to_string(), "url".to_string()] +} + +pub fn register(settings: &Settings) -> Option { + let config = match build(settings) { + Some(config) => { + log::info!( + "NextJS integration registered: enabled={}, rewrite_attributes={:?}", + config.enabled, + config.rewrite_attributes + ); + config + } + None => { + log::info!("NextJS integration not registered (disabled or missing config)"); + return None; + } + }; + + // Register both structured (Pages Router __NEXT_DATA__) and streamed (App Router RSC) + // rewriters. RSC payloads require LENGTH-PRESERVING URL replacement to avoid breaking + // React hydration - the RSC format uses byte positions for record boundaries. + let structured = Arc::new(NextJsScriptRewriter::new( + config.clone(), + NextJsRewriteMode::Structured, + )); + + let streamed = Arc::new(NextJsScriptRewriter::new( + config.clone(), + NextJsRewriteMode::Streamed, + )); + + // Register post-processor for cross-script RSC T-chunks + let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); + + Some( + IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) + .with_script_rewriter(structured) + .with_script_rewriter(streamed) + .with_html_post_processor(post_processor) + .build(), + ) +} + +fn build(settings: &Settings) -> Option> { + let config = settings + .integration_config::(NEXTJS_INTEGRATION_ID) + .ok() + .flatten()?; + Some(Arc::new(config)) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; + use crate::integrations::IntegrationRegistry; + use crate::streaming_processor::{Compression, PipelineConfig, StreamingPipeline}; + use crate::test_support::tests::create_test_settings; + use serde_json::json; + use std::io::Cursor; + + fn config_from_settings( + settings: &Settings, + registry: &IntegrationRegistry, + ) -> HtmlProcessorConfig { + HtmlProcessorConfig::from_settings( + settings, + registry, + "origin.example.com", + "test.example.com", + "https", + ) + } + + #[test] + fn html_processor_rewrites_nextjs_script_when_enabled() { + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let processed = String::from_utf8_lossy(&output); + + // Note: URLs may have padding characters for length preservation + assert!( + processed.contains("test.example.com") && processed.contains("/reviews"), + "should rewrite https Next.js href values to test.example.com" + ); + assert!( + processed.contains("test.example.com") && processed.contains("/sign-in"), + "should rewrite http Next.js href values to test.example.com" + ); + assert!( + processed.contains(r#""fallbackHref":"http://origin.example.com/legacy""#), + "should leave other fields untouched" + ); + assert!( + processed.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#), + "should not rewrite non-href keys" + ); + assert!( + !processed.contains("\"href\":\"https://origin.example.com/reviews\""), + "should remove origin https href" + ); + assert!( + !processed.contains("\"href\":\"http://origin.example.com/sign-in\""), + "should remove origin http href" + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // The streaming phase skips RSC push scripts, and the HTML post-processor handles them + // at end-of-document to correctly handle cross-script T-chunks. + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via end-of-document post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", + final_html + ); + } + + #[test] + fn html_processor_rewrites_rsc_stream_payload_with_chunked_input() { + // RSC payloads are rewritten via post-processing, even with chunked streaming input + let html = r#" + + "#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 32, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via end-of-document post-processing + assert!( + final_html.contains("test.example.com"), + "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", + final_html + ); + } + + #[test] + fn register_respects_enabled_flag() { + let settings = create_test_settings(); + let registration = register(&settings); + + assert!( + registration.is_none(), + "should skip registration when integration is disabled" + ); + } + + #[test] + fn html_processor_rewrites_rsc_payloads_with_length_preservation() { + // RSC payloads (self.__next_f.push) are rewritten via post-processing. + // This allows navigation to stay on proxy while correctly handling cross-script T-chunks. + + let html = r#" + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["url"], + }), + ) + .expect("should update nextjs config"); + + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let final_html = String::from_utf8_lossy(&output); + + // RSC payloads should be rewritten via post-processing + assert!( + final_html.contains("test.example.com"), + "RSC payload URLs should be rewritten to proxy host. Output: {}", + final_html + ); + + // Verify the RSC payload structure is preserved + assert!( + final_html.contains(r#""ID":879000"#), + "RSC payload ID should be preserved" + ); + assert!( + final_html.contains(r#""title":"Makes""#), + "RSC payload title should be preserved" + ); + assert!( + final_html.contains(r#""children":"$45a""#), + "RSC payload children reference should be preserved" + ); + + // Verify \n separators are preserved (crucial for RSC parsing) + assert!( + final_html.contains(r#"\n442:"#), + "RSC record separator \\n should be preserved. Output: {}", + final_html + ); + } +} diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs new file mode 100644 index 0000000..3dea8b1 --- /dev/null +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -0,0 +1,594 @@ +use std::borrow::Cow; + +use once_cell::sync::Lazy; +use regex::{escape, Regex}; + +/// T-chunk header pattern: hex_id:Thex_length, +static TCHUNK_PATTERN: Lazy = + Lazy::new(|| Regex::new(r"([0-9a-fA-F]+):T([0-9a-fA-F]+),").expect("valid T-chunk regex")); + +/// Marker used to track script boundaries when combining RSC content. +pub(crate) const RSC_MARKER: &str = "\x00SPLIT\x00"; + +// ============================================================================= +// Escape Sequence Parsing +// ============================================================================= +// +// JS escape sequences are parsed by a shared iterator to avoid code duplication. +// The iterator yields (source_len, unescaped_byte_count) for each logical unit. + +/// A single parsed element from a JS string. +#[derive(Clone, Copy)] +struct EscapeElement { + /// Number of unescaped bytes this represents. + byte_count: usize, +} + +/// Iterator over escape sequences in a JS string. +/// Yields the unescaped byte count for each element. +struct EscapeSequenceIter<'a> { + bytes: &'a [u8], + str_ref: &'a str, + pos: usize, + skip_marker: Option<&'a [u8]>, +} + +impl<'a> EscapeSequenceIter<'a> { + fn new(s: &'a str) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: None, + } + } + + fn with_marker(s: &'a str, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: 0, + skip_marker: Some(marker), + } + } + + fn from_position(s: &'a str, start: usize) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: None, + } + } + + fn from_position_with_marker(s: &'a str, start: usize, marker: &'a [u8]) -> Self { + Self { + bytes: s.as_bytes(), + str_ref: s, + pos: start, + skip_marker: Some(marker), + } + } + + /// Current position in the source string. + fn position(&self) -> usize { + self.pos + } +} + +impl Iterator for EscapeSequenceIter<'_> { + type Item = EscapeElement; + + fn next(&mut self) -> Option { + if self.pos >= self.bytes.len() { + return None; + } + + if let Some(marker) = self.skip_marker { + if self.pos + marker.len() <= self.bytes.len() + && &self.bytes[self.pos..self.pos + marker.len()] == marker + { + self.pos += marker.len(); + return Some(EscapeElement { byte_count: 0 }); + } + } + + if self.bytes[self.pos] == b'\\' && self.pos + 1 < self.bytes.len() { + let esc = self.bytes[self.pos + 1]; + + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + self.pos += 2; + return Some(EscapeElement { byte_count: 1 }); + } + + if esc == b'x' && self.pos + 3 < self.bytes.len() { + self.pos += 4; + return Some(EscapeElement { byte_count: 1 }); + } + + if esc == b'u' && self.pos + 5 < self.bytes.len() { + let hex = &self.str_ref[self.pos + 2..self.pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + if (0xD800..=0xDBFF).contains(&code_unit) + && self.pos + 11 < self.bytes.len() + && self.bytes[self.pos + 6] == b'\\' + && self.bytes[self.pos + 7] == b'u' + { + let hex2 = &self.str_ref[self.pos + 8..self.pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + self.pos += 12; + return Some(EscapeElement { byte_count: 4 }); + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + self.pos += 6; + return Some(EscapeElement { + byte_count: c.len_utf8(), + }); + } + } + } + } + + if self.bytes[self.pos] < 0x80 { + self.pos += 1; + Some(EscapeElement { byte_count: 1 }) + } else { + let c = self.str_ref[self.pos..] + .chars() + .next() + .unwrap_or('\u{FFFD}'); + let len = c.len_utf8(); + self.pos += len; + Some(EscapeElement { byte_count: len }) + } + } +} + +/// Calculate the unescaped byte length of a JS string with escape sequences. +fn calculate_unescaped_byte_length(s: &str) -> usize { + EscapeSequenceIter::new(s).map(|e| e.byte_count).sum() +} + +/// Consume a specified number of unescaped bytes from a JS string, returning the end position. +fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usize, usize) { + let mut iter = EscapeSequenceIter::from_position(s, start_pos); + let mut consumed = 0; + + while consumed < byte_count { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, + } + } + + (iter.position(), consumed) +} + +// ============================================================================= +// T-chunk discovery +// ============================================================================= + +/// Information about a T-chunk found in the combined RSC content. +struct TChunkInfo { + /// The chunk ID (hex string like "1a", "443"). + id: String, + /// Position where the T-chunk header starts (e.g., position of "1a:T..."). + match_start: usize, + /// Position right after the comma (where content begins). + header_end: usize, + /// Position where the content ends. + content_end: usize, +} + +/// Find all T-chunks in content, optionally skipping markers. +fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { + let mut chunks = Vec::new(); + let mut search_pos = 0; + let marker = if skip_markers { + Some(RSC_MARKER.as_bytes()) + } else { + None + }; + + while search_pos < content.len() { + if let Some(cap) = TCHUNK_PATTERN.captures(&content[search_pos..]) { + let m = cap.get(0).expect("T-chunk match should exist"); + let match_start = search_pos + m.start(); + let header_end = search_pos + m.end(); + + let id = cap + .get(1) + .expect("T-chunk id should exist") + .as_str() + .to_string(); + let length_hex = cap.get(2).expect("T-chunk length should exist").as_str(); + let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); + + let content_end = if let Some(marker_bytes) = marker { + let mut iter = EscapeSequenceIter::from_position_with_marker( + content, + header_end, + marker_bytes, + ); + let mut consumed = 0; + while consumed < declared_length { + match iter.next() { + Some(elem) => consumed += elem.byte_count, + None => break, + } + } + iter.position() + } else { + let (pos, _) = consume_unescaped_bytes(content, header_end, declared_length); + pos + }; + + chunks.push(TChunkInfo { + id, + match_start, + header_end, + content_end, + }); + + search_pos = content_end; + } else { + break; + } + } + + chunks +} + +fn find_tchunks(content: &str) -> Vec { + find_tchunks_impl(content, false) +} + +fn find_tchunks_with_markers(content: &str) -> Vec { + find_tchunks_impl(content, true) +} + +// ============================================================================= +// URL rewriting (cached per call) +// ============================================================================= + +/// Rewriter for RSC payload URL patterns. +/// +/// This is constructed per document / payload rewrite so that the origin-host-dependent regex is +/// compiled once, then reused across multiple calls. +pub(crate) struct RscUrlRewriter { + origin_host: String, + request_host: String, + request_scheme: String, + pattern: Regex, +} + +impl RscUrlRewriter { + pub(crate) fn new(origin_host: &str, request_host: &str, request_scheme: &str) -> Self { + let escaped_origin = escape(origin_host); + + // Match: + // - https://origin_host or http://origin_host + // - //origin_host (protocol-relative) + // - escaped variants inside JSON-in-JS strings (e.g., \/\/origin_host) + let pattern = Regex::new(&format!( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, + escaped_origin + )) + .expect("valid RSC URL rewrite regex"); + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + pattern, + } + } + + pub(crate) fn rewrite<'a>(&self, input: &'a str) -> Cow<'a, str> { + if !input.contains(&self.origin_host) { + return Cow::Borrowed(input); + } + + let replaced = self + .pattern + .replace_all(input, |caps: ®ex::Captures<'_>| { + let slashes = caps.get(3).map_or("//", |m| m.as_str()); + if caps.get(1).is_some() { + format!("{}:{}{}", self.request_scheme, slashes, self.request_host) + } else { + format!("{}{}", slashes, self.request_host) + } + }); + + let still_contains_origin = match &replaced { + Cow::Borrowed(s) => s.contains(&self.origin_host), + Cow::Owned(s) => s.contains(&self.origin_host), + }; + + if !still_contains_origin { + return replaced; + } + + // Also rewrite bare host occurrences inside RSC payloads (e.g. `siteProductionDomain`). + let owned = replaced.into_owned(); + Cow::Owned(owned.replace(&self.origin_host, &self.request_host)) + } + + pub(crate) fn rewrite_to_string(&self, input: &str) -> String { + self.rewrite(input).into_owned() + } +} + +// ============================================================================= +// Single-script T-chunk processing +// ============================================================================= + +pub(crate) fn rewrite_rsc_tchunks_with_rewriter( + content: &str, + rewriter: &RscUrlRewriter, +) -> String { + let chunks = find_tchunks(content); + + if chunks.is_empty() { + return rewriter.rewrite_to_string(content); + } + + let mut result = String::with_capacity(content.len()); + let mut last_end = 0; + + for chunk in &chunks { + let before = &content[last_end..chunk.match_start]; + result.push_str(rewriter.rewrite(before).as_ref()); + + let chunk_content = &content[chunk.header_end..chunk.content_end]; + let rewritten_content = rewriter.rewrite_to_string(chunk_content); + + let new_length = calculate_unescaped_byte_length(&rewritten_content); + let new_length_hex = format!("{new_length:x}"); + + result.push_str(&chunk.id); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + let remaining = &content[last_end..]; + result.push_str(rewriter.rewrite(remaining).as_ref()); + + result +} + +// ============================================================================= +// Cross-script RSC processing +// ============================================================================= + +fn calculate_unescaped_byte_length_skip_markers(s: &str) -> usize { + EscapeSequenceIter::with_marker(s, RSC_MARKER.as_bytes()) + .map(|e| e.byte_count) + .sum() +} + +/// Process multiple RSC script payloads together, handling cross-script T-chunks. +pub fn rewrite_rsc_scripts_combined( + payloads: &[&str], + origin_host: &str, + request_host: &str, + request_scheme: &str, +) -> Vec { + if payloads.is_empty() { + return Vec::new(); + } + + let rewriter = RscUrlRewriter::new(origin_host, request_host, request_scheme); + + if payloads.len() == 1 { + return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)]; + } + + let mut combined = payloads[0].to_string(); + for payload in &payloads[1..] { + combined.push_str(RSC_MARKER); + combined.push_str(payload); + } + + let chunks = find_tchunks_with_markers(&combined); + if chunks.is_empty() { + return payloads + .iter() + .map(|p| rewriter.rewrite_to_string(p)) + .collect(); + } + + let mut result = String::with_capacity(combined.len()); + let mut last_end = 0; + + for chunk in &chunks { + let before = &combined[last_end..chunk.match_start]; + result.push_str(rewriter.rewrite(before).as_ref()); + + let chunk_content = &combined[chunk.header_end..chunk.content_end]; + let rewritten_content = rewriter.rewrite_to_string(chunk_content); + + let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); + let new_length_hex = format!("{new_length:x}"); + + result.push_str(&chunk.id); + result.push_str(":T"); + result.push_str(&new_length_hex); + result.push(','); + result.push_str(&rewritten_content); + + last_end = chunk.content_end; + } + + let remaining = &combined[last_end..]; + result.push_str(rewriter.rewrite(remaining).as_ref()); + + result.split(RSC_MARKER).map(|s| s.to_string()).collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tchunk_length_recalculation() { + let content = r#"1a:T29,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T27,"), + "T-chunk length should be updated from 29 (41) to 27 (39). Got: {}", + result + ); + } + + #[test] + fn tchunk_length_recalculation_with_length_increase() { + let content = r#"1a:T1c,{"url":"https://short.io/x"}"#; + let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URL should be rewritten" + ); + assert!( + result.starts_with("1a:T24,"), + "T-chunk length should be updated from 1c (28) to 24 (36). Got: {}", + result + ); + } + + #[test] + fn calculate_unescaped_byte_length_handles_common_escapes() { + assert_eq!(calculate_unescaped_byte_length("hello"), 5); + assert_eq!(calculate_unescaped_byte_length(r#"\n"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\r\n"#), 2); + assert_eq!(calculate_unescaped_byte_length(r#"\""#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\\"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\x41"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\u0041"#), 1); + assert_eq!(calculate_unescaped_byte_length(r#"\u00e9"#), 2); + } + + #[test] + fn multiple_tchunks() { + let content = r#"1a:T1c,{"url":"https://short.io/x"}\n1b:T1c,{"url":"https://short.io/y"}"#; + let rewriter = RscUrlRewriter::new("short.io", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert!( + result.contains("test.example.com"), + "URLs should be rewritten" + ); + let count = result.matches(":T24,").count(); + assert_eq!(count, 2, "Both T-chunks should have updated lengths"); + } + + #[test] + fn cross_script_tchunk_rewriting() { + let script0 = r#"other:data\n1a:T40,partial content"#; + let script1 = r#" with https://origin.example.com/page goes here"#; + + let combined_content = "partial content with https://origin.example.com/page goes here"; + let combined_len = calculate_unescaped_byte_length(combined_content); + println!( + "Combined T-chunk content length: {} bytes = 0x{:x}", + combined_len, combined_len + ); + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert!( + results[1].contains("test.example.com"), + "URL in script 1 should be rewritten. Got: {}", + results[1] + ); + + let rewritten_content = "partial content with https://test.example.com/page goes here"; + let rewritten_len = calculate_unescaped_byte_length(rewritten_content); + let expected_header = format!(":T{:x},", rewritten_len); + assert!( + results[0].contains(&expected_header), + "T-chunk length in script 0 should be updated to {}. Got: {}", + expected_header, + results[0] + ); + } + + #[test] + fn cross_script_preserves_non_tchunk_content() { + let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T40,partial"#; + let script1 = r#" content with https://origin.example.com/page end"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert!( + results[0].contains("test.example.com/first"), + "URL outside T-chunk should be rewritten. Got: {}", + results[0] + ); + + assert!( + results[1].contains("test.example.com/page"), + "URL inside cross-script T-chunk should be rewritten. Got: {}", + results[1] + ); + } + + #[test] + fn preserves_protocol_relative_urls() { + let input = r#"{"url":"//origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""url":"//proxy.example.com/path""#), + "Protocol-relative URL should remain protocol-relative. Got: {rewritten}", + ); + } + + #[test] + fn rewrites_bare_host_occurrences() { + let input = r#"{"siteProductionDomain":"origin.example.com"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""siteProductionDomain":"proxy.example.com""#), + "Bare host should be rewritten inside RSC payload. Got: {rewritten}" + ); + } +} diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs new file mode 100644 index 0000000..440ea3a --- /dev/null +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -0,0 +1,613 @@ +use std::sync::Arc; + +use once_cell::sync::Lazy; +use regex::{escape, Regex}; + +use crate::integrations::{ + IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, +}; + +use super::rsc::{rewrite_rsc_tchunks_with_rewriter, RscUrlRewriter}; +use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; + +/// RSC push payload pattern for extraction. +static RSC_PUSH_PATTERN: Lazy = Lazy::new(|| { + Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).expect("valid RSC push regex") +}); + +#[derive(Clone, Copy)] +pub(super) enum NextJsRewriteMode { + Structured, + Streamed, +} + +pub(super) struct NextJsScriptRewriter { + config: Arc, + mode: NextJsRewriteMode, +} + +impl NextJsScriptRewriter { + pub(super) fn new(config: Arc, mode: NextJsRewriteMode) -> Self { + Self { config, mode } + } + + fn rewrite_structured( + &self, + content: &str, + ctx: &IntegrationScriptContext<'_>, + ) -> ScriptRewriteAction { + if let Some(rewritten) = rewrite_nextjs_values( + content, + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + &self.config.rewrite_attributes, + false, + ) { + ScriptRewriteAction::replace(rewritten) + } else { + ScriptRewriteAction::keep() + } + } + + fn rewrite_streamed( + &self, + content: &str, + ctx: &IntegrationScriptContext<'_>, + ) -> ScriptRewriteAction { + let rsc_rewriter = + RscUrlRewriter::new(ctx.origin_host, ctx.request_host, ctx.request_scheme); + + if let Some((payload, quote, start, end)) = extract_rsc_push_payload(content) { + let rewritten_payload = rewrite_rsc_tchunks_with_rewriter(payload, &rsc_rewriter); + + if rewritten_payload != payload { + let mut result = String::with_capacity(content.len()); + result.push_str(&content[..start]); + result.push(quote); + result.push_str(&rewritten_payload); + result.push(quote); + result.push_str(&content[end + 1..]); + return ScriptRewriteAction::replace(result); + } + } + + let rewritten = rsc_rewriter.rewrite_to_string(content); + if rewritten != content { + return ScriptRewriteAction::replace(rewritten); + } + + ScriptRewriteAction::keep() + } +} + +/// Extract RSC payload from a self.__next_f.push([1, '...']) call. +/// Returns (payload_content, quote_char, start_pos, end_pos). +fn extract_rsc_push_payload(content: &str) -> Option<(&str, char, usize, usize)> { + let cap = RSC_PUSH_PATTERN.captures(content)?; + let quote_match = cap.get(1)?; + let quote = quote_match.as_str().chars().next()?; + let content_start = quote_match.end(); + + let search_from = &content[content_start..]; + let mut pos = 0; + let mut escape = false; + + for c in search_from.chars() { + if escape { + escape = false; + pos += c.len_utf8(); + continue; + } + if c == '\\' { + escape = true; + pos += 1; + continue; + } + if c == quote { + let content_end = content_start + pos; + return Some(( + &content[content_start..content_end], + quote, + content_start - 1, + content_end, + )); + } + pos += c.len_utf8(); + } + + None +} + +impl IntegrationScriptRewriter for NextJsScriptRewriter { + fn integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn selector(&self) -> &'static str { + match self.mode { + NextJsRewriteMode::Structured => "script#__NEXT_DATA__", + NextJsRewriteMode::Streamed => "script", + } + } + + fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { + if self.config.rewrite_attributes.is_empty() { + return ScriptRewriteAction::keep(); + } + + match self.mode { + NextJsRewriteMode::Structured => self.rewrite_structured(content, ctx), + NextJsRewriteMode::Streamed => { + if content.contains("__next_f.push") { + return ScriptRewriteAction::keep(); + } + if content.contains("__next_f") { + return self.rewrite_streamed(content, ctx); + } + ScriptRewriteAction::keep() + } + } + } +} + +fn rewrite_nextjs_values( + content: &str, + origin_host: &str, + request_host: &str, + request_scheme: &str, + attributes: &[String], + preserve_length: bool, +) -> Option { + if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { + return None; + } + + let rewriter = UrlRewriter::new( + origin_host, + request_host, + request_scheme, + attributes, + preserve_length, + ); + + rewriter.rewrite_embedded(content) +} + +struct UrlRewriter { + origin_host: String, + request_host: String, + request_scheme: String, + embedded_patterns: Vec, + bare_host_patterns: Vec, + preserve_length: bool, +} + +impl UrlRewriter { + fn new( + origin_host: &str, + request_host: &str, + request_scheme: &str, + attributes: &[String], + preserve_length: bool, + ) -> Self { + let escaped_origin = escape(origin_host); + + let embedded_patterns = attributes + .iter() + .map(|attr| { + let escaped_attr = escape(attr); + let pattern = format!( + r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, + attr = escaped_attr, + origin = escaped_origin, + ); + Regex::new(&pattern).expect("valid Next.js rewrite regex") + }) + .collect(); + + let bare_host_patterns = attributes + .iter() + .map(|attr| { + let escaped_attr = escape(attr); + let pattern = format!( + r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*"){origin}(?P\\*")"#, + attr = escaped_attr, + origin = escaped_origin, + ); + Regex::new(&pattern).expect("valid Next.js bare host rewrite regex") + }) + .collect(); + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + embedded_patterns, + bare_host_patterns, + preserve_length, + } + } + + #[cfg(test)] + fn rewrite_url_value(&self, url: &str) -> Option<(String, String)> { + let original_len = url.len(); + + let new_url = if let Some(rest) = url.strip_prefix("https://") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )) + } else { + None + } + } else if let Some(rest) = url.strip_prefix("http://") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!( + "{}://{}{}", + self.request_scheme, self.request_host, path + )) + } else { + None + } + } else if let Some(rest) = url.strip_prefix("//") { + if rest.starts_with(&self.origin_host) { + let path = &rest[self.origin_host.len()..]; + Some(format!("//{}{}", self.request_host, path)) + } else { + None + } + } else if url == self.origin_host { + Some(self.request_host.clone()) + } else if url.starts_with(&self.origin_host) { + let path = &url[self.origin_host.len()..]; + Some(format!("{}{}", self.request_host, path)) + } else { + None + }; + + new_url.map(|url| { + let padding = if self.preserve_length { + Self::calculate_padding(url.len(), original_len) + } else { + String::new() + }; + (url, padding) + }) + } + + #[cfg(test)] + fn calculate_padding(new_url_len: usize, original_len: usize) -> String { + if new_url_len >= original_len { + String::new() + } else { + " ".repeat(original_len - new_url_len) + } + } + + fn rewrite_embedded(&self, input: &str) -> Option { + let mut result = input.to_string(); + let mut changed = false; + + for regex in &self.embedded_patterns { + let origin_host = &self.origin_host; + let request_host = &self.request_host; + let request_scheme = &self.request_scheme; + let preserve_length = self.preserve_length; + + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let scheme = &caps["scheme"]; + let path = &caps["path"]; + let quote = &caps["quote"]; + + let original_url_len = scheme.len() + origin_host.len() + path.len(); + + let new_url = if scheme == "//" { + format!("//{}{}", request_host, path) + } else { + format!("{}://{}{}", request_scheme, request_host, path) + }; + + let padding = if preserve_length && new_url.len() < original_url_len { + " ".repeat(original_url_len - new_url.len()) + } else { + String::new() + }; + + format!("{prefix}{new_url}{quote}{padding}") + }); + + if next_value != result { + changed = true; + result = next_value.into_owned(); + } + } + + for regex in &self.bare_host_patterns { + let origin_host = &self.origin_host; + let request_host = &self.request_host; + let preserve_length = self.preserve_length; + + let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { + let prefix = &caps["prefix"]; + let suffix = &caps["suffix"]; + + let padding = if preserve_length && request_host.len() < origin_host.len() { + " ".repeat(origin_host.len() - request_host.len()) + } else { + String::new() + }; + + format!("{prefix}{request_host}{suffix}{padding}") + }); + + if next_value != result { + changed = true; + result = next_value.into_owned(); + } + } + + changed.then_some(result) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integrations::ScriptRewriteAction; + + fn test_config() -> Arc { + Arc::new(NextJsIntegrationConfig { + enabled: true, + rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + }) + } + + fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { + IntegrationScriptContext { + selector, + request_host: "ts.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + } + } + + #[test] + fn structured_rewriter_updates_next_data_payload() { + let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; + let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); + let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); + + match result { + ScriptRewriteAction::Replace(value) => { + assert!(value.contains("ts.example.com") && value.contains("/reviews")); + assert!(value.contains("ts.example.com") && value.contains("/sign-in")); + assert!(value.contains(r#""fallbackHref":"http://origin.example.com/legacy""#)); + assert!(value.contains(r#""protoRelative":"//origin.example.com/assets/logo.png""#)); + } + _ => panic!("Expected rewrite to update payload"), + } + } + + #[test] + fn streamed_rewriter_skips_non_next_payloads() { + let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); + + let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); + assert!(matches!(noop, ScriptRewriteAction::Keep)); + + let payload = + r#"self.__next_f.push([1, "{\"href\":\"https://origin.example.com/app\"}"]);"#; + let result = rewriter.rewrite(payload, &ctx("script")); + assert!( + matches!(result, ScriptRewriteAction::Keep), + "Streamed rewriter should skip __next_f.push payloads (handled by post-processor)" + ); + + let init_script = r#"(self.__next_f = self.__next_f || []).push([0]); var url = "https://origin.example.com/api";"#; + let init_result = rewriter.rewrite(init_script, &ctx("script")); + assert!( + matches!( + init_result, + ScriptRewriteAction::Keep | ScriptRewriteAction::Replace(_) + ), + "Streamed rewriter should handle non-push __next_f scripts" + ); + } + + #[test] + fn rewrite_helper_handles_protocol_relative_urls() { + let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; + let rewritten = rewrite_nextjs_values( + content, + "origin.example.com", + "ts.example.com", + "https", + &["link".into()], + false, + ) + .expect("should rewrite protocol relative link"); + + assert!(rewritten.contains("ts.example.com") && rewritten.contains("/image.png")); + } + + #[test] + fn truncated_string_without_urls_is_not_modified() { + let truncated = r#"self.__next_f.push([ + 1, + '430:I[6061,["749","static/chunks/16bf9003-553c36acd7d8a04b.js","4669","static/chun' +]);"#; + + let result = rewrite_nextjs_values( + truncated, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, + ); + + assert!( + result.is_none(), + "Truncated content without URLs should not be modified" + ); + } + + #[test] + fn complete_string_with_url_is_rewritten() { + let complete = r#"self.__next_f.push([ + 1, + '{"url":"https://origin.example.com/path/to/resource"}' +]);"#; + + let result = rewrite_nextjs_values( + complete, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, + ) + .expect("should rewrite URL"); + + assert!( + result.contains("proxy.example.com") && result.contains("/path/to/resource"), + "Complete URL should be rewritten. Got: {result}" + ); + } + + #[test] + fn truncated_url_without_closing_quote_is_not_modified() { + let truncated_url = r#"self.__next_f.push([ + 1, + '\"url\":\"https://origin.example.com/rss?title=%20' +]);"#; + + let result = rewrite_nextjs_values( + truncated_url, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, + ); + + assert!( + result.is_none(), + "Truncated URL without closing quote should not be modified" + ); + } + + #[test] + fn backslash_n_is_preserved() { + let input = + r#"self.__next_f.push([1, 'foo\n{"url":"https://origin.example.com/test"}\nbar']);"#; + + let backslash_n_pos = input.find(r"\n").expect("should contain \\n"); + assert_eq!( + &input.as_bytes()[backslash_n_pos..backslash_n_pos + 2], + [0x5C, 0x6E], + "Input should have literal backslash-n" + ); + + let rewritten = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, + ) + .expect("should rewrite URL"); + + let new_pos = rewritten.find(r"\n").expect("should contain \\n"); + assert_eq!( + &rewritten.as_bytes()[new_pos..new_pos + 2], + [0x5C, 0x6E], + "Rewritten should preserve literal backslash-n" + ); + } + + #[test] + fn site_production_domain_is_rewritten() { + let input = r#"self.__next_f.push([1, '{"siteProductionDomain":"origin.example.com","url":"https://origin.example.com/news"}']);"#; + + let rewritten = rewrite_nextjs_values( + input, + "origin.example.com", + "proxy.example.com", + "http", + &["url".into(), "siteProductionDomain".into()], + true, + ) + .expect("should rewrite URLs"); + + assert!( + rewritten.contains("proxy.example.com") && rewritten.contains("/news"), + "Expected host to be rewritten. Got: {rewritten}" + ); + assert!( + !rewritten.contains("origin.example.com"), + "Original host should not remain" + ); + } + + #[test] + fn whitespace_padding_calculation() { + let padding = UrlRewriter::calculate_padding(21, 24); + assert_eq!(padding.len(), 3, "Should need 3 spaces"); + assert_eq!(padding, " ", "Should be 3 spaces"); + + let padding = UrlRewriter::calculate_padding(24, 24); + assert_eq!(padding.len(), 0); + + let padding = UrlRewriter::calculate_padding(30, 24); + assert_eq!(padding.len(), 0); + } + + #[test] + fn whitespace_padding_rewrite() { + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + true, + ); + + let original_url = "https://origin.example.com/news"; + let result = rewriter + .rewrite_url_value(original_url) + .expect("URL should be rewritten"); + let (new_url, padding) = result; + + assert_eq!(new_url, "http://proxy.example.com/news"); + assert_eq!( + new_url.len() + padding.len(), + original_url.len(), + "URL + padding should equal original length" + ); + assert_eq!(padding, " ", "Should be 2 spaces"); + } + + #[test] + fn no_padding_when_disabled() { + let rewriter = UrlRewriter::new( + "origin.example.com", + "proxy.example.com", + "http", + &["url".into()], + false, + ); + + let (new_url, padding) = rewriter + .rewrite_url_value("https://origin.example.com/news") + .expect("URL should be rewritten"); + assert_eq!(new_url, "http://proxy.example.com/news"); + assert_eq!(padding, "", "No padding when preserve_length is false"); + } +} diff --git a/crates/common/src/publisher.rs b/crates/common/src/publisher.rs index cd14514..6041536 100644 --- a/crates/common/src/publisher.rs +++ b/crates/common/src/publisher.rs @@ -118,7 +118,7 @@ fn process_response_streaming( // Check if this is HTML content let is_html = params.content_type.contains("text/html"); let is_rsc_flight = params.content_type.contains("text/x-component"); - log::info!( + log::debug!( "process_response_streaming: content_type={}, content_encoding={}, is_html={}, is_rsc_flight={}, origin_host={}", params.content_type, params.content_encoding, @@ -189,7 +189,7 @@ fn process_response_streaming( pipeline.process(body, &mut output)?; } - log::info!( + log::debug!( "Streaming processing complete - output size: {} bytes", output.len() ); @@ -233,7 +233,7 @@ pub fn handle_publisher_request( integration_registry: &IntegrationRegistry, mut req: Request, ) -> Result> { - log::info!("Proxying request to publisher_origin"); + log::debug!("Proxying request to publisher_origin"); // Prebid.js requests are not intercepted here anymore. The HTML processor rewrites // any Prebid script references to `/static/tsjs-ext.min.js` when auto-configure is enabled. @@ -249,7 +249,7 @@ pub fn handle_publisher_request( let request_scheme = detect_request_scheme(&req); // Log detection details for debugging - log::info!( + log::debug!( "Scheme detection - TLS Protocol: {:?}, TLS Cipher: {:?}, Forwarded: {:?}, X-Forwarded-Proto: {:?}, Fastly-SSL: {:?}, Result: {}", req.get_tls_protocol(), req.get_tls_cipher_openssl_name(), @@ -259,7 +259,7 @@ pub fn handle_publisher_request( request_scheme ); - log::info!("Request host: {}, scheme: {}", request_host, request_scheme); + log::debug!("Request host: {}, scheme: {}", request_host, request_scheme); // Generate synthetic identifiers before the request body is consumed. let synthetic_id = get_or_generate_synthetic_id(settings, &req)?; @@ -273,7 +273,7 @@ pub fn handle_publisher_request( }) .unwrap_or(false); - log::info!( + log::debug!( "Proxy synthetic IDs - trusted: {}, has_cookie: {}", synthetic_id, has_synthetic_cookie @@ -282,7 +282,7 @@ pub fn handle_publisher_request( let backend_name = ensure_backend_from_url(&settings.publisher.origin_url)?; let origin_host = settings.publisher.origin_host(); - log::info!( + log::debug!( "Proxying to dynamic backend: {} (from {})", backend_name, settings.publisher.origin_url @@ -296,9 +296,9 @@ pub fn handle_publisher_request( })?; // Log all response headers for debugging - log::info!("Response headers:"); + log::debug!("Response headers:"); for (name, value) in response.get_headers() { - log::info!(" {}: {:?}", name, value); + log::debug!(" {}: {:?}", name, value); } // Check if the response has a text-based content type that we should process @@ -321,7 +321,7 @@ pub fn handle_publisher_request( .to_lowercase(); // Log response details for debugging - log::info!( + log::debug!( "Processing response - Content-Type: {}, Content-Encoding: {}, Request Host: {}, Origin Host: {}", content_type, content_encoding, request_host, origin_host ); @@ -349,12 +349,12 @@ pub fn handle_publisher_request( response.remove_header(header::CONTENT_LENGTH); // Keep Content-Encoding header since we're returning compressed content - log::info!( + log::debug!( "Preserved Content-Encoding: {} for compressed response", content_encoding ); - log::info!("Completed streaming processing of response body"); + log::debug!("Completed streaming processing of response body"); } Err(e) => { log::error!("Failed to process response body: {:?}", e); @@ -363,7 +363,7 @@ pub fn handle_publisher_request( } } } else { - log::info!( + log::debug!( "Skipping response processing - should_process: {}, request_host: '{}'", should_process, request_host diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs index a320a7b..82590e3 100644 --- a/crates/common/src/rsc_flight.rs +++ b/crates/common/src/rsc_flight.rs @@ -47,11 +47,6 @@ impl RscFlightUrlRewriter { request_host: &str, request_scheme: &str, ) -> Self { - // Normalize because some configs include a trailing slash (e.g. `https://origin/`). - // If we keep the trailing slash, replacing `origin_url` inside `origin_url + "/path"` - // would drop the delimiter and yield `https://proxyhostpath`. - let origin_url = origin_url.trim_end_matches('/'); - let request_url = format!("{request_scheme}://{request_host}"); let origin_protocol_relative = format!("//{origin_host}"); let request_protocol_relative = format!("//{request_host}"); @@ -307,25 +302,6 @@ mod tests { ); } - #[test] - fn rewrites_newline_rows_with_trailing_slash_origin_url() { - let input = b"0:[\"https://origin.example.com/page\"]\n"; - - let mut rewriter = RscFlightUrlRewriter::new( - "origin.example.com", - "https://origin.example.com/", - "proxy.example.com", - "https", - ); - - let output = run_rewriter(&mut rewriter, input, 8); - let output_str = String::from_utf8(output).expect("should be valid UTF-8"); - assert_eq!( - output_str, "0:[\"https://proxy.example.com/page\"]\n", - "Output should rewrite URLs without dropping the path slash" - ); - } - #[test] fn rewrites_t_rows_and_updates_length() { let t_content = r#"{"url":"https://origin.example.com/page"}"#; @@ -356,36 +332,6 @@ mod tests { ); } - #[test] - fn rewrites_t_rows_with_trailing_slash_origin_url() { - let t_content = r#"{"url":"https://origin.example.com/page"}"#; - let json_row = "2:[\"ok\"]\n"; - let input = format!("1:T{:x},{}{}", t_content.len(), t_content, json_row); - - let mut rewriter = RscFlightUrlRewriter::new( - "origin.example.com", - "https://origin.example.com/", - "proxy.example.com", - "https", - ); - - let output = run_rewriter(&mut rewriter, input.as_bytes(), 7); - let output_str = String::from_utf8(output).expect("should be valid UTF-8"); - - let rewritten_t_content = r#"{"url":"https://proxy.example.com/page"}"#; - let expected = format!( - "1:T{:x},{}{}", - rewritten_t_content.len(), - rewritten_t_content, - json_row - ); - - assert_eq!( - output_str, expected, - "Output should update T row lengths after rewriting without dropping the path slash" - ); - } - #[test] fn handles_t_row_header_and_body_split_across_chunks() { let t_content = r#"{"url":"https://origin.example.com/page"}"#; diff --git a/crates/common/src/settings.rs b/crates/common/src/settings.rs index 9c689ff..4a41b1e 100644 --- a/crates/common/src/settings.rs +++ b/crates/common/src/settings.rs @@ -53,6 +53,17 @@ impl Publisher { }) .unwrap_or_else(|| self.origin_url.clone()) } + + fn normalize(&mut self) { + let trimmed = self.origin_url.trim_end_matches('/'); + if trimmed != self.origin_url { + log::warn!( + "publisher.origin_url ends with '/': normalizing to {}", + trimmed + ); + self.origin_url = trimmed.to_string(); + } + } } #[derive(Debug, Default, Deserialize, Serialize)] @@ -318,12 +329,15 @@ impl Settings { .change_context(TrustedServerError::Configuration { message: "Failed to build configuration".to_string(), })?; - // You can deserialize (and thus freeze) the entire configuration as - config - .try_deserialize() - .change_context(TrustedServerError::Configuration { - message: "Failed to deserialize configuration".to_string(), - }) + let mut settings: Self = + config + .try_deserialize() + .change_context(TrustedServerError::Configuration { + message: "Failed to deserialize configuration".to_string(), + })?; + + settings.publisher.normalize(); + Ok(settings) } #[must_use] @@ -417,6 +431,7 @@ mod tests { use serde_json::json; use crate::integrations::{nextjs::NextJsIntegrationConfig, prebid::PrebidIntegrationConfig}; + use crate::streaming_replacer::create_url_replacer; use crate::test_support::tests::{crate_test_settings_str, create_test_settings}; #[test] @@ -507,6 +522,35 @@ mod tests { settings.validate().expect("Failed to validate settings"); } + #[test] + fn from_toml_normalizes_trailing_slash_in_origin_url() { + let toml_str = crate_test_settings_str().replace( + r#"origin_url = "https://origin.test-publisher.com""#, + r#"origin_url = "https://origin.test-publisher.com/""#, + ); + + let settings = Settings::from_toml(&toml_str).expect("should parse valid TOML"); + assert_eq!( + settings.publisher.origin_url, "https://origin.test-publisher.com", + "origin_url should be normalized by trimming trailing slashes" + ); + + let origin_host = settings.publisher.origin_host(); + let mut replacer = create_url_replacer( + &origin_host, + &settings.publisher.origin_url, + "proxy.example.com", + "https", + ); + + let processed = replacer.process_chunk(b"https://origin.test-publisher.com/news", true); + let rewritten = String::from_utf8(processed).expect("should be valid UTF-8"); + assert_eq!( + rewritten, "https://proxy.example.com/news", + "rewriting should keep the delimiter slash between host and path" + ); + } + #[test] fn test_settings_missing_required_fields() { let re = Regex::new(r"origin_url = .*").unwrap(); diff --git a/crates/common/src/streaming_replacer.rs b/crates/common/src/streaming_replacer.rs index ea5b9c5..9b975c5 100644 --- a/crates/common/src/streaming_replacer.rs +++ b/crates/common/src/streaming_replacer.rs @@ -156,10 +156,6 @@ pub fn create_url_replacer( request_host: &str, request_scheme: &str, ) -> StreamingReplacer { - // Normalize because some configs include a trailing slash (e.g. `https://origin/`). - // If we keep the trailing slash, replacing `origin_url` inside `origin_url + "/path"` - // would drop the delimiter and yield `https://proxyhostpath`. - let origin_url = origin_url.trim_end_matches('/'); let request_url = format!("{}://{}", request_scheme, request_host); let mut replacements = vec![ @@ -368,29 +364,6 @@ mod tests { assert!(result.contains("//test.example.com/script.js")); } - #[test] - fn test_url_replacer_handles_trailing_slash_origin_url() { - let mut replacer = create_url_replacer( - "origin.example.com", - "https://origin.example.com/", - "test.example.com", - "https", - ); - - let content = r#"Visit https://origin.example.com/news for more info"#; - let processed = replacer.process_chunk(content.as_bytes(), true); - let result = String::from_utf8(processed).expect("should be valid UTF-8"); - - assert!( - result.contains("https://test.example.com/news"), - "URL should keep the slash between host and path. Got: {result}" - ); - assert!( - !result.contains("https://test.example.comnews"), - "URL should not lose the slash between host and path. Got: {result}" - ); - } - #[test] fn test_process_chunk_utf8_boundary() { let mut replacer = diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md index 8b54a86..0f67065 100644 --- a/docs/RSC_HYDRATION_FINDINGS.md +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -185,7 +185,7 @@ At end-of-document, a post-processor handles cross-script T-chunks: 5. **Splits back on markers** to get individual rewritten payloads 6. **Rebuilds the HTML** with rewritten scripts -This phase is gated by a cheap `should_process` preflight so non‑Next.js pages do not pay the extra pass. +This phase is gated by a cheap `should_process` preflight so non‑Next.js pages do not pay the extra pass ([html_post_process.rs:36](crates/common/src/integrations/nextjs/html_post_process.rs#L36)). ### Marker-Based Cross-Script Processing @@ -199,7 +199,7 @@ The marker `\x00SPLIT\x00` is chosen because: - Easily identifiable for splitting - Won't be confused with any escape sequence -**Implementation:** Marker constant at [nextjs.rs:40](crates/common/src/integrations/nextjs.rs#L40) and combine/split logic in [nextjs.rs:1000](crates/common/src/integrations/nextjs.rs#L1000) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) #### Step 2: Find T-Chunks Across Combined Content @@ -207,7 +207,7 @@ Scan the combined stream for `ID:T,` headers, then consume exactly ` The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. -**Implementation:** T-chunk discovery at [nextjs.rs:980](crates/common/src/integrations/nextjs.rs#L980) with marker-aware escape sequence iterator at [nextjs.rs:643](crates/common/src/integrations/nextjs.rs#L643) +**Implementation:** T-chunk discovery at [rsc.rs:194](crates/common/src/integrations/nextjs/rsc.rs#L194) with marker-aware escape sequence iterator at [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) #### Step 3: Rewrite URLs and Recalculate Lengths @@ -238,7 +238,7 @@ The post-processing is implemented as an integration hook, allowing other integr ### Registration -**Implementation:** Next.js registers its HTML post-processor in [nextjs.rs:41](crates/common/src/integrations/nextjs.rs#L41) +**Implementation:** Next.js registers its HTML post-processor in [mod.rs:47](crates/common/src/integrations/nextjs/mod.rs#L47) ### Execution in HTML Processor @@ -250,10 +250,10 @@ The post-processing is implemented as an integration hook, allowing other integr `T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: -- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [nextjs.rs:643](crates/common/src/integrations/nextjs.rs#L643) -- Counting unescaped bytes: [nextjs.rs:780](crates/common/src/integrations/nextjs.rs#L780) -- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [nextjs.rs:785](crates/common/src/integrations/nextjs.rs#L785) -- Marker-aware byte length calculation for cross-script processing: [nextjs.rs:973](crates/common/src/integrations/nextjs.rs#L973) +- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) +- Counting unescaped bytes: [rsc.rs:158](crates/common/src/integrations/nextjs/rsc.rs#L158) +- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:163](crates/common/src/integrations/nextjs/rsc.rs#L163) +- Marker-aware byte length calculation for cross-script processing: [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) --- @@ -272,7 +272,7 @@ The solution handles multiple URL formats in RSC content: ### Regex Pattern -**Implementation:** Regex-based rewriting in [nextjs.rs:870](crates/common/src/integrations/nextjs.rs#L870) +**Implementation:** Regex-based rewriting in [rsc.rs:276](crates/common/src/integrations/nextjs/rsc.rs#L276) This pattern handles: @@ -418,31 +418,35 @@ Because post-processing runs inside the HTML processor (before recompression), ` ## Implementation Files -| File | Purpose | -| -------------------------------------------- | ------------------------------------------- | -| `crates/common/src/integrations/nextjs.rs` | RSC rewriting logic, post-processor | -| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | -| `crates/common/src/integrations/mod.rs` | Module exports | -| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | -| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | -| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | - -### Key Functions in nextjs.rs - -| Function | Line | Purpose | -| ---------------------------------------------- | ------------------------------------------------------ | ---------------------------------------------------- | -| `extract_rsc_push_payload` | [257](crates/common/src/integrations/nextjs.rs#L257) | Extract string from `self.__next_f.push([1, '...'])` | -| `EscapeSequenceIter` | [643](crates/common/src/integrations/nextjs.rs#L643) | Shared iterator for escape sequence parsing | -| `calculate_unescaped_byte_length` | [780](crates/common/src/integrations/nextjs.rs#L780) | Count unescaped bytes with escape handling | -| `consume_unescaped_bytes` | [785](crates/common/src/integrations/nextjs.rs#L785) | Advance through string consuming N bytes | -| `find_tchunks` | [865](crates/common/src/integrations/nextjs.rs#L865) | Find T-chunks in single script | -| `rewrite_rsc_url_string` | [870](crates/common/src/integrations/nextjs.rs#L870) | URL rewriting with escape handling | -| `rewrite_rsc_tchunks` | [900](crates/common/src/integrations/nextjs.rs#L900) | Single-script T-chunk processing | -| `calculate_unescaped_byte_length_skip_markers` | [973](crates/common/src/integrations/nextjs.rs#L973) | Count unescaped bytes, excluding markers | -| `find_tchunks_with_markers` | [980](crates/common/src/integrations/nextjs.rs#L980) | Find T-chunks in marker-combined content | -| `rewrite_rsc_scripts_combined` | [1000](crates/common/src/integrations/nextjs.rs#L1000) | Cross-script T-chunk processing | -| `find_rsc_push_scripts` | [1109](crates/common/src/integrations/nextjs.rs#L1109) | Find all RSC scripts in HTML | -| `post_process_rsc_html` | [1183](crates/common/src/integrations/nextjs.rs#L1183) | Complete HTML post-processing | +| File | Purpose | +| ------------------------------------------------------------ | --------------------------------------------------------- | +| `crates/common/src/integrations/nextjs/mod.rs` | Next.js integration config + registration | +| `crates/common/src/integrations/nextjs/html_post_process.rs` | HTML post-processing for cross-script RSC | +| `crates/common/src/integrations/nextjs/rsc.rs` | RSC T-chunk parsing + URL rewriting | +| `crates/common/src/integrations/nextjs/script_rewriter.rs` | Script rewrites (`__NEXT_DATA__`, inline `__next_f.push`) | +| `crates/common/src/rsc_flight.rs` | Flight response rewriting (`text/x-component`) | +| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | +| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | +| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | + +### Key Functions (Next.js integration) + +| Symbol | Location | Purpose | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------- | +| `extract_rsc_push_payload` | [script_rewriter.rs:86](crates/common/src/integrations/nextjs/script_rewriter.rs#L86) | Extract string from `self.__next_f.push([1, '...'])` | +| `EscapeSequenceIter` | [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) | Shared iterator for escape sequence parsing | +| `calculate_unescaped_byte_length` | [rsc.rs:158](crates/common/src/integrations/nextjs/rsc.rs#L158) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [rsc.rs:163](crates/common/src/integrations/nextjs/rsc.rs#L163) | Advance through string consuming N bytes | +| `find_tchunks` | [rsc.rs:252](crates/common/src/integrations/nextjs/rsc.rs#L252) | Find T-chunks in a single payload | +| `RscUrlRewriter` | [rsc.rs:266](crates/common/src/integrations/nextjs/rsc.rs#L266) | Regex URL rewriting (compiled once per rewrite call) | +| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:336](crates/common/src/integrations/nextjs/rsc.rs#L336) | Single-payload T-chunk processing | +| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [html_post_process.rs:67](crates/common/src/integrations/nextjs/html_post_process.rs#L67) | Find all RSC scripts in HTML | +| `post_process_rsc_html_in_place` | [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132) | Complete HTML post-processing | --- @@ -476,25 +480,25 @@ For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. ## Deconstruction and Reconstruction Logic -The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html()` at [nextjs.rs:1242](crates/common/src/integrations/nextjs.rs#L1242). +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html_in_place()` at [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132). ### Step 1: Find RSC Push Scripts Find all `self.__next_f.push([1, "..."])` scripts in the HTML and extract their payloads. -**Implementation:** `find_rsc_push_scripts()` at [nextjs.rs:1162](crates/common/src/integrations/nextjs.rs#L1162) +**Implementation:** `find_rsc_push_scripts()` at [html_post_process.rs:67](crates/common/src/integrations/nextjs/html_post_process.rs#L67) ### Step 2: Combine Payloads with Markers Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. -**Implementation:** Marker constant at [nextjs.rs:903](crates/common/src/integrations/nextjs.rs#L903), combining logic in `rewrite_rsc_scripts_combined()` at [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) ### Step 3: Find T-Chunks Across Combined Content Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. -**Implementation:** `find_tchunks_with_markers()` at [nextjs.rs:1002](crates/common/src/integrations/nextjs.rs#L1002), using `consume_unescaped_bytes_skip_markers()` at [nextjs.rs:907](crates/common/src/integrations/nextjs.rs#L907) +**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:64](crates/common/src/integrations/nextjs/rsc.rs#L64) ### Step 4: Rewrite URLs in T-Chunk Content @@ -505,25 +509,25 @@ Rewrite all URL patterns in the T-chunk content: - `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) - `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) -**Implementation:** `rewrite_rsc_url_string()` at [nextjs.rs:800](crates/common/src/integrations/nextjs.rs#L800) +**Implementation:** `RscUrlRewriter::rewrite()` at [rsc.rs:297](crates/common/src/integrations/nextjs/rsc.rs#L297) ### Step 5: Recalculate T-Chunk Length Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. -**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [nextjs.rs:988](crates/common/src/integrations/nextjs.rs#L988) +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) ### Step 6: Split Back on Markers Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. -**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [nextjs.rs:1053](crates/common/src/integrations/nextjs.rs#L1053) +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) ### Step 7: Reconstruct HTML Replace each original script with its rewritten version in the HTML. -**Implementation:** Part of `post_process_rsc_html()` at [nextjs.rs:1242](crates/common/src/integrations/nextjs.rs#L1242) +**Implementation:** Part of `post_process_rsc_html_in_place()` at [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132) ### Visual Example @@ -601,4 +605,4 @@ The key insights are: - React Flight Protocol: Internal React implementation for RSC streaming: https://github.com/vercel/next.js/tree/v14.2.35 - Next.js App Router: https://nextjs.org/docs/app - lol_html: https://github.com/nicksrandall/lol-html (streaming HTML rewriter) -- Implementation: `crates/common/src/integrations/nextjs.rs` +- Implementation: `crates/common/src/integrations/nextjs/mod.rs` and `crates/common/src/integrations/nextjs/` From 86a807b1e27d04713f3ae94ad1a4ef5fbe1dd27a Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 21:30:41 -0800 Subject: [PATCH 08/11] Another refactor --- .../integrations/nextjs/html_post_process.rs | 59 ++++++++- crates/common/src/integrations/nextjs/rsc.rs | 118 ++++++++++++++++-- .../integrations/nextjs/script_rewriter.rs | 34 ++++- crates/common/src/rsc_flight.rs | 8 ++ docs/RSC_HYDRATION_FINDINGS.md | 51 ++++---- 5 files changed, 229 insertions(+), 41 deletions(-) diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index f29f13f..804c09f 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -84,8 +84,11 @@ fn find_rsc_push_scripts(html: &str) -> Vec { let mut i = payload_start; let bytes = html.as_bytes(); while i < bytes.len() { - if bytes[i] == b'\\' { - i += 2; + if bytes[i] == b'\\' && i + 1 < bytes.len() { + i += 2; // Skip escape sequence (safe: we checked i+1 exists) + } else if bytes[i] == b'\\' { + // Trailing backslash at end of content - malformed + break; } else if bytes[i] == quote as u8 { break; } else { @@ -93,7 +96,7 @@ fn find_rsc_push_scripts(html: &str) -> Vec { } } - if i >= bytes.len() { + if i >= bytes.len() || bytes[i] != quote as u8 { search_pos = payload_start; continue; } @@ -330,4 +333,54 @@ mod tests { "Origin URL should be removed. Got: {rewritten}" ); } + + #[test] + fn handles_trailing_backslash_gracefully() { + // Malformed content with trailing backslash should not panic + let html = r#" + + +"#; + + let scripts = find_rsc_push_scripts(html); + // The first script is malformed (trailing backslash escapes the quote), + // so it won't be detected as valid. The second one should be found. + assert!( + scripts.len() >= 1, + "Should find at least the valid script. Found: {}", + scripts.len() + ); + + // Should not panic during processing + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + assert!( + result.contains("test.example.com") || result.contains("origin.example.com"), + "Processing should complete without panic" + ); + } + + #[test] + fn handles_unterminated_string_gracefully() { + // Content where string never closes - should not hang or panic + let html = r#" + +"#; + + let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + assert_eq!(result, html, "HTML without origin should be unchanged"); + } } diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs index 3dea8b1..885911e 100644 --- a/crates/common/src/integrations/nextjs/rsc.rs +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -10,6 +10,10 @@ static TCHUNK_PATTERN: Lazy = /// Marker used to track script boundaries when combining RSC content. pub(crate) const RSC_MARKER: &str = "\x00SPLIT\x00"; +/// Maximum combined payload size for cross-script processing (10 MB). +/// Payloads exceeding this limit are processed individually without cross-script T-chunk handling. +const MAX_COMBINED_PAYLOAD_SIZE: usize = 10 * 1024 * 1024; + // ============================================================================= // Escape Sequence Parsing // ============================================================================= @@ -299,6 +303,7 @@ impl RscUrlRewriter { return Cow::Borrowed(input); } + // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) let replaced = self .pattern .replace_all(input, |caps: ®ex::Captures<'_>| { @@ -310,18 +315,20 @@ impl RscUrlRewriter { } }); - let still_contains_origin = match &replaced { - Cow::Borrowed(s) => s.contains(&self.origin_host), - Cow::Owned(s) => s.contains(&self.origin_host), + // Phase 2: Handle bare host occurrences not matched by the URL regex + // (e.g., `siteProductionDomain`). Only check if regex made no changes, + // because if it did, we already know origin_host was present. + let text = match &replaced { + Cow::Borrowed(s) => *s, + Cow::Owned(s) => s.as_str(), }; - if !still_contains_origin { + if !text.contains(&self.origin_host) { return replaced; } - // Also rewrite bare host occurrences inside RSC payloads (e.g. `siteProductionDomain`). - let owned = replaced.into_owned(); - Cow::Owned(owned.replace(&self.origin_host, &self.request_host)) + // Bare host replacement needed + Cow::Owned(text.replace(&self.origin_host, &self.request_host)) } pub(crate) fn rewrite_to_string(&self, input: &str) -> String { @@ -398,7 +405,26 @@ pub fn rewrite_rsc_scripts_combined( return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)]; } - let mut combined = payloads[0].to_string(); + // Check total size before allocating combined buffer + let total_size: usize = + payloads.iter().map(|p| p.len()).sum::() + (payloads.len() - 1) * RSC_MARKER.len(); + + if total_size > MAX_COMBINED_PAYLOAD_SIZE { + // Fall back to individual processing if combined size is too large. + // This sacrifices cross-script T-chunk correctness for memory safety. + log::warn!( + "RSC combined payload size {} exceeds limit {}, processing individually", + total_size, + MAX_COMBINED_PAYLOAD_SIZE + ); + return payloads + .iter() + .map(|p| rewrite_rsc_tchunks_with_rewriter(p, &rewriter)) + .collect(); + } + + let mut combined = String::with_capacity(total_size); + combined.push_str(payloads[0]); for payload in &payloads[1..] { combined.push_str(RSC_MARKER); combined.push_str(payload); @@ -591,4 +617,80 @@ mod tests { "Bare host should be rewritten inside RSC payload. Got: {rewritten}" ); } + + #[test] + fn single_payload_bypasses_combining() { + // When there's only one payload, we should process it directly without combining + // Content: {"url":"https://origin.example.com/x"} = 37 bytes = 0x25 hex + let payload = r#"1a:T25,{"url":"https://origin.example.com/x"}"#; + let payloads: Vec<&str> = vec![payload]; + + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 1); + assert!( + results[0].contains("test.example.com"), + "Single payload should be rewritten. Got: {}", + results[0] + ); + // The length should be updated for the rewritten URL + // {"url":"https://test.example.com/x"} = 35 bytes = 0x23 hex + assert!( + results[0].contains(":T23,"), + "T-chunk length should be updated. Got: {}", + results[0] + ); + } + + #[test] + fn empty_payloads_returns_empty() { + let payloads: Vec<&str> = vec![]; + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + assert!(results.is_empty()); + } + + #[test] + fn no_origin_in_payloads_returns_unchanged() { + let payloads: Vec<&str> = vec![r#"1a:T10,{"key":"value"}"#, r#"1b:T10,{"foo":"bar"}"#]; + + let results = rewrite_rsc_scripts_combined( + &payloads, + "origin.example.com", + "test.example.com", + "https", + ); + + assert_eq!(results.len(), 2); + // Content should be identical - note that T-chunk lengths may be recalculated + // even if content is unchanged (due to how the algorithm works) + assert!( + !results[0].contains("origin.example.com") && !results[0].contains("test.example.com"), + "No host should be present in payload without URLs" + ); + assert!( + !results[1].contains("origin.example.com") && !results[1].contains("test.example.com"), + "No host should be present in payload without URLs" + ); + // The content after T-chunk header should be preserved + assert!( + results[0].contains(r#"{"key":"value"}"#), + "Content should be preserved. Got: {}", + results[0] + ); + assert!( + results[1].contains(r#"{"foo":"bar"}"#), + "Content should be preserved. Got: {}", + results[1] + ); + } } diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs index 440ea3a..94c21ed 100644 --- a/crates/common/src/integrations/nextjs/script_rewriter.rs +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -36,14 +36,22 @@ impl NextJsScriptRewriter { content: &str, ctx: &IntegrationScriptContext<'_>, ) -> ScriptRewriteAction { - if let Some(rewritten) = rewrite_nextjs_values( - content, + if ctx.origin_host.is_empty() + || ctx.request_host.is_empty() + || self.config.rewrite_attributes.is_empty() + { + return ScriptRewriteAction::keep(); + } + + let rewriter = UrlRewriter::new( ctx.origin_host, ctx.request_host, ctx.request_scheme, &self.config.rewrite_attributes, - false, - ) { + false, // preserve_length not used for structured payloads + ); + + if let Some(rewritten) = rewrite_nextjs_values_with_rewriter(content, &rewriter) { ScriptRewriteAction::replace(rewritten) } else { ScriptRewriteAction::keep() @@ -151,6 +159,11 @@ impl IntegrationScriptRewriter for NextJsScriptRewriter { } } +fn rewrite_nextjs_values_with_rewriter(content: &str, rewriter: &UrlRewriter) -> Option { + rewriter.rewrite_embedded(content) +} + +#[cfg(test)] fn rewrite_nextjs_values( content: &str, origin_host: &str, @@ -171,15 +184,26 @@ fn rewrite_nextjs_values( preserve_length, ); - rewriter.rewrite_embedded(content) + rewrite_nextjs_values_with_rewriter(content, &rewriter) } +/// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`). +/// +/// This rewriter uses attribute-specific regex patterns to find and replace URLs +/// in JSON content. It handles full URLs, protocol-relative URLs, and bare hostnames. +/// +/// The `preserve_length` option adds whitespace padding to maintain byte length, +/// which was an early attempt at RSC compatibility. This is no longer needed for +/// RSC payloads (T-chunk lengths are recalculated instead), but is kept for +/// potential future use cases where length preservation is required. struct UrlRewriter { origin_host: String, request_host: String, request_scheme: String, embedded_patterns: Vec, bare_host_patterns: Vec, + /// When true, adds whitespace padding to maintain original byte length. + /// Currently unused in production (always false). preserve_length: bool, } diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs index 82590e3..d850010 100644 --- a/crates/common/src/rsc_flight.rs +++ b/crates/common/src/rsc_flight.rs @@ -21,6 +21,14 @@ enum RowState { /// /// For `T` rows, the length prefix is the UTF-8 byte length of the content bytes. If we rewrite /// URLs inside the content, we must recompute the length and rewrite the header. +/// +/// ## Limitations +/// +/// This rewriter performs simple string replacement and does NOT handle JSON escape sequences. +/// URLs like `\/\/origin.example.com` (JSON-escaped slashes) will not be rewritten. This is +/// acceptable because Flight responses from client-side navigation typically contain plain URLs, +/// not doubly-escaped JSON-in-JS content. For inlined `__next_f` data in HTML (which can have +/// escape sequences), the HTML post-processor in `integrations/nextjs/` handles those cases. pub struct RscFlightUrlRewriter { origin_url: String, origin_http_url: Option, diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md index 0f67065..66de477 100644 --- a/docs/RSC_HYDRATION_FINDINGS.md +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -199,7 +199,7 @@ The marker `\x00SPLIT\x00` is chosen because: - Easily identifiable for splitting - Won't be confused with any escape sequence -**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:417](crates/common/src/integrations/nextjs/rsc.rs#L417) #### Step 2: Find T-Chunks Across Combined Content @@ -207,7 +207,7 @@ Scan the combined stream for `ID:T,` headers, then consume exactly ` The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. -**Implementation:** T-chunk discovery at [rsc.rs:194](crates/common/src/integrations/nextjs/rsc.rs#L194) with marker-aware escape sequence iterator at [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) +**Implementation:** T-chunk discovery at [rsc.rs:198](crates/common/src/integrations/nextjs/rsc.rs#L198) with marker-aware escape sequence iterator at [rsc.rs:68](crates/common/src/integrations/nextjs/rsc.rs#L68) #### Step 3: Rewrite URLs and Recalculate Lengths @@ -250,10 +250,11 @@ The post-processing is implemented as an integration hook, allowing other integr `T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: -- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) -- Counting unescaped bytes: [rsc.rs:158](crates/common/src/integrations/nextjs/rsc.rs#L158) -- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:163](crates/common/src/integrations/nextjs/rsc.rs#L163) -- Marker-aware byte length calculation for cross-script processing: [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) +- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:33](crates/common/src/integrations/nextjs/rsc.rs#L33) +- Counting unescaped bytes: [rsc.rs:162](crates/common/src/integrations/nextjs/rsc.rs#L162) +- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:167](crates/common/src/integrations/nextjs/rsc.rs#L167) +- Marker-aware byte length calculation for cross-script processing: [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) +- Size-limited combined payload allocation (10 MB max): [rsc.rs:395](crates/common/src/integrations/nextjs/rsc.rs#L395) --- @@ -272,7 +273,7 @@ The solution handles multiple URL formats in RSC content: ### Regex Pattern -**Implementation:** Regex-based rewriting in [rsc.rs:276](crates/common/src/integrations/nextjs/rsc.rs#L276) +**Implementation:** Regex-based rewriting in [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) This pattern handles: @@ -435,18 +436,18 @@ Because post-processing runs inside the HTML processor (before recompression), ` | Symbol | Location | Purpose | | ---------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------- | -| `extract_rsc_push_payload` | [script_rewriter.rs:86](crates/common/src/integrations/nextjs/script_rewriter.rs#L86) | Extract string from `self.__next_f.push([1, '...'])` | -| `EscapeSequenceIter` | [rsc.rs:29](crates/common/src/integrations/nextjs/rsc.rs#L29) | Shared iterator for escape sequence parsing | -| `calculate_unescaped_byte_length` | [rsc.rs:158](crates/common/src/integrations/nextjs/rsc.rs#L158) | Count unescaped bytes with escape handling | -| `consume_unescaped_bytes` | [rsc.rs:163](crates/common/src/integrations/nextjs/rsc.rs#L163) | Advance through string consuming N bytes | -| `find_tchunks` | [rsc.rs:252](crates/common/src/integrations/nextjs/rsc.rs#L252) | Find T-chunks in a single payload | -| `RscUrlRewriter` | [rsc.rs:266](crates/common/src/integrations/nextjs/rsc.rs#L266) | Regex URL rewriting (compiled once per rewrite call) | -| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:336](crates/common/src/integrations/nextjs/rsc.rs#L336) | Single-payload T-chunk processing | -| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) | Count unescaped bytes, excluding markers | -| `find_tchunks_with_markers` | [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256) | Find T-chunks in marker-combined content | -| `rewrite_rsc_scripts_combined` | [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) | Cross-script T-chunk processing | +| `extract_rsc_push_payload` | [script_rewriter.rs:94](crates/common/src/integrations/nextjs/script_rewriter.rs#L94) | Extract string from `self.__next_f.push([1, '...'])` | +| `EscapeSequenceIter` | [rsc.rs:33](crates/common/src/integrations/nextjs/rsc.rs#L33) | Shared iterator for escape sequence parsing | +| `calculate_unescaped_byte_length` | [rsc.rs:162](crates/common/src/integrations/nextjs/rsc.rs#L162) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [rsc.rs:167](crates/common/src/integrations/nextjs/rsc.rs#L167) | Advance through string consuming N bytes | +| `find_tchunks` | [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256) | Find T-chunks in a single payload | +| `RscUrlRewriter` | [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) | Regex URL rewriting (compiled once per rewrite call) | +| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:343](crates/common/src/integrations/nextjs/rsc.rs#L343) | Single-payload T-chunk processing | +| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) | Cross-script T-chunk processing | | `find_rsc_push_scripts` | [html_post_process.rs:67](crates/common/src/integrations/nextjs/html_post_process.rs#L67) | Find all RSC scripts in HTML | -| `post_process_rsc_html_in_place` | [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132) | Complete HTML post-processing | +| `post_process_rsc_html_in_place` | [html_post_process.rs:135](crates/common/src/integrations/nextjs/html_post_process.rs#L135) | Complete HTML post-processing | --- @@ -480,7 +481,7 @@ For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. ## Deconstruction and Reconstruction Logic -The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html_in_place()` at [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132). +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html_in_place()` at [html_post_process.rs:136](crates/common/src/integrations/nextjs/html_post_process.rs#L136). ### Step 1: Find RSC Push Scripts @@ -492,13 +493,13 @@ Find all `self.__next_f.push([1, "..."])` scripts in the HTML and extract their Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. -**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) ### Step 3: Find T-Chunks Across Combined Content Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. -**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:64](crates/common/src/integrations/nextjs/rsc.rs#L64) +**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:68](crates/common/src/integrations/nextjs/rsc.rs#L68) ### Step 4: Rewrite URLs in T-Chunk Content @@ -509,25 +510,25 @@ Rewrite all URL patterns in the T-chunk content: - `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) - `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) -**Implementation:** `RscUrlRewriter::rewrite()` at [rsc.rs:297](crates/common/src/integrations/nextjs/rsc.rs#L297) +**Implementation:** `RscUrlRewriter::rewrite()` at [rsc.rs:301](crates/common/src/integrations/nextjs/rsc.rs#L301) ### Step 5: Recalculate T-Chunk Length Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. -**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) ### Step 6: Split Back on Markers Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. -**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) ### Step 7: Reconstruct HTML Replace each original script with its rewritten version in the HTML. -**Implementation:** Part of `post_process_rsc_html_in_place()` at [html_post_process.rs:132](crates/common/src/integrations/nextjs/html_post_process.rs#L132) +**Implementation:** Part of `post_process_rsc_html_in_place()` at [html_post_process.rs:135](crates/common/src/integrations/nextjs/html_post_process.rs#L135) ### Visual Example From 7c645f3b8d13a19511096d48192e691b36c245a4 Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Mon, 15 Dec 2025 21:35:19 -0800 Subject: [PATCH 09/11] Fixed formatting --- crates/common/src/integrations/nextjs/html_post_process.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index 804c09f..fea1ad9 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -346,7 +346,7 @@ mod tests { // The first script is malformed (trailing backslash escapes the quote), // so it won't be detected as valid. The second one should be found. assert!( - scripts.len() >= 1, + !scripts.is_empty(), "Should find at least the valid script. Found: {}", scripts.len() ); From dd02e05a044dc978b95f49dcc1a8623f5edea9ad Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:22:46 -0800 Subject: [PATCH 10/11] Additional refactoring --- crates/common/src/host_rewrite.rs | 49 ++ crates/common/src/html_processor.rs | 15 +- crates/common/src/integrations/mod.rs | 8 +- .../integrations/nextjs/html_post_process.rs | 706 +++++++++++++++--- crates/common/src/integrations/nextjs/mod.rs | 123 ++- crates/common/src/integrations/nextjs/rsc.rs | 338 ++++++--- .../integrations/nextjs/rsc_placeholders.rs | 242 ++++++ .../integrations/nextjs/script_rewriter.rs | 364 ++------- .../common/src/integrations/nextjs/shared.rs | 229 ++++++ crates/common/src/integrations/registry.rs | 111 ++- crates/common/src/lib.rs | 1 + crates/common/src/rsc_flight.rs | 26 +- docs/RSC_HYDRATION_FINDINGS.md | 158 ++-- trusted-server.toml | 2 + 14 files changed, 1755 insertions(+), 617 deletions(-) create mode 100644 crates/common/src/host_rewrite.rs create mode 100644 crates/common/src/integrations/nextjs/rsc_placeholders.rs create mode 100644 crates/common/src/integrations/nextjs/shared.rs diff --git a/crates/common/src/host_rewrite.rs b/crates/common/src/host_rewrite.rs new file mode 100644 index 0000000..cb15f40 --- /dev/null +++ b/crates/common/src/host_rewrite.rs @@ -0,0 +1,49 @@ +/// Rewrite bare host occurrences (e.g. `origin.example.com/news`) only when the match is a full +/// hostname token, not part of a larger hostname like `cdn.origin.example.com`. +/// +/// This is used by both HTML (`__next_f` payloads) and Flight (`text/x-component`) rewriting to +/// avoid corrupting unrelated hostnames. +pub(crate) fn rewrite_bare_host_at_boundaries( + text: &str, + origin_host: &str, + request_host: &str, +) -> Option { + if origin_host.is_empty() || request_host.is_empty() || !text.contains(origin_host) { + return None; + } + + fn is_host_char(byte: u8) -> bool { + byte.is_ascii_alphanumeric() || matches!(byte, b'.' | b'-' | b':') + } + + let origin_len = origin_host.len(); + let bytes = text.as_bytes(); + let mut out = String::with_capacity(text.len()); + let mut search = 0; + let mut replaced_any = false; + + while let Some(rel) = text[search..].find(origin_host) { + let pos = search + rel; + let end = pos + origin_len; + + let before_ok = pos == 0 || !is_host_char(bytes[pos - 1]); + let after_ok = end == bytes.len() || !is_host_char(bytes[end]); + + if before_ok && after_ok { + out.push_str(&text[search..pos]); + out.push_str(request_host); + replaced_any = true; + search = end; + } else { + out.push_str(&text[search..pos + 1]); + search = pos + 1; + } + } + + if !replaced_any { + return None; + } + + out.push_str(&text[search..]); + Some(out) +} diff --git a/crates/common/src/html_processor.rs b/crates/common/src/html_processor.rs index 392fb0d..1803436 100644 --- a/crates/common/src/html_processor.rs +++ b/crates/common/src/html_processor.rs @@ -9,9 +9,9 @@ use std::sync::Arc; use lol_html::{element, html_content::ContentType, text, Settings as RewriterSettings}; use crate::integrations::{ - AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationHtmlContext, - IntegrationHtmlPostProcessor, IntegrationRegistry, IntegrationScriptContext, - ScriptRewriteAction, + AttributeRewriteOutcome, IntegrationAttributeContext, IntegrationDocumentState, + IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationRegistry, + IntegrationScriptContext, ScriptRewriteAction, }; use crate::settings::Settings; use crate::streaming_processor::{HtmlRewriterAdapter, StreamProcessor}; @@ -23,6 +23,7 @@ struct HtmlWithPostProcessing { origin_host: String, request_host: String, request_scheme: String, + document_state: IntegrationDocumentState, } impl StreamProcessor for HtmlWithPostProcessing { @@ -40,6 +41,7 @@ impl StreamProcessor for HtmlWithPostProcessing { request_host: &self.request_host, request_scheme: &self.request_scheme, origin_host: &self.origin_host, + document_state: &self.document_state, }; // Preflight to avoid allocating a `String` unless at least one post-processor wants to run. @@ -77,6 +79,7 @@ impl StreamProcessor for HtmlWithPostProcessing { fn reset(&mut self) { self.inner.reset(); + self.document_state.clear(); } } @@ -110,6 +113,7 @@ impl HtmlProcessorConfig { /// Create an HTML processor with URL replacement and optional Prebid injection pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcessor { let post_processors = config.integrations.html_post_processors(); + let document_state = IntegrationDocumentState::default(); // Simplified URL patterns structure - stores only core data and generates variants on-demand struct UrlPatterns { @@ -404,15 +408,19 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso let selector = script_rewriter.selector(); let rewriter = script_rewriter.clone(); let patterns = patterns.clone(); + let document_state = document_state.clone(); element_content_handlers.push(text!(selector, { let rewriter = rewriter.clone(); let patterns = patterns.clone(); + let document_state = document_state.clone(); move |text| { let ctx = IntegrationScriptContext { selector, request_host: &patterns.request_host, request_scheme: &patterns.request_scheme, origin_host: &patterns.origin_host, + is_last_in_text_node: text.last_in_text_node(), + document_state: &document_state, }; match rewriter.rewrite(text.as_str(), &ctx) { ScriptRewriteAction::Keep => {} @@ -439,6 +447,7 @@ pub fn create_html_processor(config: HtmlProcessorConfig) -> impl StreamProcesso origin_host: config.origin_host, request_host: config.request_host, request_scheme: config.request_scheme, + document_state, } } diff --git a/crates/common/src/integrations/mod.rs b/crates/common/src/integrations/mod.rs index 1afda3d..5cc8fc2 100644 --- a/crates/common/src/integrations/mod.rs +++ b/crates/common/src/integrations/mod.rs @@ -10,10 +10,10 @@ pub mod testlight; pub use registry::{ AttributeRewriteAction, AttributeRewriteOutcome, IntegrationAttributeContext, - IntegrationAttributeRewriter, IntegrationEndpoint, IntegrationHtmlContext, - IntegrationHtmlPostProcessor, IntegrationMetadata, IntegrationProxy, IntegrationRegistration, - IntegrationRegistrationBuilder, IntegrationRegistry, IntegrationScriptContext, - IntegrationScriptRewriter, ScriptRewriteAction, + IntegrationAttributeRewriter, IntegrationDocumentState, IntegrationEndpoint, + IntegrationHtmlContext, IntegrationHtmlPostProcessor, IntegrationMetadata, IntegrationProxy, + IntegrationRegistration, IntegrationRegistrationBuilder, IntegrationRegistry, + IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; type IntegrationBuilder = fn(&Settings) -> Option; diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index fea1ad9..1171331 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -1,23 +1,19 @@ +use std::cell::{Cell, RefCell}; +use std::rc::Rc; use std::sync::Arc; +use std::sync::Mutex; -use once_cell::sync::Lazy; -use regex::Regex; +use lol_html::{text, Settings as RewriterSettings}; use crate::integrations::{IntegrationHtmlContext, IntegrationHtmlPostProcessor}; -use super::rsc::rewrite_rsc_scripts_combined; +use super::rsc::rewrite_rsc_scripts_combined_with_limit; +use super::rsc_placeholders::{ + NextJsRscPostProcessState, RSC_PAYLOAD_PLACEHOLDER_PREFIX, RSC_PAYLOAD_PLACEHOLDER_SUFFIX, +}; +use super::shared::find_rsc_push_payload_range; use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; -/// RSC push script pattern for HTML post-processing. -static RSC_SCRIPT_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"]*>\s*self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#) - .expect("valid RSC script regex") -}); - -/// RSC script ending pattern. -static RSC_SCRIPT_ENDING: Lazy = - Lazy::new(|| Regex::new(r#"^\s*\]\s*\)\s*;?\s*"#).expect("valid RSC ending regex")); - pub(crate) struct NextJsHtmlPostProcessor { config: Arc, } @@ -34,93 +30,285 @@ impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { } fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { + let _ = html; if !self.config.enabled || self.config.rewrite_attributes.is_empty() { return false; } - html.contains("__next_f.push") && html.contains(ctx.origin_host) + let Some(state) = ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + else { + return false; + }; + + let guard = state.lock().unwrap_or_else(|e| e.into_inner()); + !guard.payloads.is_empty() } fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { + let Some(state) = ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + else { + return false; + }; + + let payloads = { + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + guard.take_payloads() + }; + if payloads.is_empty() { + return false; + } + + let payload_refs: Vec<&str> = payloads.iter().map(String::as_str).collect(); + let mut rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( + payload_refs.as_slice(), + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + self.config.max_combined_payload_bytes, + ); + + if rewritten_payloads.len() != payloads.len() { + log::warn!( + "NextJs post-process skipping due to rewrite payload count mismatch: original={}, rewritten={}", + payloads.len(), + rewritten_payloads.len() + ); + rewritten_payloads = payloads; + } + if log::log_enabled!(log::Level::Debug) { - let origin_before = html.matches(ctx.origin_host).count(); + let origin_count_before: usize = rewritten_payloads + .iter() + .map(|p| p.matches(ctx.origin_host).count()) + .sum(); log::debug!( - "NextJs post-processor running: html_len={}, origin_matches={}, origin={}, proxy={}://{}", - html.len(), - origin_before, + "NextJs post-processor substituting RSC payloads: scripts={}, origin_urls={}, origin={}, proxy={}://{}, html_len={}", + rewritten_payloads.len(), + origin_count_before, ctx.origin_host, ctx.request_scheme, - ctx.request_host + ctx.request_host, + html.len() + ); + } + + let (updated, replaced) = + substitute_rsc_payload_placeholders(html.as_str(), &rewritten_payloads); + + let expected = rewritten_payloads.len(); + if replaced != expected { + log::warn!( + "NextJs post-process placeholder substitution count mismatch: expected={}, replaced={}", + expected, + replaced + ); + } + + if contains_rsc_payload_placeholders(&updated) { + log::error!( + "NextJs post-process left RSC placeholders in output; attempting fallback substitution (scripts={})", + expected ); + + let fallback = + substitute_rsc_payload_placeholders_exact(html.as_str(), &rewritten_payloads); + + if contains_rsc_payload_placeholders(&fallback) { + log::error!( + "NextJs post-process fallback substitution still left RSC placeholders in output; hydration may break (scripts={})", + expected + ); + } + + *html = fallback; + return true; } - post_process_rsc_html_in_place(html, ctx.origin_host, ctx.request_host, ctx.request_scheme) + *html = updated; + true } } -#[derive(Debug, Clone, Copy)] -struct RscPushScriptRange { - payload_start: usize, - payload_end: usize, +fn contains_rsc_payload_placeholders(html: &str) -> bool { + let mut cursor = 0usize; + while let Some(next) = html[cursor..].find(RSC_PAYLOAD_PLACEHOLDER_PREFIX) { + let start = cursor + next; + let after_prefix = start + RSC_PAYLOAD_PLACEHOLDER_PREFIX.len(); + let mut idx_end = after_prefix; + while idx_end < html.len() && html.as_bytes()[idx_end].is_ascii_digit() { + idx_end += 1; + } + if idx_end > after_prefix && html[idx_end..].starts_with(RSC_PAYLOAD_PLACEHOLDER_SUFFIX) { + return true; + } + cursor = after_prefix; + } + false } -fn find_rsc_push_scripts(html: &str) -> Vec { - let mut scripts = Vec::new(); - let mut search_pos = 0; +fn substitute_rsc_payload_placeholders(html: &str, replacements: &[String]) -> (String, usize) { + let mut output = String::with_capacity(html.len()); + let mut cursor = 0usize; + let mut replaced = 0usize; - while search_pos < html.len() { - let Some(cap) = RSC_SCRIPT_PATTERN.captures(&html[search_pos..]) else { - break; - }; + while let Some(next) = html[cursor..].find(RSC_PAYLOAD_PLACEHOLDER_PREFIX) { + let start = cursor + next; + output.push_str(&html[cursor..start]); - let quote_match = cap.get(1).expect("script regex should capture quote"); - let quote = quote_match - .as_str() - .chars() - .next() - .expect("quote should exist"); - let payload_start = search_pos + quote_match.end(); - - let mut i = payload_start; - let bytes = html.as_bytes(); - while i < bytes.len() { - if bytes[i] == b'\\' && i + 1 < bytes.len() { - i += 2; // Skip escape sequence (safe: we checked i+1 exists) - } else if bytes[i] == b'\\' { - // Trailing backslash at end of content - malformed - break; - } else if bytes[i] == quote as u8 { - break; - } else { - i += 1; - } + let after_prefix = start + RSC_PAYLOAD_PLACEHOLDER_PREFIX.len(); + let mut idx_end = after_prefix; + while idx_end < html.len() && html.as_bytes()[idx_end].is_ascii_digit() { + idx_end += 1; } - if i >= bytes.len() || bytes[i] != quote as u8 { - search_pos = payload_start; + let suffix_ok = + idx_end > after_prefix && html[idx_end..].starts_with(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + if !suffix_ok { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + cursor = after_prefix; continue; } - let after_quote = &html[i + 1..]; - let Some(ending_match) = RSC_SCRIPT_ENDING.find(after_quote) else { - search_pos = payload_start; + let idx_str = &html[after_prefix..idx_end]; + let Ok(index) = idx_str.parse::() else { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + output.push_str(idx_str); + output.push_str(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); continue; }; - let payload_end = i; - let script_end = i + 1 + ending_match.end(); + let Some(replacement) = replacements.get(index) else { + output.push_str(RSC_PAYLOAD_PLACEHOLDER_PREFIX); + output.push_str(idx_str); + output.push_str(RSC_PAYLOAD_PLACEHOLDER_SUFFIX); + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); + continue; + }; + + output.push_str(replacement); + replaced += 1; + cursor = idx_end + RSC_PAYLOAD_PLACEHOLDER_SUFFIX.len(); + } + + output.push_str(&html[cursor..]); + (output, replaced) +} - scripts.push(RscPushScriptRange { - payload_start, - payload_end, - }); +fn substitute_rsc_payload_placeholders_exact(html: &str, replacements: &[String]) -> String { + let mut out = html.to_string(); + for (index, replacement) in replacements.iter().enumerate() { + let placeholder = + format!("{RSC_PAYLOAD_PLACEHOLDER_PREFIX}{index}{RSC_PAYLOAD_PLACEHOLDER_SUFFIX}"); + out = out.replace(&placeholder, replacement); + } + out +} + +#[derive(Debug, Clone, Copy)] +struct RscPushScriptRange { + payload_start: usize, + payload_end: usize, +} + +fn find_rsc_push_scripts(html: &str) -> Vec { + if !html.contains("__next_f") { + return Vec::new(); + } + + let ranges: Rc>> = Rc::new(RefCell::new(Vec::new())); + let buffer: Rc> = Rc::new(RefCell::new(String::new())); + let buffering = Rc::new(Cell::new(false)); + let buffer_start = Rc::new(Cell::new(0usize)); + + let settings = RewriterSettings { + element_content_handlers: vec![text!("script", { + let ranges = Rc::clone(&ranges); + let buffer = Rc::clone(&buffer); + let buffering = Rc::clone(&buffering); + let buffer_start = Rc::clone(&buffer_start); + move |t| { + if !buffering.get() && t.last_in_text_node() { + let script = t.as_str(); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(script) + else { + return Ok(()); + }; + + let loc = t.source_location().bytes(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: loc.start + payload_start_rel, + payload_end: loc.start + payload_end_rel, + }); + return Ok(()); + } + + if !buffering.get() { + buffering.set(true); + buffer_start.set(t.source_location().bytes().start); + } + buffer.borrow_mut().push_str(t.as_str()); + + if !t.last_in_text_node() { + return Ok(()); + } + + buffering.set(false); + let script = std::mem::take(&mut *buffer.borrow_mut()); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(&script) + else { + return Ok(()); + }; + + let base = buffer_start.get(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: base + payload_start_rel, + payload_end: base + payload_end_rel, + }); + + Ok(()) + } + })], + ..RewriterSettings::default() + }; - search_pos = script_end; + let mut rewriter = lol_html::HtmlRewriter::new(settings, |_chunk: &[u8]| {}); + if rewriter.write(html.as_bytes()).is_err() || rewriter.end().is_err() { + return Vec::new(); } - scripts + let result = std::mem::take(&mut *ranges.borrow_mut()); + result } +/// Rewrite RSC payload URLs in HTML by re-parsing the document. +/// +/// # Deprecation +/// +/// This function is **deprecated** in favor of the placeholder-based approach used in production: +/// - `NextJsRscPlaceholderRewriter` captures payloads during the initial `lol_html` pass +/// - `NextJsHtmlPostProcessor` rewrites and substitutes them at end-of-document +/// +/// This function re-parses HTML with `lol_html`, which is slower than the placeholder approach. +/// It remains available for testing and backward compatibility. +#[deprecated( + since = "0.1.0", + note = "Use NextJsHtmlPostProcessor for production RSC rewriting. This function re-parses HTML." +)] pub fn post_process_rsc_html( html: &str, origin_host: &str, @@ -128,63 +316,141 @@ pub fn post_process_rsc_html( request_scheme: &str, ) -> String { let mut result = html.to_string(); + #[allow(deprecated)] post_process_rsc_html_in_place(&mut result, origin_host, request_host, request_scheme); result } +/// Rewrite RSC payload URLs in HTML in place by re-parsing the document. +/// +/// # Deprecation +/// +/// This function is **deprecated** in favor of the placeholder-based approach used in production. +/// See [`post_process_rsc_html`] for details. +#[deprecated( + since = "0.1.0", + note = "Use NextJsHtmlPostProcessor for production RSC rewriting. This function re-parses HTML." +)] pub fn post_process_rsc_html_in_place( html: &mut String, origin_host: &str, request_host: &str, request_scheme: &str, ) -> bool { - let scripts = find_rsc_push_scripts(html.as_str()); + post_process_rsc_html_in_place_with_limit( + html, + origin_host, + request_host, + request_scheme, + super::rsc::DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, + ) +} + +fn post_process_rsc_html_in_place_with_limit( + html: &mut String, + origin_host: &str, + request_host: &str, + request_scheme: &str, + max_combined_payload_bytes: usize, +) -> bool { + let mut scripts = find_rsc_push_scripts(html.as_str()); if scripts.is_empty() { return false; } - let payloads: Vec<&str> = scripts - .iter() - .map(|s| &html[s.payload_start..s.payload_end]) - .collect(); - - if !payloads.iter().any(|p| p.contains(origin_host)) { - return false; + scripts.sort_by_key(|s| s.payload_start); + let mut previous_end = 0usize; + for script in &scripts { + if script.payload_start > script.payload_end { + log::warn!( + "NextJs post-process skipping due to invalid payload range: start={}, end={}", + script.payload_start, + script.payload_end + ); + return false; + } + if script.payload_end > html.len() + || !html.is_char_boundary(script.payload_start) + || !html.is_char_boundary(script.payload_end) + { + log::warn!( + "NextJs post-process skipping due to non-UTF8 boundary payload range: start={}, end={}, html_len={}", + script.payload_start, + script.payload_end, + html.len() + ); + return false; + } + if script.payload_start < previous_end { + log::warn!( + "NextJs post-process skipping due to overlapping payload ranges: prev_end={}, start={}, end={}", + previous_end, + script.payload_start, + script.payload_end + ); + return false; + } + previous_end = script.payload_end; } - if log::log_enabled!(log::Level::Debug) { - let origin_count_before: usize = payloads + let rewritten_payloads = { + let Some(payloads) = scripts .iter() - .map(|p| p.matches(origin_host).count()) - .sum(); - log::debug!( - "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", - payloads.len(), - origin_count_before, + .map(|s| html.get(s.payload_start..s.payload_end)) + .collect::>>() + else { + log::warn!( + "NextJs post-process skipping due to invalid UTF-8 payload slicing despite boundary checks" + ); + return false; + }; + + if !payloads.iter().any(|p| p.contains(origin_host)) { + return false; + } + + if log::log_enabled!(log::Level::Debug) { + let origin_count_before: usize = payloads + .iter() + .map(|p| p.matches(origin_host).count()) + .sum(); + log::debug!( + "post_process_rsc_html: {} scripts, {} origin URLs, origin={}, proxy={}://{}", + payloads.len(), + origin_count_before, + origin_host, + request_scheme, + request_host + ); + } + + let rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( + payloads.as_slice(), origin_host, + request_host, request_scheme, - request_host + max_combined_payload_bytes, ); - } - let rewritten_payloads = rewrite_rsc_scripts_combined( - payloads.as_slice(), - origin_host, - request_host, - request_scheme, - ); + if rewritten_payloads.len() != payloads.len() { + log::warn!( + "NextJs post-process skipping due to rewrite payload count mismatch: original={}, rewritten={}", + payloads.len(), + rewritten_payloads.len() + ); + return false; + } - let mut changed = false; - for (i, original) in payloads.iter().enumerate() { - if rewritten_payloads[i] != *original { - changed = true; - break; + let changed = payloads + .iter() + .zip(&rewritten_payloads) + .any(|(original, rewritten)| *original != rewritten); + if !changed { + return false; } - } - if !changed { - return false; - } + rewritten_payloads + }; for (i, script) in scripts.iter().enumerate().rev() { html.replace_range( @@ -197,13 +463,109 @@ pub fn post_process_rsc_html_in_place( } #[cfg(test)] +#[allow(deprecated)] // Tests use deprecated post_process_rsc_html for legacy API coverage mod tests { use super::*; + fn find_rsc_push_scripts_chunked( + html: &str, + chunk_size: usize, + ) -> (Vec, bool) { + if !html.contains("__next_f") { + return (Vec::new(), false); + } + + let ranges: Rc>> = Rc::new(RefCell::new(Vec::new())); + let buffer: Rc> = Rc::new(RefCell::new(String::new())); + let buffering = Rc::new(Cell::new(false)); + let buffer_start = Rc::new(Cell::new(0usize)); + let saw_partial = Rc::new(Cell::new(false)); + + let settings = RewriterSettings { + element_content_handlers: vec![text!("script", { + let ranges = Rc::clone(&ranges); + let buffer = Rc::clone(&buffer); + let buffering = Rc::clone(&buffering); + let buffer_start = Rc::clone(&buffer_start); + let saw_partial = Rc::clone(&saw_partial); + move |t| { + if !t.last_in_text_node() { + saw_partial.set(true); + } + + if !buffering.get() && t.last_in_text_node() { + let script = t.as_str(); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(script) + else { + return Ok(()); + }; + + let loc = t.source_location().bytes(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: loc.start + payload_start_rel, + payload_end: loc.start + payload_end_rel, + }); + return Ok(()); + } + + if !buffering.get() { + buffering.set(true); + buffer_start.set(t.source_location().bytes().start); + } + buffer.borrow_mut().push_str(t.as_str()); + + if !t.last_in_text_node() { + return Ok(()); + } + + buffering.set(false); + let script = std::mem::take(&mut *buffer.borrow_mut()); + if !script.contains("__next_f") { + return Ok(()); + } + + let Some((payload_start_rel, payload_end_rel)) = + find_rsc_push_payload_range(&script) + else { + return Ok(()); + }; + + let base = buffer_start.get(); + ranges.borrow_mut().push(RscPushScriptRange { + payload_start: base + payload_start_rel, + payload_end: base + payload_end_rel, + }); + + Ok(()) + } + })], + ..RewriterSettings::default() + }; + + let mut rewriter = lol_html::HtmlRewriter::new(settings, |_chunk: &[u8]| {}); + let chunk_size = chunk_size.max(1); + for chunk in html.as_bytes().chunks(chunk_size) { + if rewriter.write(chunk).is_err() { + return (Vec::new(), saw_partial.get()); + } + } + if rewriter.end().is_err() { + return (Vec::new(), saw_partial.get()); + } + + let result = std::mem::take(&mut *ranges.borrow_mut()); + (result, saw_partial.get()) + } + #[test] fn post_process_rsc_html_rewrites_cross_script_tchunks() { let html = r#" - + "#; @@ -223,6 +585,64 @@ mod tests { assert!(result.contains("self.__next_f.push")); } + #[test] + fn finds_rsc_push_scripts_with_fragmented_script_text_chunks() { + let filler = "a".repeat(32 * 1024); + let payload = format!("{filler} https://origin.example.com/page"); + let html = format!( + r#""# + ); + + let (scripts, saw_partial) = find_rsc_push_scripts_chunked(&html, 64); + + assert!( + saw_partial, + "should observe fragmented script text chunks when writing input in small pieces" + ); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + + let extracted = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!( + extracted.len(), + payload.len(), + "Extracted payload length should match the original payload" + ); + assert!( + extracted.ends_with("https://origin.example.com/page"), + "Extracted payload should contain the origin URL" + ); + } + + #[test] + fn finds_assignment_push_form() { + let html = r#""#; + let scripts = find_rsc_push_scripts(html); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + let payload = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!(payload, "payload", "Should capture the payload string"); + } + + #[test] + fn finds_window_next_f_push_with_case_insensitive_script_tags() { + let html = r#""#; + let scripts = find_rsc_push_scripts(html); + assert_eq!( + scripts.len(), + 1, + "Should find exactly one RSC payload script" + ); + let payload = &html[scripts[0].payload_start..scripts[0].payload_end]; + assert_eq!(payload, "payload", "Should capture the payload string"); + } + #[test] fn post_process_rsc_html_handles_prettified_format() { let html = r#" @@ -263,16 +683,92 @@ mod tests { #[test] fn post_process_rewrites_html_href_inside_tchunk() { - let html = r#" + fn calculate_unescaped_byte_length_for_test(s: &str) -> usize { + let bytes = s.as_bytes(); + let mut pos = 0usize; + let mut count = 0usize; + + while pos < bytes.len() { + if bytes[pos] == b'\\' && pos + 1 < bytes.len() { + let esc = bytes[pos + 1]; + + if matches!( + esc, + b'n' | b'r' | b't' | b'b' | b'f' | b'v' | b'"' | b'\'' | b'\\' | b'/' + ) { + pos += 2; + count += 1; + continue; + } + + if esc == b'x' && pos + 3 < bytes.len() { + pos += 4; + count += 1; + continue; + } + + if esc == b'u' && pos + 5 < bytes.len() { + let hex = &s[pos + 2..pos + 6]; + if hex.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit) = u16::from_str_radix(hex, 16) { + // Surrogate pairs use UTF-16 and expand to 4 bytes in UTF-8. + if (0xD800..=0xDBFF).contains(&code_unit) + && pos + 11 < bytes.len() + && bytes[pos + 6] == b'\\' + && bytes[pos + 7] == b'u' + { + let hex2 = &s[pos + 8..pos + 12]; + if hex2.chars().all(|c| c.is_ascii_hexdigit()) { + if let Ok(code_unit2) = u16::from_str_radix(hex2, 16) { + if (0xDC00..=0xDFFF).contains(&code_unit2) { + pos += 12; + count += 4; + continue; + } + } + } + } + + let c = char::from_u32(code_unit as u32).unwrap_or('\u{FFFD}'); + pos += 6; + count += c.len_utf8(); + continue; + } + } + } + } + + if bytes[pos] < 0x80 { + pos += 1; + count += 1; + } else { + let c = s[pos..].chars().next().unwrap_or('\u{FFFD}'); + pos += c.len_utf8(); + count += c.len_utf8(); + } + } + + count + } + + let tchunk_content = r#"\u003cdiv\u003e\u003ca href="https://origin.example.com/about-us"\u003eAbout\u003c/a\u003e\u003c/div\u003e"#; + let declared_len_hex = format!( + "{:x}", + calculate_unescaped_byte_length_for_test(tchunk_content) + ); + let html = format!( + r#" -"#; +"# + ); - let result = post_process_rsc_html(html, "origin.example.com", "test.example.com", "https"); + let result = + post_process_rsc_html(&html, "origin.example.com", "test.example.com", "https"); assert!( result.contains("test.example.com/about-us"), @@ -285,8 +781,8 @@ mod tests { result ); assert!( - !result.contains(":T4d9,"), - "T-chunk length should have been recalculated (original was 4d9). Got: {}", + !result.contains(&format!(":T{declared_len_hex},")), + "T-chunk length should have been recalculated. Got: {}", result ); } diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs index b3fc41f..9a3648e 100644 --- a/crates/common/src/integrations/nextjs/mod.rs +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -10,13 +10,19 @@ const NEXTJS_INTEGRATION_ID: &str = "nextjs"; mod html_post_process; mod rsc; +mod rsc_placeholders; mod script_rewriter; +mod shared; +// Re-export deprecated legacy functions for backward compatibility. +// Production code should use the placeholder-based approach via NextJsHtmlPostProcessor. +#[allow(deprecated)] pub use html_post_process::{post_process_rsc_html, post_process_rsc_html_in_place}; pub use rsc::rewrite_rsc_scripts_combined; use html_post_process::NextJsHtmlPostProcessor; -use script_rewriter::{NextJsRewriteMode, NextJsScriptRewriter}; +use rsc_placeholders::NextJsRscPlaceholderRewriter; +use script_rewriter::NextJsNextDataRewriter; #[derive(Debug, Clone, Deserialize, Serialize, Validate)] pub struct NextJsIntegrationConfig { @@ -28,6 +34,8 @@ pub struct NextJsIntegrationConfig { )] #[validate(length(min = 1))] pub rewrite_attributes: Vec, + #[serde(default = "default_max_combined_payload_bytes")] + pub max_combined_payload_bytes: usize, } impl IntegrationConfig for NextJsIntegrationConfig { @@ -44,13 +52,18 @@ fn default_rewrite_attributes() -> Vec { vec!["href".to_string(), "link".to_string(), "url".to_string()] } +fn default_max_combined_payload_bytes() -> usize { + 10 * 1024 * 1024 +} + pub fn register(settings: &Settings) -> Option { let config = match build(settings) { Some(config) => { log::info!( - "NextJS integration registered: enabled={}, rewrite_attributes={:?}", + "NextJS integration registered: enabled={}, rewrite_attributes={:?}, max_combined_payload_bytes={}", config.enabled, - config.rewrite_attributes + config.rewrite_attributes, + config.max_combined_payload_bytes ); config } @@ -60,29 +73,22 @@ pub fn register(settings: &Settings) -> Option { } }; - // Register both structured (Pages Router __NEXT_DATA__) and streamed (App Router RSC) - // rewriters. RSC payloads require LENGTH-PRESERVING URL replacement to avoid breaking - // React hydration - the RSC format uses byte positions for record boundaries. - let structured = Arc::new(NextJsScriptRewriter::new( - config.clone(), - NextJsRewriteMode::Structured, - )); + // Register a structured (Pages Router __NEXT_DATA__) rewriter. + let structured = Arc::new(NextJsNextDataRewriter::new(config.clone())); - let streamed = Arc::new(NextJsScriptRewriter::new( - config.clone(), - NextJsRewriteMode::Streamed, - )); + // Insert placeholders for App Router RSC payload scripts during the initial HTML rewrite pass, + // then substitute them during post-processing without re-parsing HTML. + let placeholders = Arc::new(NextJsRscPlaceholderRewriter::new(config.clone())); // Register post-processor for cross-script RSC T-chunks - let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config)); - - Some( - IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) - .with_script_rewriter(structured) - .with_script_rewriter(streamed) - .with_html_post_processor(post_processor) - .build(), - ) + let post_processor = Arc::new(NextJsHtmlPostProcessor::new(config.clone())); + + let builder = IntegrationRegistration::builder(NEXTJS_INTEGRATION_ID) + .with_script_rewriter(structured) + .with_script_rewriter(placeholders) + .with_html_post_processor(post_processor); + + Some(builder.build()) } fn build(settings: &Settings) -> Option> { @@ -95,6 +101,7 @@ fn build(settings: &Settings) -> Option> { #[cfg(test)] mod tests { + use super::rsc_placeholders::RSC_PAYLOAD_PLACEHOLDER_PREFIX; use super::*; use crate::html_processor::{create_html_processor, HtmlProcessorConfig}; use crate::integrations::IntegrationRegistry; @@ -221,6 +228,11 @@ mod tests { "RSC stream payloads should be rewritten to proxy host via post-processing. Output: {}", final_html ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); } #[test] @@ -264,6 +276,66 @@ mod tests { "RSC stream payloads should be rewritten to proxy host with chunked input. Output: {}", final_html ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); + } + + #[test] + fn html_processor_respects_max_combined_payload_bytes() { + // When the combined payload size exceeds `max_combined_payload_bytes` and the document + // contains cross-script T-chunks, we skip post-processing to avoid breaking hydration. + let html = r#" + + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["href", "link", "url"], + "max_combined_payload_bytes": 1, + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 8192, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + + let final_html = String::from_utf8_lossy(&output); + + assert!( + final_html.contains("https://origin.example.com/page"), + "Origin URL should remain when rewrite is skipped due to size limit. Output: {}", + final_html + ); + assert!( + !final_html.contains("test.example.com"), + "Proxy host should not be introduced when rewrite is skipped. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); } #[test] @@ -341,5 +413,10 @@ mod tests { "RSC record separator \\n should be preserved. Output: {}", final_html ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "RSC placeholder markers should not appear in final HTML. Output: {}", + final_html + ); } } diff --git a/crates/common/src/integrations/nextjs/rsc.rs b/crates/common/src/integrations/nextjs/rsc.rs index 885911e..534c912 100644 --- a/crates/common/src/integrations/nextjs/rsc.rs +++ b/crates/common/src/integrations/nextjs/rsc.rs @@ -1,7 +1,7 @@ -use std::borrow::Cow; - use once_cell::sync::Lazy; -use regex::{escape, Regex}; +use regex::Regex; + +use super::shared::RscUrlRewriter; /// T-chunk header pattern: hex_id:Thex_length, static TCHUNK_PATTERN: Lazy = @@ -10,9 +10,13 @@ static TCHUNK_PATTERN: Lazy = /// Marker used to track script boundaries when combining RSC content. pub(crate) const RSC_MARKER: &str = "\x00SPLIT\x00"; -/// Maximum combined payload size for cross-script processing (10 MB). -/// Payloads exceeding this limit are processed individually without cross-script T-chunk handling. -const MAX_COMBINED_PAYLOAD_SIZE: usize = 10 * 1024 * 1024; +/// Default maximum combined payload size for cross-script processing (10 MB). +pub(crate) const DEFAULT_MAX_COMBINED_PAYLOAD_BYTES: usize = 10 * 1024 * 1024; + +/// Maximum reasonable T-chunk length to prevent DoS from malformed input (100 MB). +/// A T-chunk larger than this is almost certainly malformed and would cause excessive +/// memory allocation or iteration. +const MAX_REASONABLE_TCHUNK_LENGTH: usize = 100 * 1024 * 1024; // ============================================================================= // Escape Sequence Parsing @@ -184,10 +188,10 @@ fn consume_unescaped_bytes(s: &str, start_pos: usize, byte_count: usize) -> (usi /// Information about a T-chunk found in the combined RSC content. struct TChunkInfo { - /// The chunk ID (hex string like "1a", "443"). - id: String, /// Position where the T-chunk header starts (e.g., position of "1a:T..."). match_start: usize, + /// Position right after the chunk ID (position of ":T"). + id_end: usize, /// Position right after the comma (where content begins). header_end: usize, /// Position where the content ends. @@ -195,7 +199,7 @@ struct TChunkInfo { } /// Find all T-chunks in content, optionally skipping markers. -fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { +fn find_tchunks_impl(content: &str, skip_markers: bool) -> Option> { let mut chunks = Vec::new(); let mut search_pos = 0; let marker = if skip_markers { @@ -210,13 +214,12 @@ fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { let match_start = search_pos + m.start(); let header_end = search_pos + m.end(); - let id = cap - .get(1) - .expect("T-chunk id should exist") - .as_str() - .to_string(); + let id_match = cap.get(1).expect("T-chunk id should exist"); + let id_end = search_pos + id_match.end(); let length_hex = cap.get(2).expect("T-chunk length should exist").as_str(); - let declared_length = usize::from_str_radix(length_hex, 16).unwrap_or(0); + let declared_length = usize::from_str_radix(length_hex, 16) + .ok() + .filter(|&len| len <= MAX_REASONABLE_TCHUNK_LENGTH)?; let content_end = if let Some(marker_bytes) = marker { let mut iter = EscapeSequenceIter::from_position_with_marker( @@ -231,15 +234,21 @@ fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { None => break, } } + if consumed < declared_length { + return None; + } iter.position() } else { - let (pos, _) = consume_unescaped_bytes(content, header_end, declared_length); + let (pos, consumed) = consume_unescaped_bytes(content, header_end, declared_length); + if consumed < declared_length { + return None; + } pos }; chunks.push(TChunkInfo { - id, match_start, + id_end, header_end, content_end, }); @@ -250,92 +259,17 @@ fn find_tchunks_impl(content: &str, skip_markers: bool) -> Vec { } } - chunks + Some(chunks) } -fn find_tchunks(content: &str) -> Vec { +fn find_tchunks(content: &str) -> Option> { find_tchunks_impl(content, false) } -fn find_tchunks_with_markers(content: &str) -> Vec { +fn find_tchunks_with_markers(content: &str) -> Option> { find_tchunks_impl(content, true) } -// ============================================================================= -// URL rewriting (cached per call) -// ============================================================================= - -/// Rewriter for RSC payload URL patterns. -/// -/// This is constructed per document / payload rewrite so that the origin-host-dependent regex is -/// compiled once, then reused across multiple calls. -pub(crate) struct RscUrlRewriter { - origin_host: String, - request_host: String, - request_scheme: String, - pattern: Regex, -} - -impl RscUrlRewriter { - pub(crate) fn new(origin_host: &str, request_host: &str, request_scheme: &str) -> Self { - let escaped_origin = escape(origin_host); - - // Match: - // - https://origin_host or http://origin_host - // - //origin_host (protocol-relative) - // - escaped variants inside JSON-in-JS strings (e.g., \/\/origin_host) - let pattern = Regex::new(&format!( - r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, - escaped_origin - )) - .expect("valid RSC URL rewrite regex"); - - Self { - origin_host: origin_host.to_string(), - request_host: request_host.to_string(), - request_scheme: request_scheme.to_string(), - pattern, - } - } - - pub(crate) fn rewrite<'a>(&self, input: &'a str) -> Cow<'a, str> { - if !input.contains(&self.origin_host) { - return Cow::Borrowed(input); - } - - // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) - let replaced = self - .pattern - .replace_all(input, |caps: ®ex::Captures<'_>| { - let slashes = caps.get(3).map_or("//", |m| m.as_str()); - if caps.get(1).is_some() { - format!("{}:{}{}", self.request_scheme, slashes, self.request_host) - } else { - format!("{}{}", slashes, self.request_host) - } - }); - - // Phase 2: Handle bare host occurrences not matched by the URL regex - // (e.g., `siteProductionDomain`). Only check if regex made no changes, - // because if it did, we already know origin_host was present. - let text = match &replaced { - Cow::Borrowed(s) => *s, - Cow::Owned(s) => s.as_str(), - }; - - if !text.contains(&self.origin_host) { - return replaced; - } - - // Bare host replacement needed - Cow::Owned(text.replace(&self.origin_host, &self.request_host)) - } - - pub(crate) fn rewrite_to_string(&self, input: &str) -> String { - self.rewrite(input).into_owned() - } -} - // ============================================================================= // Single-script T-chunk processing // ============================================================================= @@ -344,7 +278,12 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( content: &str, rewriter: &RscUrlRewriter, ) -> String { - let chunks = find_tchunks(content); + let Some(chunks) = find_tchunks(content) else { + log::warn!( + "RSC payload contains invalid or incomplete T-chunks; skipping rewriting to avoid breaking hydration" + ); + return content.to_string(); + }; if chunks.is_empty() { return rewriter.rewrite_to_string(content); @@ -363,7 +302,7 @@ pub(crate) fn rewrite_rsc_tchunks_with_rewriter( let new_length = calculate_unescaped_byte_length(&rewritten_content); let new_length_hex = format!("{new_length:x}"); - result.push_str(&chunk.id); + result.push_str(&content[chunk.match_start..chunk.id_end]); result.push_str(":T"); result.push_str(&new_length_hex); result.push(','); @@ -394,29 +333,97 @@ pub fn rewrite_rsc_scripts_combined( origin_host: &str, request_host: &str, request_scheme: &str, +) -> Vec { + rewrite_rsc_scripts_combined_with_limit( + payloads, + origin_host, + request_host, + request_scheme, + DEFAULT_MAX_COMBINED_PAYLOAD_BYTES, + ) +} + +fn payload_contains_incomplete_tchunk(payload: &str) -> bool { + let mut search_pos = 0; + while search_pos < payload.len() { + let Some(cap) = TCHUNK_PATTERN.captures(&payload[search_pos..]) else { + break; + }; + + let m = cap.get(0).expect("T-chunk match should exist"); + let header_end = search_pos + m.end(); + + let length_hex = cap.get(2).expect("T-chunk length should exist").as_str(); + let Some(declared_length) = usize::from_str_radix(length_hex, 16) + .ok() + .filter(|&len| len <= MAX_REASONABLE_TCHUNK_LENGTH) + else { + return true; + }; + + let (pos, consumed) = consume_unescaped_bytes(payload, header_end, declared_length); + if consumed < declared_length { + return true; + } + + search_pos = pos; + } + + false +} + +pub(crate) fn rewrite_rsc_scripts_combined_with_limit( + payloads: &[&str], + origin_host: &str, + request_host: &str, + request_scheme: &str, + max_combined_payload_bytes: usize, ) -> Vec { if payloads.is_empty() { return Vec::new(); } + // Early exit if no payload contains the origin host - avoids regex compilation + if !payloads.iter().any(|p| p.contains(origin_host)) { + return payloads.iter().map(|p| (*p).to_string()).collect(); + } + let rewriter = RscUrlRewriter::new(origin_host, request_host, request_scheme); if payloads.len() == 1 { return vec![rewrite_rsc_tchunks_with_rewriter(payloads[0], &rewriter)]; } + let max_combined_payload_bytes = if max_combined_payload_bytes == 0 { + DEFAULT_MAX_COMBINED_PAYLOAD_BYTES + } else { + max_combined_payload_bytes + }; + // Check total size before allocating combined buffer let total_size: usize = payloads.iter().map(|p| p.len()).sum::() + (payloads.len() - 1) * RSC_MARKER.len(); - if total_size > MAX_COMBINED_PAYLOAD_SIZE { - // Fall back to individual processing if combined size is too large. - // This sacrifices cross-script T-chunk correctness for memory safety. + if total_size > max_combined_payload_bytes { + // Avoid allocating a large combined buffer. If the payloads contain cross-script T-chunks, + // per-script rewriting is unsafe because it may rewrite T-chunk content without updating + // the original header, breaking React hydration. log::warn!( - "RSC combined payload size {} exceeds limit {}, processing individually", + "RSC combined payload size {} exceeds limit {}, skipping cross-script combining", total_size, - MAX_COMBINED_PAYLOAD_SIZE + max_combined_payload_bytes ); + + if payloads + .iter() + .any(|p| payload_contains_incomplete_tchunk(p)) + { + log::warn!( + "RSC payloads contain cross-script T-chunks; skipping RSC URL rewriting to avoid breaking hydration (consider increasing integrations.nextjs.max_combined_payload_bytes)" + ); + return payloads.iter().map(|p| (*p).to_string()).collect(); + } + return payloads .iter() .map(|p| rewrite_rsc_tchunks_with_rewriter(p, &rewriter)) @@ -430,7 +437,12 @@ pub fn rewrite_rsc_scripts_combined( combined.push_str(payload); } - let chunks = find_tchunks_with_markers(&combined); + let Some(chunks) = find_tchunks_with_markers(&combined) else { + log::warn!( + "RSC combined payload contains invalid or incomplete T-chunks; skipping rewriting to avoid breaking hydration" + ); + return payloads.iter().map(|p| (*p).to_string()).collect(); + }; if chunks.is_empty() { return payloads .iter() @@ -451,7 +463,7 @@ pub fn rewrite_rsc_scripts_combined( let new_length = calculate_unescaped_byte_length_skip_markers(&rewritten_content); let new_length_hex = format!("{new_length:x}"); - result.push_str(&chunk.id); + result.push_str(&combined[chunk.match_start..chunk.id_end]); result.push_str(":T"); result.push_str(&new_length_hex); result.push(','); @@ -532,7 +544,7 @@ mod tests { #[test] fn cross_script_tchunk_rewriting() { - let script0 = r#"other:data\n1a:T40,partial content"#; + let script0 = r#"other:data\n1a:T3e,partial content"#; let script1 = r#" with https://origin.example.com/page goes here"#; let combined_content = "partial content with https://origin.example.com/page goes here"; @@ -570,7 +582,7 @@ mod tests { #[test] fn cross_script_preserves_non_tchunk_content() { - let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T40,partial"#; + let script0 = r#"{"url":"https://origin.example.com/first"}\n1a:T38,partial"#; let script1 = r#" content with https://origin.example.com/page end"#; let payloads: Vec<&str> = vec![script0, script1]; @@ -618,6 +630,34 @@ mod tests { ); } + #[test] + fn bare_host_rewrite_respects_hostname_boundaries() { + let input = r#"{"sub":"cdn.origin.example.com","prefix":"notorigin.example.com","suffix":"origin.example.com.uk","path":"origin.example.com/news","exact":"origin.example.com"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let rewritten = rewriter.rewrite_to_string(input); + + assert!( + rewritten.contains(r#""sub":"cdn.origin.example.com""#), + "Subdomain should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""prefix":"notorigin.example.com""#), + "Prefix substring should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""suffix":"origin.example.com.uk""#), + "Suffix domain should not be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""path":"proxy.example.com/news""#), + "Bare host with path should be rewritten. Got: {rewritten}" + ); + assert!( + rewritten.contains(r#""exact":"proxy.example.com""#), + "Exact bare host should be rewritten. Got: {rewritten}" + ); + } + #[test] fn single_payload_bypasses_combining() { // When there's only one payload, we should process it directly without combining @@ -693,4 +733,90 @@ mod tests { results[1] ); } + + #[test] + fn size_limit_skips_rewrite_when_cross_script_tchunk_detected() { + let script0 = r#"other:data\n1a:T40,partial content"#; + let script1 = r#" with https://origin.example.com/page goes here"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined_with_limit( + &payloads, + "origin.example.com", + "test.example.com", + "https", + 1, + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert_eq!( + results[0], script0, + "Cross-script payload should remain unchanged when size limit is exceeded" + ); + assert_eq!( + results[1], script1, + "Cross-script payload should remain unchanged when size limit is exceeded" + ); + } + + #[test] + fn size_limit_rewrites_individually_when_tchunks_are_complete() { + let script0 = r#"1a:T25,{"url":"https://origin.example.com/x"}"#; + let script1 = r#"1b:T25,{"url":"https://origin.example.com/y"}"#; + + let payloads: Vec<&str> = vec![script0, script1]; + let results = rewrite_rsc_scripts_combined_with_limit( + &payloads, + "origin.example.com", + "test.example.com", + "https", + 1, + ); + + assert_eq!(results.len(), 2, "Should return same number of scripts"); + assert!( + results[0].contains("test.example.com"), + "First payload should be rewritten. Got: {}", + results[0] + ); + assert!( + results[1].contains("test.example.com"), + "Second payload should be rewritten. Got: {}", + results[1] + ); + assert!( + results[0].contains(":T23,"), + "First payload T-chunk length should be updated. Got: {}", + results[0] + ); + assert!( + results[1].contains(":T23,"), + "Second payload T-chunk length should be updated. Got: {}", + results[1] + ); + } + + #[test] + fn invalid_or_unreasonable_tchunk_length_skips_rewriting() { + let content = r#"1a:T10000000,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert_eq!( + result, content, + "Should skip rewriting when T-chunk length is unreasonable" + ); + } + + #[test] + fn incomplete_tchunk_skips_rewriting() { + let content = r#"1a:Tff,{"url":"https://origin.example.com/path"}"#; + let rewriter = RscUrlRewriter::new("origin.example.com", "test.example.com", "https"); + let result = rewrite_rsc_tchunks_with_rewriter(content, &rewriter); + + assert_eq!( + result, content, + "Should skip rewriting when T-chunk content is incomplete" + ); + } } diff --git a/crates/common/src/integrations/nextjs/rsc_placeholders.rs b/crates/common/src/integrations/nextjs/rsc_placeholders.rs new file mode 100644 index 0000000..84ac650 --- /dev/null +++ b/crates/common/src/integrations/nextjs/rsc_placeholders.rs @@ -0,0 +1,242 @@ +use std::borrow::Cow; +use std::sync::{Arc, Mutex}; + +use crate::integrations::{ + IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, +}; + +use super::shared::find_rsc_push_payload_range; +use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; + +pub(super) const RSC_PAYLOAD_PLACEHOLDER_PREFIX: &str = "__ts_rsc_payload_"; +pub(super) const RSC_PAYLOAD_PLACEHOLDER_SUFFIX: &str = "__"; + +#[derive(Default)] +pub(super) struct NextJsRscPostProcessState { + pub(super) payloads: Vec, + buffer: String, + buffering: bool, +} + +impl NextJsRscPostProcessState { + fn buffer_chunk(&mut self, chunk: &str) { + if !self.buffering { + self.buffering = true; + self.buffer.clear(); + } + self.buffer.push_str(chunk); + } + + /// Returns the complete script content, either borrowed from input or owned from buffer. + fn take_script_or_borrow<'a>(&mut self, chunk: &'a str) -> Cow<'a, str> { + if self.buffering { + self.buffer.push_str(chunk); + self.buffering = false; + Cow::Owned(std::mem::take(&mut self.buffer)) + } else { + Cow::Borrowed(chunk) + } + } + + pub(super) fn take_payloads(&mut self) -> Vec { + self.buffer.clear(); + self.buffering = false; + std::mem::take(&mut self.payloads) + } +} + +fn rsc_payload_placeholder(index: usize) -> String { + format!("{RSC_PAYLOAD_PLACEHOLDER_PREFIX}{index}{RSC_PAYLOAD_PLACEHOLDER_SUFFIX}") +} + +pub(super) struct NextJsRscPlaceholderRewriter { + config: Arc, +} + +impl NextJsRscPlaceholderRewriter { + pub(super) fn new(config: Arc) -> Self { + Self { config } + } +} + +impl IntegrationScriptRewriter for NextJsRscPlaceholderRewriter { + fn integration_id(&self) -> &'static str { + NEXTJS_INTEGRATION_ID + } + + fn selector(&self) -> &'static str { + "script" + } + + fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { + if !self.config.enabled || self.config.rewrite_attributes.is_empty() { + return ScriptRewriteAction::keep(); + } + + if !ctx.is_last_in_text_node { + if let Some(existing) = ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + { + let mut guard = existing.lock().unwrap_or_else(|e| e.into_inner()); + if guard.buffering { + guard.buffer_chunk(content); + return ScriptRewriteAction::remove_node(); + } + } + + let trimmed = content.trim_start(); + if trimmed.starts_with('{') || trimmed.starts_with('[') { + // Avoid interfering with other inline JSON scripts (e.g. `__NEXT_DATA__`, JSON-LD). + return ScriptRewriteAction::keep(); + } + + let state = ctx + .document_state + .get_or_insert_with(NEXTJS_INTEGRATION_ID, || { + Mutex::new(NextJsRscPostProcessState::default()) + }); + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + guard.buffer_chunk(content); + return ScriptRewriteAction::remove_node(); + } + + if !content.contains("__next_f") + && ctx + .document_state + .get::>(NEXTJS_INTEGRATION_ID) + .is_none() + { + return ScriptRewriteAction::keep(); + } + + let state = ctx + .document_state + .get_or_insert_with(NEXTJS_INTEGRATION_ID, || { + Mutex::new(NextJsRscPostProcessState::default()) + }); + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + let script = guard.take_script_or_borrow(content); + let was_buffered = matches!(script, Cow::Owned(_)); + + if !script.contains("__next_f") { + if was_buffered { + return ScriptRewriteAction::replace(script.into_owned()); + } + return ScriptRewriteAction::keep(); + } + + let Some((payload_start, payload_end)) = find_rsc_push_payload_range(&script) else { + if was_buffered { + return ScriptRewriteAction::replace(script.into_owned()); + } + return ScriptRewriteAction::keep(); + }; + + if payload_start > payload_end + || payload_end > script.len() + || !script.is_char_boundary(payload_start) + || !script.is_char_boundary(payload_end) + { + if was_buffered { + return ScriptRewriteAction::replace(script.into_owned()); + } + return ScriptRewriteAction::keep(); + } + + let placeholder_index = guard.payloads.len(); + let placeholder = rsc_payload_placeholder(placeholder_index); + guard + .payloads + .push(script[payload_start..payload_end].to_string()); + + let mut rewritten = script.into_owned(); + rewritten.replace_range(payload_start..payload_end, &placeholder); + ScriptRewriteAction::replace(rewritten) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integrations::IntegrationDocumentState; + + fn ctx<'a>( + is_last_in_text_node: bool, + document_state: &'a IntegrationDocumentState, + ) -> IntegrationScriptContext<'a> { + IntegrationScriptContext { + selector: "script", + request_host: "proxy.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + is_last_in_text_node, + document_state, + } + } + + fn test_config() -> Arc { + Arc::new(NextJsIntegrationConfig { + enabled: true, + rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + max_combined_payload_bytes: 10 * 1024 * 1024, + }) + } + + #[test] + fn inserts_placeholder_and_records_payload() { + let state = IntegrationDocumentState::default(); + let rewriter = NextJsRscPlaceholderRewriter::new(test_config()); + + let script = r#"self.__next_f.push([1,"https://origin.example.com/page"])"#; + let action = rewriter.rewrite(script, &ctx(true, &state)); + + let ScriptRewriteAction::Replace(rewritten) = action else { + panic!("Expected placeholder insertion to replace script"); + }; + assert!( + rewritten.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "Rewritten script should contain placeholder. Got: {rewritten}" + ); + + let stored = state + .get::>(NEXTJS_INTEGRATION_ID) + .expect("should store RSC state"); + let guard = stored.lock().expect("should lock Next.js RSC state"); + assert_eq!(guard.payloads.len(), 1, "Should store exactly one payload"); + assert_eq!( + guard.payloads[0], "https://origin.example.com/page", + "Stored payload should match original" + ); + } + + #[test] + fn buffers_fragmented_scripts_and_emits_single_replacement() { + let state = IntegrationDocumentState::default(); + let rewriter = NextJsRscPlaceholderRewriter::new(test_config()); + + let first = "self.__next_f.push([1,\"https://origin.example.com"; + let second = "/page\"])"; + + let action_first = rewriter.rewrite(first, &ctx(false, &state)); + assert_eq!( + action_first, + ScriptRewriteAction::RemoveNode, + "Intermediate chunk should be removed" + ); + + let action_second = rewriter.rewrite(second, &ctx(true, &state)); + let ScriptRewriteAction::Replace(rewritten) = action_second else { + panic!("Final chunk should be replaced with combined output"); + }; + + assert!( + rewritten.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "Combined output should include placeholder. Got: {rewritten}" + ); + assert!( + rewritten.contains("self.__next_f.push"), + "Combined output should keep the push call. Got: {rewritten}" + ); + } +} diff --git a/crates/common/src/integrations/nextjs/script_rewriter.rs b/crates/common/src/integrations/nextjs/script_rewriter.rs index 94c21ed..4df3493 100644 --- a/crates/common/src/integrations/nextjs/script_rewriter.rs +++ b/crates/common/src/integrations/nextjs/script_rewriter.rs @@ -1,34 +1,20 @@ use std::sync::Arc; -use once_cell::sync::Lazy; use regex::{escape, Regex}; use crate::integrations::{ IntegrationScriptContext, IntegrationScriptRewriter, ScriptRewriteAction, }; -use super::rsc::{rewrite_rsc_tchunks_with_rewriter, RscUrlRewriter}; use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; -/// RSC push payload pattern for extraction. -static RSC_PUSH_PATTERN: Lazy = Lazy::new(|| { - Regex::new(r#"self\.__next_f\.push\(\[\s*1\s*,\s*(['"])"#).expect("valid RSC push regex") -}); - -#[derive(Clone, Copy)] -pub(super) enum NextJsRewriteMode { - Structured, - Streamed, -} - -pub(super) struct NextJsScriptRewriter { +pub(super) struct NextJsNextDataRewriter { config: Arc, - mode: NextJsRewriteMode, } -impl NextJsScriptRewriter { - pub(super) fn new(config: Arc, mode: NextJsRewriteMode) -> Self { - Self { config, mode } +impl NextJsNextDataRewriter { + pub(super) fn new(config: Arc) -> Self { + Self { config } } fn rewrite_structured( @@ -48,7 +34,6 @@ impl NextJsScriptRewriter { ctx.request_host, ctx.request_scheme, &self.config.rewrite_attributes, - false, // preserve_length not used for structured payloads ); if let Some(rewritten) = rewrite_nextjs_values_with_rewriter(content, &rewriter) { @@ -57,86 +42,15 @@ impl NextJsScriptRewriter { ScriptRewriteAction::keep() } } - - fn rewrite_streamed( - &self, - content: &str, - ctx: &IntegrationScriptContext<'_>, - ) -> ScriptRewriteAction { - let rsc_rewriter = - RscUrlRewriter::new(ctx.origin_host, ctx.request_host, ctx.request_scheme); - - if let Some((payload, quote, start, end)) = extract_rsc_push_payload(content) { - let rewritten_payload = rewrite_rsc_tchunks_with_rewriter(payload, &rsc_rewriter); - - if rewritten_payload != payload { - let mut result = String::with_capacity(content.len()); - result.push_str(&content[..start]); - result.push(quote); - result.push_str(&rewritten_payload); - result.push(quote); - result.push_str(&content[end + 1..]); - return ScriptRewriteAction::replace(result); - } - } - - let rewritten = rsc_rewriter.rewrite_to_string(content); - if rewritten != content { - return ScriptRewriteAction::replace(rewritten); - } - - ScriptRewriteAction::keep() - } -} - -/// Extract RSC payload from a self.__next_f.push([1, '...']) call. -/// Returns (payload_content, quote_char, start_pos, end_pos). -fn extract_rsc_push_payload(content: &str) -> Option<(&str, char, usize, usize)> { - let cap = RSC_PUSH_PATTERN.captures(content)?; - let quote_match = cap.get(1)?; - let quote = quote_match.as_str().chars().next()?; - let content_start = quote_match.end(); - - let search_from = &content[content_start..]; - let mut pos = 0; - let mut escape = false; - - for c in search_from.chars() { - if escape { - escape = false; - pos += c.len_utf8(); - continue; - } - if c == '\\' { - escape = true; - pos += 1; - continue; - } - if c == quote { - let content_end = content_start + pos; - return Some(( - &content[content_start..content_end], - quote, - content_start - 1, - content_end, - )); - } - pos += c.len_utf8(); - } - - None } -impl IntegrationScriptRewriter for NextJsScriptRewriter { +impl IntegrationScriptRewriter for NextJsNextDataRewriter { fn integration_id(&self) -> &'static str { NEXTJS_INTEGRATION_ID } fn selector(&self) -> &'static str { - match self.mode { - NextJsRewriteMode::Structured => "script#__NEXT_DATA__", - NextJsRewriteMode::Streamed => "script", - } + "script#__NEXT_DATA__" } fn rewrite(&self, content: &str, ctx: &IntegrationScriptContext<'_>) -> ScriptRewriteAction { @@ -144,18 +58,7 @@ impl IntegrationScriptRewriter for NextJsScriptRewriter { return ScriptRewriteAction::keep(); } - match self.mode { - NextJsRewriteMode::Structured => self.rewrite_structured(content, ctx), - NextJsRewriteMode::Streamed => { - if content.contains("__next_f.push") { - return ScriptRewriteAction::keep(); - } - if content.contains("__next_f") { - return self.rewrite_streamed(content, ctx); - } - ScriptRewriteAction::keep() - } - } + self.rewrite_structured(content, ctx) } } @@ -170,41 +73,30 @@ fn rewrite_nextjs_values( request_host: &str, request_scheme: &str, attributes: &[String], - preserve_length: bool, ) -> Option { if origin_host.is_empty() || request_host.is_empty() || attributes.is_empty() { return None; } - let rewriter = UrlRewriter::new( - origin_host, - request_host, - request_scheme, - attributes, - preserve_length, - ); + let rewriter = UrlRewriter::new(origin_host, request_host, request_scheme, attributes); rewrite_nextjs_values_with_rewriter(content, &rewriter) } /// Rewrites URLs in structured Next.js JSON payloads (e.g., `__NEXT_DATA__`). /// -/// This rewriter uses attribute-specific regex patterns to find and replace URLs +/// This rewriter uses combined regex patterns to find and replace URLs /// in JSON content. It handles full URLs, protocol-relative URLs, and bare hostnames. -/// -/// The `preserve_length` option adds whitespace padding to maintain byte length, -/// which was an early attempt at RSC compatibility. This is no longer needed for -/// RSC payloads (T-chunk lengths are recalculated instead), but is kept for -/// potential future use cases where length preservation is required. +/// Patterns for all attributes are combined with alternation for efficiency. struct UrlRewriter { + #[cfg_attr(not(test), allow(dead_code))] origin_host: String, request_host: String, request_scheme: String, - embedded_patterns: Vec, - bare_host_patterns: Vec, - /// When true, adds whitespace padding to maintain original byte length. - /// Currently unused in production (always false). - preserve_length: bool, + /// Single regex matching URL patterns for all attributes + embedded_pattern: Option, + /// Single regex matching bare hostname patterns for all attributes + bare_host_pattern: Option, } impl UrlRewriter { @@ -213,114 +105,90 @@ impl UrlRewriter { request_host: &str, request_scheme: &str, attributes: &[String], - preserve_length: bool, ) -> Self { let escaped_origin = escape(origin_host); - let embedded_patterns = attributes - .iter() - .map(|attr| { - let escaped_attr = escape(attr); - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, - attr = escaped_attr, - origin = escaped_origin, - ); - Regex::new(&pattern).expect("valid Next.js rewrite regex") - }) - .collect(); - - let bare_host_patterns = attributes - .iter() - .map(|attr| { - let escaped_attr = escape(attr); - let pattern = format!( - r#"(?P(?:\\*")?{attr}(?:\\*")?:\\*"){origin}(?P\\*")"#, - attr = escaped_attr, - origin = escaped_origin, - ); - Regex::new(&pattern).expect("valid Next.js bare host rewrite regex") - }) - .collect(); + // Build a single regex with alternation for all attributes + let embedded_pattern = if attributes.is_empty() { + None + } else { + let attr_alternation = attributes + .iter() + .map(|attr| escape(attr)) + .collect::>() + .join("|"); + let pattern = format!( + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*")(?Phttps?://|//){origin}(?P[^"\\]*)(?P\\*")"#, + attrs = attr_alternation, + origin = escaped_origin, + ); + Some(Regex::new(&pattern).expect("valid Next.js rewrite regex")) + }; + + let bare_host_pattern = if attributes.is_empty() { + None + } else { + let attr_alternation = attributes + .iter() + .map(|attr| escape(attr)) + .collect::>() + .join("|"); + let pattern = format!( + r#"(?P(?:\\*")?(?:{attrs})(?:\\*")?:\\*"){origin}(?P\\*")"#, + attrs = attr_alternation, + origin = escaped_origin, + ); + Some(Regex::new(&pattern).expect("valid Next.js bare host rewrite regex")) + }; Self { origin_host: origin_host.to_string(), request_host: request_host.to_string(), request_scheme: request_scheme.to_string(), - embedded_patterns, - bare_host_patterns, - preserve_length, + embedded_pattern, + bare_host_pattern, } } #[cfg(test)] - fn rewrite_url_value(&self, url: &str) -> Option<(String, String)> { - let original_len = url.len(); - - let new_url = if let Some(rest) = url.strip_prefix("https://") { + fn rewrite_url_value(&self, url: &str) -> Option { + if let Some(rest) = url.strip_prefix("https://") { if rest.starts_with(&self.origin_host) { let path = &rest[self.origin_host.len()..]; - Some(format!( + return Some(format!( "{}://{}{}", self.request_scheme, self.request_host, path - )) - } else { - None + )); } } else if let Some(rest) = url.strip_prefix("http://") { if rest.starts_with(&self.origin_host) { let path = &rest[self.origin_host.len()..]; - Some(format!( + return Some(format!( "{}://{}{}", self.request_scheme, self.request_host, path - )) - } else { - None + )); } } else if let Some(rest) = url.strip_prefix("//") { if rest.starts_with(&self.origin_host) { let path = &rest[self.origin_host.len()..]; - Some(format!("//{}{}", self.request_host, path)) - } else { - None + return Some(format!("//{}{}", self.request_host, path)); } } else if url == self.origin_host { - Some(self.request_host.clone()) + return Some(self.request_host.clone()); } else if url.starts_with(&self.origin_host) { let path = &url[self.origin_host.len()..]; - Some(format!("{}{}", self.request_host, path)) - } else { - None - }; - - new_url.map(|url| { - let padding = if self.preserve_length { - Self::calculate_padding(url.len(), original_len) - } else { - String::new() - }; - (url, padding) - }) - } - - #[cfg(test)] - fn calculate_padding(new_url_len: usize, original_len: usize) -> String { - if new_url_len >= original_len { - String::new() - } else { - " ".repeat(original_len - new_url_len) + return Some(format!("{}{}", self.request_host, path)); } + None } fn rewrite_embedded(&self, input: &str) -> Option { let mut result = input.to_string(); let mut changed = false; - for regex in &self.embedded_patterns { - let origin_host = &self.origin_host; + if let Some(regex) = &self.embedded_pattern { let request_host = &self.request_host; let request_scheme = &self.request_scheme; - let preserve_length = self.preserve_length; let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { let prefix = &caps["prefix"]; @@ -328,21 +196,13 @@ impl UrlRewriter { let path = &caps["path"]; let quote = &caps["quote"]; - let original_url_len = scheme.len() + origin_host.len() + path.len(); - let new_url = if scheme == "//" { format!("//{}{}", request_host, path) } else { format!("{}://{}{}", request_scheme, request_host, path) }; - let padding = if preserve_length && new_url.len() < original_url_len { - " ".repeat(original_url_len - new_url.len()) - } else { - String::new() - }; - - format!("{prefix}{new_url}{quote}{padding}") + format!("{prefix}{new_url}{quote}") }); if next_value != result { @@ -351,22 +211,14 @@ impl UrlRewriter { } } - for regex in &self.bare_host_patterns { - let origin_host = &self.origin_host; + if let Some(regex) = &self.bare_host_pattern { let request_host = &self.request_host; - let preserve_length = self.preserve_length; let next_value = regex.replace_all(&result, |caps: ®ex::Captures<'_>| { let prefix = &caps["prefix"]; let suffix = &caps["suffix"]; - let padding = if preserve_length && request_host.len() < origin_host.len() { - " ".repeat(origin_host.len() - request_host.len()) - } else { - String::new() - }; - - format!("{prefix}{request_host}{suffix}{padding}") + format!("{prefix}{request_host}{suffix}") }); if next_value != result { @@ -382,29 +234,37 @@ impl UrlRewriter { #[cfg(test)] mod tests { use super::*; + use crate::integrations::IntegrationDocumentState; use crate::integrations::ScriptRewriteAction; fn test_config() -> Arc { Arc::new(NextJsIntegrationConfig { enabled: true, rewrite_attributes: vec!["href".into(), "link".into(), "url".into()], + max_combined_payload_bytes: 10 * 1024 * 1024, }) } - fn ctx(selector: &'static str) -> IntegrationScriptContext<'static> { + fn ctx<'a>( + selector: &'static str, + document_state: &'a IntegrationDocumentState, + ) -> IntegrationScriptContext<'a> { IntegrationScriptContext { selector, request_host: "ts.example.com", request_scheme: "https", origin_host: "origin.example.com", + is_last_in_text_node: true, + document_state, } } #[test] fn structured_rewriter_updates_next_data_payload() { let payload = r#"{"props":{"pageProps":{"primary":{"href":"https://origin.example.com/reviews"},"secondary":{"href":"http://origin.example.com/sign-in"},"fallbackHref":"http://origin.example.com/legacy","protoRelative":"//origin.example.com/assets/logo.png"}}}"#; - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Structured); - let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__")); + let rewriter = NextJsNextDataRewriter::new(test_config()); + let document_state = IntegrationDocumentState::default(); + let result = rewriter.rewrite(payload, &ctx("script#__NEXT_DATA__", &document_state)); match result { ScriptRewriteAction::Replace(value) => { @@ -417,32 +277,6 @@ mod tests { } } - #[test] - fn streamed_rewriter_skips_non_next_payloads() { - let rewriter = NextJsScriptRewriter::new(test_config(), NextJsRewriteMode::Streamed); - - let noop = rewriter.rewrite("console.log('hello');", &ctx("script")); - assert!(matches!(noop, ScriptRewriteAction::Keep)); - - let payload = - r#"self.__next_f.push([1, "{\"href\":\"https://origin.example.com/app\"}"]);"#; - let result = rewriter.rewrite(payload, &ctx("script")); - assert!( - matches!(result, ScriptRewriteAction::Keep), - "Streamed rewriter should skip __next_f.push payloads (handled by post-processor)" - ); - - let init_script = r#"(self.__next_f = self.__next_f || []).push([0]); var url = "https://origin.example.com/api";"#; - let init_result = rewriter.rewrite(init_script, &ctx("script")); - assert!( - matches!( - init_result, - ScriptRewriteAction::Keep | ScriptRewriteAction::Replace(_) - ), - "Streamed rewriter should handle non-push __next_f scripts" - ); - } - #[test] fn rewrite_helper_handles_protocol_relative_urls() { let content = r#"{"props":{"pageProps":{"link":"//origin.example.com/image.png"}}}"#; @@ -452,7 +286,6 @@ mod tests { "ts.example.com", "https", &["link".into()], - false, ) .expect("should rewrite protocol relative link"); @@ -472,7 +305,6 @@ mod tests { "proxy.example.com", "http", &["url".into()], - true, ); assert!( @@ -494,7 +326,6 @@ mod tests { "proxy.example.com", "http", &["url".into()], - true, ) .expect("should rewrite URL"); @@ -517,7 +348,6 @@ mod tests { "proxy.example.com", "http", &["url".into()], - true, ); assert!( @@ -544,7 +374,6 @@ mod tests { "proxy.example.com", "http", &["url".into()], - true, ) .expect("should rewrite URL"); @@ -566,7 +395,6 @@ mod tests { "proxy.example.com", "http", &["url".into(), "siteProductionDomain".into()], - true, ) .expect("should rewrite URLs"); @@ -581,57 +409,17 @@ mod tests { } #[test] - fn whitespace_padding_calculation() { - let padding = UrlRewriter::calculate_padding(21, 24); - assert_eq!(padding.len(), 3, "Should need 3 spaces"); - assert_eq!(padding, " ", "Should be 3 spaces"); - - let padding = UrlRewriter::calculate_padding(24, 24); - assert_eq!(padding.len(), 0); - - let padding = UrlRewriter::calculate_padding(30, 24); - assert_eq!(padding.len(), 0); - } - - #[test] - fn whitespace_padding_rewrite() { - let rewriter = UrlRewriter::new( - "origin.example.com", - "proxy.example.com", - "http", - &["url".into()], - true, - ); - - let original_url = "https://origin.example.com/news"; - let result = rewriter - .rewrite_url_value(original_url) - .expect("URL should be rewritten"); - let (new_url, padding) = result; - - assert_eq!(new_url, "http://proxy.example.com/news"); - assert_eq!( - new_url.len() + padding.len(), - original_url.len(), - "URL + padding should equal original length" - ); - assert_eq!(padding, " ", "Should be 2 spaces"); - } - - #[test] - fn no_padding_when_disabled() { + fn url_rewriter_rewrites_url() { let rewriter = UrlRewriter::new( "origin.example.com", "proxy.example.com", "http", &["url".into()], - false, ); - let (new_url, padding) = rewriter + let new_url = rewriter .rewrite_url_value("https://origin.example.com/news") .expect("URL should be rewritten"); assert_eq!(new_url, "http://proxy.example.com/news"); - assert_eq!(padding, "", "No padding when preserve_length is false"); } } diff --git a/crates/common/src/integrations/nextjs/shared.rs b/crates/common/src/integrations/nextjs/shared.rs new file mode 100644 index 0000000..aedac84 --- /dev/null +++ b/crates/common/src/integrations/nextjs/shared.rs @@ -0,0 +1,229 @@ +//! Shared utilities for Next.js integration modules. + +use std::borrow::Cow; + +use once_cell::sync::Lazy; +use regex::{escape, Regex}; + +use crate::host_rewrite::rewrite_bare_host_at_boundaries; + +/// RSC push script call pattern for extracting payload string boundaries. +pub(crate) static RSC_PUSH_CALL_PATTERN: Lazy = Lazy::new(|| { + Regex::new( + r#"(?s)(?:(?:self|window)\.__next_f\.push|\(\s*(?:self|window)\.__next_f\s*=\s*(?:self|window)\.__next_f\s*\|\|\s*\[\]\s*\)\s*\.push)\(\[\s*1\s*,\s*(['"])"#, + ) + .expect("valid RSC push call regex") +}); + +/// Find the payload string boundaries within an RSC push script. +/// +/// Returns `Some((start, end))` where `start` is the position after the opening quote +/// and `end` is the position of the closing quote. +pub(crate) fn find_rsc_push_payload_range(script: &str) -> Option<(usize, usize)> { + let cap = RSC_PUSH_CALL_PATTERN.captures(script)?; + let quote_match = cap.get(1)?; + let quote = quote_match + .as_str() + .chars() + .next() + .expect("push call regex should capture a quote character"); + let payload_start = quote_match.end(); + + let bytes = script.as_bytes(); + let mut i = payload_start; + while i < bytes.len() { + if bytes[i] == b'\\' && i + 1 < bytes.len() { + i += 2; + } else if bytes[i] == b'\\' { + return None; + } else if bytes[i] == quote as u8 { + return Some((payload_start, i)); + } else { + i += 1; + } + } + + None +} + +// ============================================================================= +// URL Rewriting +// ============================================================================= + +/// Rewriter for URL patterns in RSC payloads. +/// +/// This rewrites all occurrences of origin URLs in content, including: +/// - Full URLs: `https://origin.example.com/path` or `http://origin.example.com/path` +/// - Protocol-relative: `//origin.example.com/path` +/// - Escaped variants: `\/\/origin.example.com` (JSON-escaped) +/// - Bare hostnames: `origin.example.com` (as JSON values) +/// +/// Use this for RSC T-chunk content where any origin URL should be rewritten. +/// For attribute-specific rewriting (e.g., only rewrite `"href"` values), use +/// the `UrlRewriter` in `script_rewriter.rs` instead. +pub(crate) struct RscUrlRewriter { + origin_host: String, + request_host: String, + request_scheme: String, + pattern: Regex, +} + +impl RscUrlRewriter { + pub(crate) fn new(origin_host: &str, request_host: &str, request_scheme: &str) -> Self { + let escaped_origin = escape(origin_host); + + // Match: + // - https://origin_host or http://origin_host + // - //origin_host (protocol-relative) + // - escaped variants inside JSON-in-JS strings (e.g., \/\/origin_host) + let pattern = Regex::new(&format!( + r#"(https?)?(:)?(\\\\\\\\\\\\\\\\//|\\\\\\\\//|\\/\\/|//){}"#, + escaped_origin + )) + .expect("valid RSC URL rewrite regex"); + + Self { + origin_host: origin_host.to_string(), + request_host: request_host.to_string(), + request_scheme: request_scheme.to_string(), + pattern, + } + } + + pub(crate) fn rewrite<'a>(&self, input: &'a str) -> Cow<'a, str> { + if !input.contains(&self.origin_host) { + return Cow::Borrowed(input); + } + + // Phase 1: Regex-based URL pattern rewriting (handles escaped slashes, schemes, etc.) + let replaced = self + .pattern + .replace_all(input, |caps: ®ex::Captures<'_>| { + let slashes = caps.get(3).map_or("//", |m| m.as_str()); + if caps.get(1).is_some() { + format!("{}:{}{}", self.request_scheme, slashes, self.request_host) + } else { + format!("{}{}", slashes, self.request_host) + } + }); + + // Phase 2: Handle bare host occurrences not matched by the URL regex + // (e.g., `siteProductionDomain`). Only check if regex made no changes, + // because if it did, we already know origin_host was present. + let text = match &replaced { + Cow::Borrowed(s) => *s, + Cow::Owned(s) => s.as_str(), + }; + + if !text.contains(&self.origin_host) { + return replaced; + } + + rewrite_bare_host_at_boundaries(text, &self.origin_host, &self.request_host) + .map(Cow::Owned) + .unwrap_or(replaced) + } + + pub(crate) fn rewrite_to_string(&self, input: &str) -> String { + self.rewrite(input).into_owned() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn finds_double_quoted_payload() { + let script = r#"self.__next_f.push([1,"hello world"])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "hello world"); + } + + #[test] + fn finds_single_quoted_payload() { + let script = r#"self.__next_f.push([1,'hello world'])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "hello world"); + } + + #[test] + fn finds_assignment_form() { + let script = r#"(self.__next_f=self.__next_f||[]).push([1,"payload"])"#; + let (start, end) = find_rsc_push_payload_range(script).expect("should find payload"); + assert_eq!(&script[start..end], "payload"); + } + + #[test] + fn returns_none_for_trailing_backslash() { + let script = r#"self.__next_f.push([1,"incomplete\"])"#; + assert!(find_rsc_push_payload_range(script).is_none()); + } + + #[test] + fn returns_none_for_unterminated_string() { + let script = r#"self.__next_f.push([1,"no closing quote"#; + assert!(find_rsc_push_payload_range(script).is_none()); + } + + // RscUrlRewriter tests + + #[test] + fn rsc_url_rewriter_rewrites_https_url() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"https://origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"https://proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_http_url() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "http"); + let input = r#"{"url":"http://origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"http://proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_protocol_relative_url() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"//origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"//proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_escaped_slashes() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"\/\/origin.example.com/path"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"url":"\/\/proxy.example.com/path"}"#); + } + + #[test] + fn rsc_url_rewriter_rewrites_bare_host() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"siteProductionDomain":"origin.example.com"}"#; + let result = rewriter.rewrite(input); + assert_eq!(result, r#"{"siteProductionDomain":"proxy.example.com"}"#); + } + + #[test] + fn rsc_url_rewriter_does_not_rewrite_partial_hostname() { + let rewriter = RscUrlRewriter::new("example.com", "proxy.example.com", "https"); + let input = r#"{"domain":"subexample.com"}"#; + let result = rewriter.rewrite(input); + // Should not rewrite because "example.com" is not a standalone host here + assert_eq!(result, r#"{"domain":"subexample.com"}"#); + } + + #[test] + fn rsc_url_rewriter_no_change_when_origin_not_present() { + let rewriter = RscUrlRewriter::new("origin.example.com", "proxy.example.com", "https"); + let input = r#"{"url":"https://other.example.com/path"}"#; + let result = rewriter.rewrite(input); + // Should return borrowed reference (no allocation) + assert!(matches!(result, Cow::Borrowed(_))); + assert_eq!(result, input); + } +} diff --git a/crates/common/src/integrations/registry.rs b/crates/common/src/integrations/registry.rs index b029d6b..819890d 100644 --- a/crates/common/src/integrations/registry.rs +++ b/crates/common/src/integrations/registry.rs @@ -1,5 +1,6 @@ +use std::any::Any; use std::collections::BTreeMap; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use async_trait::async_trait; use error_stack::Report; @@ -87,6 +88,82 @@ pub struct IntegrationScriptContext<'a> { pub request_host: &'a str, pub request_scheme: &'a str, pub origin_host: &'a str, + pub is_last_in_text_node: bool, + pub document_state: &'a IntegrationDocumentState, +} + +/// Per-document state shared between HTML/script rewriters and post-processors. +/// +/// This exists to support multi-phase HTML processing without requiring a second HTML parse. +#[derive(Clone, Default)] +pub struct IntegrationDocumentState { + inner: Arc>>>, +} + +impl std::fmt::Debug for IntegrationDocumentState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let keys: Vec<&'static str> = { + let guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.keys().copied().collect() + }; + f.debug_struct("IntegrationDocumentState") + .field("keys", &keys) + .finish() + } +} + +impl IntegrationDocumentState { + pub fn get(&self, integration_id: &'static str) -> Option> + where + T: Any + Send + Sync + 'static, + { + let guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.get(integration_id).and_then(|value| { + let cloned: Arc = Arc::clone(value); + cloned.downcast::().ok() + }) + } + + pub fn get_or_insert_with( + &self, + integration_id: &'static str, + init: impl FnOnce() -> T, + ) -> Arc + where + T: Any + Send + Sync + 'static, + { + let mut guard = self + .inner + .lock() + .expect("should lock integration document state"); + + if let Some(existing) = guard.get(integration_id) { + if let Ok(downcast) = Arc::clone(existing).downcast::() { + return downcast; + } + } + + let value: Arc = Arc::new(init()); + guard.insert( + integration_id, + Arc::clone(&value) as Arc, + ); + value + } + + pub fn clear(&self) { + let mut guard = self + .inner + .lock() + .expect("should lock integration document state"); + guard.clear(); + } } /// Describes an HTTP endpoint exposed by an integration. @@ -255,6 +332,7 @@ pub struct IntegrationHtmlContext<'a> { pub request_host: &'a str, pub request_scheme: &'a str, pub origin_host: &'a str, + pub document_state: &'a IntegrationDocumentState, } /// Trait for integration-provided HTML post-processors. @@ -270,7 +348,7 @@ pub trait IntegrationHtmlPostProcessor: Send + Sync { /// every HTML response when the integration is enabled. fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { let _ = (html, ctx); - true + false } /// Post-process complete HTML content. @@ -664,6 +742,35 @@ mod tests { } } + struct NoopHtmlPostProcessor; + + impl IntegrationHtmlPostProcessor for NoopHtmlPostProcessor { + fn integration_id(&self) -> &'static str { + "noop" + } + + fn post_process(&self, _html: &mut String, _ctx: &IntegrationHtmlContext<'_>) -> bool { + false + } + } + + #[test] + fn default_html_post_processor_should_process_is_false() { + let processor = NoopHtmlPostProcessor; + let document_state = IntegrationDocumentState::default(); + let ctx = IntegrationHtmlContext { + request_host: "proxy.example.com", + request_scheme: "https", + origin_host: "origin.example.com", + document_state: &document_state, + }; + + assert!( + !processor.should_process("", &ctx), + "Default `should_process` should be false to avoid running post-processing unexpectedly" + ); + } + #[test] fn test_exact_route_matching() { let routes = vec![( diff --git a/crates/common/src/lib.rs b/crates/common/src/lib.rs index c8df91d..b9f5fd5 100644 --- a/crates/common/src/lib.rs +++ b/crates/common/src/lib.rs @@ -29,6 +29,7 @@ pub mod creative; pub mod error; pub mod fastly_storage; pub mod geo; +pub(crate) mod host_rewrite; pub mod html_processor; pub mod http_util; pub mod integrations; diff --git a/crates/common/src/rsc_flight.rs b/crates/common/src/rsc_flight.rs index d850010..309e950 100644 --- a/crates/common/src/rsc_flight.rs +++ b/crates/common/src/rsc_flight.rs @@ -1,5 +1,6 @@ use std::io; +use crate::host_rewrite::rewrite_bare_host_at_boundaries; use crate::streaming_processor::StreamProcessor; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -115,7 +116,9 @@ impl RscFlightUrlRewriter { &self.origin_protocol_relative, &self.request_protocol_relative, ); - rewritten = rewritten.replace(&self.origin_host, &self.request_host); + rewritten = + rewrite_bare_host_at_boundaries(&rewritten, &self.origin_host, &self.request_host) + .unwrap_or(rewritten); rewritten.into_bytes() } @@ -364,4 +367,25 @@ mod tests { "Rewriter should handle T rows split across chunks" ); } + + #[test] + fn bare_host_rewrite_respects_hostname_boundaries() { + let input = b"0:[\"cdn.origin.example.com\",\"notorigin.example.com\",\"origin.example.com.uk\",\"origin.example.com/news\",\"origin.example.com\"]\n"; + + let mut rewriter = RscFlightUrlRewriter::new( + "origin.example.com", + "https://origin.example.com", + "proxy.example.com", + "https", + ); + + let output = run_rewriter(&mut rewriter, input, 5); + let output_str = String::from_utf8(output).expect("should be valid UTF-8"); + + assert_eq!( + output_str, + "0:[\"cdn.origin.example.com\",\"notorigin.example.com\",\"origin.example.com.uk\",\"proxy.example.com/news\",\"proxy.example.com\"]\n", + "Output should only rewrite bare host occurrences" + ); + } } diff --git a/docs/RSC_HYDRATION_FINDINGS.md b/docs/RSC_HYDRATION_FINDINGS.md index 66de477..1f7c8c8 100644 --- a/docs/RSC_HYDRATION_FINDINGS.md +++ b/docs/RSC_HYDRATION_FINDINGS.md @@ -176,16 +176,13 @@ The HTML rewriter runs in a streaming pipeline (decompress → rewrite → recom ### Phase 2: HTML Post-Processing (cross-script RSC) -At end-of-document, a post-processor handles cross-script T-chunks: +At end-of-document, the Next.js integration rewrites cross-script T-chunks **without a second HTML parse**: -1. **Finds all RSC push scripts** in the complete HTML -2. **Combines their payloads** with markers -3. **Processes T-chunks across the combined content**, skipping markers when counting bytes -4. **Rewrites URLs and recalculates lengths** for the combined content -5. **Splits back on markers** to get individual rewritten payloads -6. **Rebuilds the HTML** with rewritten scripts +1. During the initial `lol_html` pass, `NextJsRscPlaceholderRewriter` replaces each `__next_f.push([1,"..."])` payload string with a placeholder token and records the original payloads in `IntegrationDocumentState`. +2. `NextJsHtmlPostProcessor` rewrites the recorded payload strings using the marker-based cross-script algorithm (combine → rewrite → split). +3. `NextJsHtmlPostProcessor` substitutes the placeholders in the final HTML with the rewritten payload strings. -This phase is gated by a cheap `should_process` preflight so non‑Next.js pages do not pay the extra pass ([html_post_process.rs:36](crates/common/src/integrations/nextjs/html_post_process.rs#L36)). +This phase is gated by `IntegrationHtmlPostProcessor::should_process` checking whether any RSC payloads were recorded, so non‑Next.js pages do not pay the post-processing cost ([html_post_process.rs:41](crates/common/src/integrations/nextjs/html_post_process.rs#L41)). ### Marker-Based Cross-Script Processing @@ -199,7 +196,7 @@ The marker `\x00SPLIT\x00` is chosen because: - Easily identifiable for splitting - Won't be confused with any escape sequence -**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:417](crates/common/src/integrations/nextjs/rsc.rs#L417) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11) and combine/split logic in [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) #### Step 2: Find T-Chunks Across Combined Content @@ -207,7 +204,7 @@ Scan the combined stream for `ID:T,` headers, then consume exactly ` The key insight: markers don't count toward byte consumption. When a T-chunk declares 1679 bytes, we consume 1679 bytes of actual content, skipping over any markers we encounter. -**Implementation:** T-chunk discovery at [rsc.rs:198](crates/common/src/integrations/nextjs/rsc.rs#L198) with marker-aware escape sequence iterator at [rsc.rs:68](crates/common/src/integrations/nextjs/rsc.rs#L68) +**Implementation:** T-chunk discovery at [rsc.rs:202](crates/common/src/integrations/nextjs/rsc.rs#L202) with marker-aware escape sequence iterator at [rsc.rs:72](crates/common/src/integrations/nextjs/rsc.rs#L72) #### Step 3: Rewrite URLs and Recalculate Lengths @@ -234,11 +231,13 @@ The post-processing is implemented as an integration hook, allowing other integr ### Trait Definition -**Implementation:** Context at [registry.rs:254](crates/common/src/integrations/registry.rs#L254) and trait at [registry.rs:263](crates/common/src/integrations/registry.rs#L263) +**Implementation:** Per-document state at [registry.rs:99](crates/common/src/integrations/registry.rs#L99), context at [registry.rs:331](crates/common/src/integrations/registry.rs#L331), and trait at [registry.rs:341](crates/common/src/integrations/registry.rs#L341) + +**Note:** `IntegrationHtmlPostProcessor::should_process` defaults to `false`, so integrations must explicitly opt in to post-processing via a cheap preflight check. ### Registration -**Implementation:** Next.js registers its HTML post-processor in [mod.rs:47](crates/common/src/integrations/nextjs/mod.rs#L47) +**Implementation:** Next.js registers its placeholder rewriter + HTML post-processor when enabled in [mod.rs:86](crates/common/src/integrations/nextjs/mod.rs#L86) ### Execution in HTML Processor @@ -250,11 +249,13 @@ The post-processing is implemented as an integration hook, allowing other integr `T`-chunk lengths use the **unescaped** byte count of the payload (after decoding JavaScript string escapes). Correct handling requires: -- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:33](crates/common/src/integrations/nextjs/rsc.rs#L33) -- Counting unescaped bytes: [rsc.rs:162](crates/common/src/integrations/nextjs/rsc.rs#L162) -- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:167](crates/common/src/integrations/nextjs/rsc.rs#L167) -- Marker-aware byte length calculation for cross-script processing: [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) -- Size-limited combined payload allocation (10 MB max): [rsc.rs:395](crates/common/src/integrations/nextjs/rsc.rs#L395) +- Shared escape sequence iterator handles `\\n`, `\\xHH`, `\\uHHHH`, and surrogate pairs: [rsc.rs:37](crates/common/src/integrations/nextjs/rsc.rs#L37) +- Counting unescaped bytes: [rsc.rs:166](crates/common/src/integrations/nextjs/rsc.rs#L166) +- Consuming exactly _N unescaped bytes_ to locate the end of a declared `T` chunk: [rsc.rs:171](crates/common/src/integrations/nextjs/rsc.rs#L171) +- Marker-aware byte length calculation for cross-script processing: [rsc.rs:327](crates/common/src/integrations/nextjs/rsc.rs#L327) +- Size-limited combined payload allocation (default 10 MB, configurable via `integrations.nextjs.max_combined_payload_bytes`): [rsc.rs:378](crates/common/src/integrations/nextjs/rsc.rs#L378) +- Fail-safe: if `T`-chunk parsing fails (unreasonable length or truncated content), Trusted Server skips rewriting to avoid breaking hydration: [rsc.rs:202](crates/common/src/integrations/nextjs/rsc.rs#L202) +- If the size limit is exceeded and cross-script T-chunks are present, Trusted Server skips rewriting rather than risk breaking hydration: [rsc.rs:410](crates/common/src/integrations/nextjs/rsc.rs#L410) --- @@ -273,7 +274,7 @@ The solution handles multiple URL formats in RSC content: ### Regex Pattern -**Implementation:** Regex-based rewriting in [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) +**Implementation:** Regex-based rewriting in [shared.rs:62](crates/common/src/integrations/nextjs/shared.rs#L62) This pattern handles: @@ -293,19 +294,13 @@ This pattern handles: │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ -│ PHASE 1: Streaming HTML Processing │ -│ │ -│ ┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ Process Script 1 │ │ Process Script 2 │ │ Process Script N │ │ -│ │ │ │ │ │ │ │ -│ │ - Extract payload│ │ - Extract payload│ │ - Extract payload│ │ -│ │ - Find T-chunks │ │ - Find T-chunks │ │ - Find T-chunks │ │ -│ │ - Rewrite URLs │ │ - Rewrite URLs │ │ - Rewrite URLs │ │ -│ │ - Update lengths │ │ - Update lengths │ │ - Update lengths │ │ -│ └──────────────────┘ └──────────────────┘ └──────────────────┘ │ +│ PHASE 1: HTML Rewrite (lol_html) │ │ │ -│ Works for self-contained T-chunks, but cross-script T-chunks may have │ -│ incorrect lengths at this point. │ +│ - Rewrite HTML attributes (href/src/etc.) │ +│ - Rewrite Pages Router data (`__NEXT_DATA__`) │ +│ - For App Router RSC push scripts (`__next_f.push([1,\"...\"])`): │ +│ * Replace payload string with placeholder token │ +│ * Record original payloads (IntegrationDocumentState) │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ @@ -313,22 +308,8 @@ This pattern handles: │ PHASE 2: HTML Post-Processing │ │ (Integration Hook: NextJsHtmlPostProcessor) │ │ │ -│ 1. Find all RSC push scripts in complete HTML │ -│ │ -│ 2. Extract payloads and combine with markers: │ -│ "payload1\x00SPLIT\x00payload2\x00SPLIT\x00payload3..." │ -│ │ -│ 3. Find T-chunks across combined content (markers don't count as bytes) │ -│ │ -│ 4. For each T-chunk: │ -│ - Extract content (may span markers) │ -│ - Rewrite URLs │ -│ - Calculate new byte length (excluding markers) │ -│ - Write new header: ID:T, │ -│ │ -│ 5. Split on markers to get individual payloads │ -│ │ -│ 6. Rebuild HTML with corrected scripts │ +│ - Rewrite recorded payloads (marker-based cross-script T-chunk logic) │ +│ - Substitute placeholders with rewritten payload strings │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ @@ -419,35 +400,42 @@ Because post-processing runs inside the HTML processor (before recompression), ` ## Implementation Files -| File | Purpose | -| ------------------------------------------------------------ | --------------------------------------------------------- | -| `crates/common/src/integrations/nextjs/mod.rs` | Next.js integration config + registration | -| `crates/common/src/integrations/nextjs/html_post_process.rs` | HTML post-processing for cross-script RSC | -| `crates/common/src/integrations/nextjs/rsc.rs` | RSC T-chunk parsing + URL rewriting | -| `crates/common/src/integrations/nextjs/script_rewriter.rs` | Script rewrites (`__NEXT_DATA__`, inline `__next_f.push`) | -| `crates/common/src/rsc_flight.rs` | Flight response rewriting (`text/x-component`) | -| `crates/common/src/integrations/registry.rs` | `IntegrationHtmlPostProcessor` trait | -| `crates/common/src/integrations/mod.rs` | Module exports | -| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | -| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | -| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | +| File | Purpose | +| ------------------------------------------------------------ | -------------------------------------------------------- | +| `crates/common/src/integrations/nextjs/mod.rs` | Next.js integration config + registration | +| `crates/common/src/integrations/nextjs/html_post_process.rs` | HTML post-processing for cross-script RSC | +| `crates/common/src/integrations/nextjs/rsc_placeholders.rs` | RSC placeholder insertion + payload capture (App Router) | +| `crates/common/src/integrations/nextjs/rsc.rs` | RSC T-chunk parsing + URL rewriting | +| `crates/common/src/integrations/nextjs/script_rewriter.rs` | Script rewrites (`__NEXT_DATA__`) | +| `crates/common/src/integrations/nextjs/shared.rs` | Shared regex patterns + payload parsing utilities | +| `crates/common/src/rsc_flight.rs` | Flight response rewriting (`text/x-component`) | +| `crates/common/src/integrations/registry.rs` | Integration traits + `IntegrationDocumentState` | +| `crates/common/src/integrations/mod.rs` | Module exports | +| `crates/common/src/html_processor.rs` | HTML rewriting + post-processor invocation | +| `crates/common/src/publisher.rs` | Response routing + streaming pipeline setup | +| `crates/common/src/streaming_processor.rs` | Compression transforms + `StreamProcessor` | ### Key Functions (Next.js integration) -| Symbol | Location | Purpose | -| ---------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------- | -| `extract_rsc_push_payload` | [script_rewriter.rs:94](crates/common/src/integrations/nextjs/script_rewriter.rs#L94) | Extract string from `self.__next_f.push([1, '...'])` | -| `EscapeSequenceIter` | [rsc.rs:33](crates/common/src/integrations/nextjs/rsc.rs#L33) | Shared iterator for escape sequence parsing | -| `calculate_unescaped_byte_length` | [rsc.rs:162](crates/common/src/integrations/nextjs/rsc.rs#L162) | Count unescaped bytes with escape handling | -| `consume_unescaped_bytes` | [rsc.rs:167](crates/common/src/integrations/nextjs/rsc.rs#L167) | Advance through string consuming N bytes | -| `find_tchunks` | [rsc.rs:256](crates/common/src/integrations/nextjs/rsc.rs#L256) | Find T-chunks in a single payload | -| `RscUrlRewriter` | [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) | Regex URL rewriting (compiled once per rewrite call) | -| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:343](crates/common/src/integrations/nextjs/rsc.rs#L343) | Single-payload T-chunk processing | -| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) | Count unescaped bytes, excluding markers | -| `find_tchunks_with_markers` | [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260) | Find T-chunks in marker-combined content | -| `rewrite_rsc_scripts_combined` | [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) | Cross-script T-chunk processing | -| `find_rsc_push_scripts` | [html_post_process.rs:67](crates/common/src/integrations/nextjs/html_post_process.rs#L67) | Find all RSC scripts in HTML | -| `post_process_rsc_html_in_place` | [html_post_process.rs:135](crates/common/src/integrations/nextjs/html_post_process.rs#L135) | Complete HTML post-processing | +| Symbol | Location | Purpose | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------- | ---------------------------------------------------------------- | +| `NextJsRscPlaceholderRewriter` | [rsc_placeholders.rs:52](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L52) | Replace RSC payload strings with placeholders + record originals | +| `NextJsHtmlPostProcessor::post_process` | [html_post_process.rs:52](crates/common/src/integrations/nextjs/html_post_process.rs#L52) | Rewrite recorded payloads + substitute placeholders | +| `substitute_rsc_payload_placeholders` | [html_post_process.rs:116](crates/common/src/integrations/nextjs/html_post_process.rs#L116) | Substitute placeholder tokens in HTML | +| `IntegrationDocumentState` | [registry.rs:99](crates/common/src/integrations/registry.rs#L99) | Per-document state shared across phases | +| `EscapeSequenceIter` | [rsc.rs:37](crates/common/src/integrations/nextjs/rsc.rs#L37) | Shared iterator for escape sequence parsing | +| `TChunkInfo` | [rsc.rs:190](crates/common/src/integrations/nextjs/rsc.rs#L190) | T-chunk position info (stores `id_end` position, not String) | +| `calculate_unescaped_byte_length` | [rsc.rs:166](crates/common/src/integrations/nextjs/rsc.rs#L166) | Count unescaped bytes with escape handling | +| `consume_unescaped_bytes` | [rsc.rs:171](crates/common/src/integrations/nextjs/rsc.rs#L171) | Advance through string consuming N bytes | +| `find_tchunks` | [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260) | Find T-chunks in a single payload | +| `RscUrlRewriter` | [shared.rs:62](crates/common/src/integrations/nextjs/shared.rs#L62) | Regex URL rewriting for RSC payloads | +| `UrlRewriter` (script) | [script_rewriter.rs:91](crates/common/src/integrations/nextjs/script_rewriter.rs#L91) | Attribute-specific URL rewriting for `__NEXT_DATA__` (combined regex) | +| `rewrite_rsc_tchunks_with_rewriter` | [rsc.rs:272](crates/common/src/integrations/nextjs/rsc.rs#L272) | Single-payload T-chunk processing | +| `calculate_unescaped_byte_length_skip_markers` | [rsc.rs:314](crates/common/src/integrations/nextjs/rsc.rs#L314) | Count unescaped bytes, excluding markers | +| `find_tchunks_with_markers` | [rsc.rs:264](crates/common/src/integrations/nextjs/rsc.rs#L264) | Find T-chunks in marker-combined content | +| `rewrite_rsc_scripts_combined` | [rsc.rs:321](crates/common/src/integrations/nextjs/rsc.rs#L321) | Cross-script T-chunk processing | +| `find_rsc_push_scripts` | [html_post_process.rs:171](crates/common/src/integrations/nextjs/html_post_process.rs#L171) | (Deprecated) Find RSC scripts in HTML | +| `post_process_rsc_html_in_place` | [html_post_process.rs:287](crates/common/src/integrations/nextjs/html_post_process.rs#L287) | (Deprecated) Full HTML scan + rewrite | --- @@ -464,10 +452,10 @@ If the proxy URL is significantly longer than the original, T-chunk content grow The post-processing phase requires: -1. Parsing complete HTML to find scripts (O(n) string scan) +1. Placeholder insertion during the initial `lol_html` pass (payload capture) 2. Combining payloads (memory allocation) 3. Regex matching for T-chunks -4. String rebuilding +4. One pass placeholder substitution over the final HTML string For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. @@ -481,25 +469,25 @@ For typical pages with 100-300 RSC scripts, this adds ~1-5ms to processing time. ## Deconstruction and Reconstruction Logic -The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main entry point is `post_process_rsc_html_in_place()` at [html_post_process.rs:136](crates/common/src/integrations/nextjs/html_post_process.rs#L136). +The RSC rewriting process involves carefully deconstructing RSC payloads, rewriting URLs, and reconstructing them with correct T-chunk lengths. The main runtime entry point is `NextJsHtmlPostProcessor::post_process()` at [html_post_process.rs:52](crates/common/src/integrations/nextjs/html_post_process.rs#L52), operating on payloads captured during phase 1 by `NextJsRscPlaceholderRewriter` ([rsc_placeholders.rs:52](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L52)). -### Step 1: Find RSC Push Scripts +### Step 1: Capture RSC Payloads (placeholders) -Find all `self.__next_f.push([1, "..."])` scripts in the HTML and extract their payloads. +During the initial HTML rewrite pass, replace each `self.__next_f.push([1, "..."])` payload string with a placeholder token and record the original payload strings in `IntegrationDocumentState`. -**Implementation:** `find_rsc_push_scripts()` at [html_post_process.rs:67](crates/common/src/integrations/nextjs/html_post_process.rs#L67) +**Implementation:** `NextJsRscPlaceholderRewriter::rewrite()` at [rsc_placeholders.rs:71](crates/common/src/integrations/nextjs/rsc_placeholders.rs#L71) and `IntegrationDocumentState` at [registry.rs:99](crates/common/src/integrations/registry.rs#L99) ### Step 2: Combine Payloads with Markers Join all payloads with a marker string (`\x00SPLIT\x00`) that cannot appear in valid JSON/RSC content. This allows T-chunks to be processed across script boundaries while preserving the ability to split back later. -**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) +**Implementation:** Marker constant at [rsc.rs:11](crates/common/src/integrations/nextjs/rsc.rs#L11), combining logic in `rewrite_rsc_scripts_combined()` at [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) ### Step 3: Find T-Chunks Across Combined Content Parse T-chunk headers (`ID:T,`) and consume exactly the declared number of unescaped bytes, skipping over markers. -**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:260](crates/common/src/integrations/nextjs/rsc.rs#L260), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:68](crates/common/src/integrations/nextjs/rsc.rs#L68) +**Implementation:** `find_tchunks_with_markers()` at [rsc.rs:267](crates/common/src/integrations/nextjs/rsc.rs#L267), using `EscapeSequenceIter::from_position_with_marker()` at [rsc.rs:72](crates/common/src/integrations/nextjs/rsc.rs#L72) ### Step 4: Rewrite URLs in T-Chunk Content @@ -510,25 +498,25 @@ Rewrite all URL patterns in the T-chunk content: - `\\/\\/origin.example.com` → `\\/\\/proxy.example.com` (JSON-escaped) - `\\\\//origin.example.com` → `\\\\//proxy.example.com` (double-escaped) -**Implementation:** `RscUrlRewriter::rewrite()` at [rsc.rs:301](crates/common/src/integrations/nextjs/rsc.rs#L301) +**Implementation:** `RscUrlRewriter::rewrite()` at [shared.rs:91](crates/common/src/integrations/nextjs/shared.rs#L91) ### Step 5: Recalculate T-Chunk Length Calculate the new unescaped byte length (excluding markers) and update the T-chunk header with the new hex length. -**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:385](crates/common/src/integrations/nextjs/rsc.rs#L385) +**Implementation:** `calculate_unescaped_byte_length_skip_markers()` at [rsc.rs:317](crates/common/src/integrations/nextjs/rsc.rs#L317) ### Step 6: Split Back on Markers Split the combined rewritten content back into individual payloads on the marker boundaries. Each payload corresponds to one original script, with T-chunk lengths now correct across script boundaries. -**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:392](crates/common/src/integrations/nextjs/rsc.rs#L392) +**Implementation:** Part of `rewrite_rsc_scripts_combined()` at [rsc.rs:324](crates/common/src/integrations/nextjs/rsc.rs#L324) ### Step 7: Reconstruct HTML -Replace each original script with its rewritten version in the HTML. +Substitute placeholder tokens in the final HTML with the rewritten payload strings (no HTML re-parse). -**Implementation:** Part of `post_process_rsc_html_in_place()` at [html_post_process.rs:135](crates/common/src/integrations/nextjs/html_post_process.rs#L135) +**Implementation:** `substitute_rsc_payload_placeholders()` at [html_post_process.rs:116](crates/common/src/integrations/nextjs/html_post_process.rs#L116) ### Visual Example diff --git a/trusted-server.toml b/trusted-server.toml index caf7e65..1eee922 100644 --- a/trusted-server.toml +++ b/trusted-server.toml @@ -45,6 +45,8 @@ debug = false [integrations.nextjs] enabled = false rewrite_attributes = ["href", "link", "siteBaseUrl", "siteProductionDomain", "url"] +# Maximum combined payload size for cross-script RSC processing (bytes). Default is 10 MB. +max_combined_payload_bytes = 10485760 [integrations.testlight] endpoint = "https://testlight.example/openrtb2/auction" From fd5b91d2d20f975ebdafbb8179df6c184c48fe8c Mon Sep 17 00:00:00 2001 From: Aram Grigoryan <132480+aram356@users.noreply.github.com> Date: Wed, 17 Dec 2025 01:18:19 -0800 Subject: [PATCH 11/11] Fixed bug --- .../integrations/nextjs/html_post_process.rs | 60 ++++--- crates/common/src/integrations/nextjs/mod.rs | 69 ++++++++ .../integrations/nextjs/rsc_placeholders.rs | 151 +++++++----------- 3 files changed, 169 insertions(+), 111 deletions(-) diff --git a/crates/common/src/integrations/nextjs/html_post_process.rs b/crates/common/src/integrations/nextjs/html_post_process.rs index 1171331..f379872 100644 --- a/crates/common/src/integrations/nextjs/html_post_process.rs +++ b/crates/common/src/integrations/nextjs/html_post_process.rs @@ -30,38 +30,62 @@ impl IntegrationHtmlPostProcessor for NextJsHtmlPostProcessor { } fn should_process(&self, html: &str, ctx: &IntegrationHtmlContext<'_>) -> bool { - let _ = html; if !self.config.enabled || self.config.rewrite_attributes.is_empty() { return false; } - let Some(state) = ctx + // Check if we have captured placeholders from streaming + if let Some(state) = ctx .document_state .get::>(NEXTJS_INTEGRATION_ID) - else { - return false; - }; + { + let guard = state.lock().unwrap_or_else(|e| e.into_inner()); + if !guard.payloads.is_empty() { + return true; + } + } - let guard = state.lock().unwrap_or_else(|e| e.into_inner()); - !guard.payloads.is_empty() + // Also check if HTML contains RSC scripts that weren't captured during streaming + // (e.g., fragmented scripts that we skipped during the streaming pass) + html.contains("__next_f.push") && html.contains(ctx.origin_host) } fn post_process(&self, html: &mut String, ctx: &IntegrationHtmlContext<'_>) -> bool { - let Some(state) = ctx + // Try to get payloads captured during streaming (placeholder approach) + let payloads = ctx .document_state .get::>(NEXTJS_INTEGRATION_ID) - else { - return false; - }; - - let payloads = { - let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); - guard.take_payloads() - }; - if payloads.is_empty() { - return false; + .map(|state| { + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + guard.take_payloads() + }) + .unwrap_or_default(); + + if !payloads.is_empty() { + // Placeholder approach: substitute placeholders with rewritten payloads + return self.substitute_placeholders(html, ctx, payloads); } + // Fallback: re-parse HTML to find RSC scripts that weren't captured during streaming + // (e.g., fragmented scripts that we skipped during the streaming pass) + post_process_rsc_html_in_place_with_limit( + html, + ctx.origin_host, + ctx.request_host, + ctx.request_scheme, + self.config.max_combined_payload_bytes, + ) + } +} + +impl NextJsHtmlPostProcessor { + /// Substitute placeholders with rewritten payloads (fast path for unfragmented scripts). + fn substitute_placeholders( + &self, + html: &mut String, + ctx: &IntegrationHtmlContext<'_>, + payloads: Vec, + ) -> bool { let payload_refs: Vec<&str> = payloads.iter().map(String::as_str).collect(); let mut rewritten_payloads = rewrite_rsc_scripts_combined_with_limit( payload_refs.as_slice(), diff --git a/crates/common/src/integrations/nextjs/mod.rs b/crates/common/src/integrations/nextjs/mod.rs index 9a3648e..d79e98c 100644 --- a/crates/common/src/integrations/nextjs/mod.rs +++ b/crates/common/src/integrations/nextjs/mod.rs @@ -419,4 +419,73 @@ mod tests { final_html ); } + + #[test] + fn html_processor_preserves_non_rsc_scripts_with_chunked_streaming() { + // Regression test: ensure non-RSC scripts are preserved when streamed alongside RSC scripts. + // With small chunk sizes, scripts get fragmented and the buffering logic must correctly + // handle non-RSC scripts without corrupting them. + let html = r#" + + + +"#; + + let mut settings = create_test_settings(); + settings + .integrations + .insert_config( + "nextjs", + &json!({ + "enabled": true, + "rewrite_attributes": ["url"], + }), + ) + .expect("should update nextjs config"); + let registry = IntegrationRegistry::new(&settings); + let config = config_from_settings(&settings, ®istry); + let processor = create_html_processor(config); + // Use small chunk size to force fragmentation + let pipeline_config = PipelineConfig { + input_compression: Compression::None, + output_compression: Compression::None, + chunk_size: 16, + }; + let mut pipeline = StreamingPipeline::new(pipeline_config, processor); + + let mut output = Vec::new(); + pipeline + .process(Cursor::new(html.as_bytes()), &mut output) + .unwrap(); + let final_html = String::from_utf8_lossy(&output); + + // Non-RSC scripts should be preserved + assert!( + final_html.contains(r#"console.log("hello world");"#), + "First non-RSC script should be preserved intact. Output: {}", + final_html + ); + assert!( + final_html.contains("window.analytics"), + "Third non-RSC script should be preserved. Output: {}", + final_html + ); + assert!( + final_html.contains("track: function(e)"), + "Third non-RSC script content should be intact. Output: {}", + final_html + ); + + // RSC scripts should be rewritten + assert!( + final_html.contains("test.example.com"), + "RSC URL should be rewritten. Output: {}", + final_html + ); + assert!( + !final_html.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), + "No placeholders should remain. Output: {}", + final_html + ); + } } diff --git a/crates/common/src/integrations/nextjs/rsc_placeholders.rs b/crates/common/src/integrations/nextjs/rsc_placeholders.rs index 84ac650..e26a98f 100644 --- a/crates/common/src/integrations/nextjs/rsc_placeholders.rs +++ b/crates/common/src/integrations/nextjs/rsc_placeholders.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::sync::{Arc, Mutex}; use crate::integrations::{ @@ -11,36 +10,18 @@ use super::{NextJsIntegrationConfig, NEXTJS_INTEGRATION_ID}; pub(super) const RSC_PAYLOAD_PLACEHOLDER_PREFIX: &str = "__ts_rsc_payload_"; pub(super) const RSC_PAYLOAD_PLACEHOLDER_SUFFIX: &str = "__"; +/// State for RSC placeholder-based rewriting. +/// +/// Stores RSC payloads extracted during streaming for later rewriting during post-processing. +/// Only unfragmented RSC scripts are processed during streaming; fragmented scripts are +/// handled by the post-processor which re-parses the final HTML. #[derive(Default)] pub(super) struct NextJsRscPostProcessState { pub(super) payloads: Vec, - buffer: String, - buffering: bool, } impl NextJsRscPostProcessState { - fn buffer_chunk(&mut self, chunk: &str) { - if !self.buffering { - self.buffering = true; - self.buffer.clear(); - } - self.buffer.push_str(chunk); - } - - /// Returns the complete script content, either borrowed from input or owned from buffer. - fn take_script_or_borrow<'a>(&mut self, chunk: &'a str) -> Cow<'a, str> { - if self.buffering { - self.buffer.push_str(chunk); - self.buffering = false; - Cow::Owned(std::mem::take(&mut self.buffer)) - } else { - Cow::Borrowed(chunk) - } - } - pub(super) fn take_payloads(&mut self) -> Vec { - self.buffer.clear(); - self.buffering = false; std::mem::take(&mut self.payloads) } } @@ -73,84 +54,48 @@ impl IntegrationScriptRewriter for NextJsRscPlaceholderRewriter { return ScriptRewriteAction::keep(); } + // Only process complete (unfragmented) scripts during streaming. + // Fragmented scripts are handled by the post-processor which re-parses the final HTML. + // This avoids corrupting non-RSC scripts that happen to be fragmented during streaming. if !ctx.is_last_in_text_node { - if let Some(existing) = ctx - .document_state - .get::>(NEXTJS_INTEGRATION_ID) - { - let mut guard = existing.lock().unwrap_or_else(|e| e.into_inner()); - if guard.buffering { - guard.buffer_chunk(content); - return ScriptRewriteAction::remove_node(); - } - } - - let trimmed = content.trim_start(); - if trimmed.starts_with('{') || trimmed.starts_with('[') { - // Avoid interfering with other inline JSON scripts (e.g. `__NEXT_DATA__`, JSON-LD). - return ScriptRewriteAction::keep(); - } - - let state = ctx - .document_state - .get_or_insert_with(NEXTJS_INTEGRATION_ID, || { - Mutex::new(NextJsRscPostProcessState::default()) - }); - let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); - guard.buffer_chunk(content); - return ScriptRewriteAction::remove_node(); - } - - if !content.contains("__next_f") - && ctx - .document_state - .get::>(NEXTJS_INTEGRATION_ID) - .is_none() - { + // Script is fragmented - skip placeholder processing. + // The post-processor will handle RSC scripts at end-of-document. return ScriptRewriteAction::keep(); } - let state = ctx - .document_state - .get_or_insert_with(NEXTJS_INTEGRATION_ID, || { - Mutex::new(NextJsRscPostProcessState::default()) - }); - let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); - let script = guard.take_script_or_borrow(content); - let was_buffered = matches!(script, Cow::Owned(_)); - - if !script.contains("__next_f") { - if was_buffered { - return ScriptRewriteAction::replace(script.into_owned()); - } + // Quick check: skip scripts that can't be RSC payloads + if !content.contains("__next_f") { return ScriptRewriteAction::keep(); } - let Some((payload_start, payload_end)) = find_rsc_push_payload_range(&script) else { - if was_buffered { - return ScriptRewriteAction::replace(script.into_owned()); - } + let Some((payload_start, payload_end)) = find_rsc_push_payload_range(content) else { + // Contains __next_f but doesn't match RSC push pattern - leave unchanged return ScriptRewriteAction::keep(); }; if payload_start > payload_end - || payload_end > script.len() - || !script.is_char_boundary(payload_start) - || !script.is_char_boundary(payload_end) + || payload_end > content.len() + || !content.is_char_boundary(payload_start) + || !content.is_char_boundary(payload_end) { - if was_buffered { - return ScriptRewriteAction::replace(script.into_owned()); - } return ScriptRewriteAction::keep(); } + // Insert placeholder for this RSC payload and store original for post-processing + let state = ctx + .document_state + .get_or_insert_with(NEXTJS_INTEGRATION_ID, || { + Mutex::new(NextJsRscPostProcessState::default()) + }); + let mut guard = state.lock().unwrap_or_else(|e| e.into_inner()); + let placeholder_index = guard.payloads.len(); let placeholder = rsc_payload_placeholder(placeholder_index); guard .payloads - .push(script[payload_start..payload_end].to_string()); + .push(content[payload_start..payload_end].to_string()); - let mut rewritten = script.into_owned(); + let mut rewritten = content.to_string(); rewritten.replace_range(payload_start..payload_end, &placeholder); ScriptRewriteAction::replace(rewritten) } @@ -211,32 +156,52 @@ mod tests { } #[test] - fn buffers_fragmented_scripts_and_emits_single_replacement() { + fn skips_fragmented_scripts_for_post_processor_handling() { + // Fragmented scripts are not processed during streaming - they're passed through + // unchanged and handled by the post-processor which re-parses the final HTML. let state = IntegrationDocumentState::default(); let rewriter = NextJsRscPlaceholderRewriter::new(test_config()); let first = "self.__next_f.push([1,\"https://origin.example.com"; let second = "/page\"])"; + // Intermediate chunk should be kept (not processed) let action_first = rewriter.rewrite(first, &ctx(false, &state)); assert_eq!( action_first, - ScriptRewriteAction::RemoveNode, - "Intermediate chunk should be removed" + ScriptRewriteAction::Keep, + "Intermediate chunk should be kept unchanged" ); + // Final chunk should also be kept since it doesn't contain the full RSC pattern let action_second = rewriter.rewrite(second, &ctx(true, &state)); - let ScriptRewriteAction::Replace(rewritten) = action_second else { - panic!("Final chunk should be replaced with combined output"); - }; + assert_eq!( + action_second, + ScriptRewriteAction::Keep, + "Final chunk of fragmented script should be kept" + ); + // No payloads should be stored - post-processor will handle this assert!( - rewritten.contains(RSC_PAYLOAD_PLACEHOLDER_PREFIX), - "Combined output should include placeholder. Got: {rewritten}" + state + .get::>(NEXTJS_INTEGRATION_ID) + .is_none(), + "No RSC state should be created for fragmented scripts" ); - assert!( - rewritten.contains("self.__next_f.push"), - "Combined output should keep the push call. Got: {rewritten}" + } + + #[test] + fn skips_non_rsc_scripts() { + let state = IntegrationDocumentState::default(); + let rewriter = NextJsRscPlaceholderRewriter::new(test_config()); + + let script = r#"console.log("hello world");"#; + let action = rewriter.rewrite(script, &ctx(true, &state)); + + assert_eq!( + action, + ScriptRewriteAction::Keep, + "Non-RSC scripts should be kept unchanged" ); } }