Commit 4bf12e7

fix comments

1 parent 94757d2 commit 4bf12e7

8 files changed: +136 −99 lines changed

examples/dynamo/autocast_example.py

Lines changed: 11 additions & 5 deletions

```diff
@@ -31,12 +31,14 @@ def forward(self, x, y):
         )  # fp32 because of `torch.ops.aten.flatten.using_ints` in `autocast_excluded_ops`
         # Respect the precisions in the PyTorch autocast context
         with torch.autocast(x.device.type, enabled=True, dtype=torch.float32):
-            x = self.fc1(x)
+            x = self.fc1(x)  # fp32
         with torch.autocast(x.device.type, enabled=False):
-            x = torch.sub(x.half(), y)
-            out2 = torch.add(x, x)
+            x = torch.sub(x.half(), y)  # fp16
+            out2 = torch.add(x, x)  # fp16
         with torch.autocast(x.device.type, enabled=True, dtype=torch.float16):
-            out2 = torch.log(out2)
+            out2 = torch.log(
+                out2
+            )  # fp32 because PyTorch Autocast requires `log` to be in fp32
         return x, out, out2
 
 
@@ -46,6 +48,9 @@ def forward(self, x, y):
     torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),
     torch.randn((1,), dtype=torch.float16, device="cuda"),
 )
+calibration_dataloader = torch.utils.data.DataLoader(
+    torch.utils.data.TensorDataset(*inputs), batch_size=1, shuffle=False
+)
 
 ep = torch.export.export(model, inputs)
 
@@ -68,8 +73,9 @@ def forward(self, x, y):
     autocast_low_precision_type=torch.float16,
     autocast_excluded_nodes={"^conv1$", "relu"},
     autocast_excluded_ops={torch.ops.aten.flatten.using_ints},
-    autocast_data_max=512,
+    autocast_max_output_threshold=512,
     autocast_max_depth_of_reduction=None,
+    autocast_calibration_dataloader=calibration_dataloader,
 )
 
 trt_out = trt_mod(*inputs)
```
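Taken together, the example now feeds a calibration dataloader into compilation under the renamed option. Below is a minimal end-to-end sketch of the updated flow, assuming `torch_tensorrt.dynamo.compile` as the entry point and a hypothetical `MyAutocastModel` standing in for the example's module (which this diff does not show):

```python
import torch
import torch_tensorrt

# Hypothetical model matching the example's input shapes; the real module
# definition is not part of this diff.
model = MyAutocastModel().eval().cuda()

inputs = (
    torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),
    torch.randn((1,), dtype=torch.float16, device="cuda"),
)
# Wrap the sample inputs so the rule-based autocast pass can replay them.
calibration_dataloader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(*inputs), batch_size=1, shuffle=False
)

ep = torch.export.export(model, inputs)

trt_mod = torch_tensorrt.dynamo.compile(
    ep,
    inputs,
    enable_autocast=True,
    autocast_low_precision_type=torch.float16,
    autocast_excluded_nodes={"^conv1$", "relu"},
    autocast_excluded_ops={torch.ops.aten.flatten.using_ints},
    autocast_max_output_threshold=512,  # renamed from autocast_data_max
    autocast_max_depth_of_reduction=None,
    autocast_calibration_dataloader=calibration_dataloader,
)
trt_out = trt_mod(*inputs)
```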

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -440,7 +440,7 @@ def compile(
     ] = _defaults.AUTOCAST_LOW_PRECISION_TYPE,
     autocast_excluded_nodes: Collection[str] = _defaults.AUTOCAST_EXCLUDED_NODES,
     autocast_excluded_ops: Collection[Target] = _defaults.AUTOCAST_EXCLUDED_OPS,
-    autocast_data_max: float = _defaults.AUTOCAST_DATA_MAX,
+    autocast_max_output_threshold: float = _defaults.AUTOCAST_MAX_OUTPUT_THRESHOLD,
     autocast_max_depth_of_reduction: Optional[
         int
     ] = _defaults.AUTOCAST_MAX_DEPTH_OF_REDUCTION,
@@ -526,10 +526,10 @@ def compile(
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in the distributed model
         enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
         autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
-        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match user-specified node names that should remain in FP32. Default is [].
         autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
-        autocast_data_max (float): Maximum absolute value for node outputs; nodes with outputs greater than this value will remain in FP32. Default is 512.
-        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_max_output_threshold (float): Maximum absolute value for node outputs; nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. This helps prevent excessive accuracy loss in operations particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in low-precision formats. If not provided, infinity will be used. Default is None.
         autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
         **kwargs: Any,
     Returns:
@@ -721,7 +721,7 @@ def compile(
         "autocast_low_precision_type": autocast_low_precision_type,
         "autocast_excluded_nodes": autocast_excluded_nodes,
         "autocast_excluded_ops": autocast_excluded_ops,
-        "autocast_data_max": autocast_data_max,
+        "autocast_max_output_threshold": autocast_max_output_threshold,
         "autocast_max_depth_of_reduction": autocast_max_depth_of_reduction,
         "autocast_calibration_dataloader": autocast_calibration_dataloader,
     }
```

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -61,7 +61,7 @@
 AUTOCAST_LOW_PRECISION_TYPE = None
 AUTOCAST_EXCLUDED_NODES = set[str]()
 AUTOCAST_EXCLUDED_OPS = set[torch.fx.node.Target]()
-AUTOCAST_DATA_MAX = 512
+AUTOCAST_MAX_OUTPUT_THRESHOLD = 512
 AUTOCAST_MAX_DEPTH_OF_REDUCTION = None
 AUTOCAST_CALIBRATION_DATALOADER = None
 
```

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 5 additions & 5 deletions

```diff
@@ -8,11 +8,11 @@
 from torch_tensorrt.dynamo._defaults import (
     ASSUME_DYNAMIC_SHAPE_SUPPORT,
     AUTOCAST_CALIBRATION_DATALOADER,
-    AUTOCAST_DATA_MAX,
     AUTOCAST_EXCLUDED_NODES,
     AUTOCAST_EXCLUDED_OPS,
     AUTOCAST_LOW_PRECISION_TYPE,
     AUTOCAST_MAX_DEPTH_OF_REDUCTION,
+    AUTOCAST_MAX_OUTPUT_THRESHOLD,
     CACHE_BUILT_ENGINES,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
@@ -107,10 +107,10 @@ class CompilationSettings:
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in the distributed model
         enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
         autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
-        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match user-specified node names that should remain in FP32. Default is [].
         autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
-        autocast_data_max (float): Maximum absolute value for node outputs; nodes with outputs greater than this value will remain in FP32. Default is 512.
-        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_max_output_threshold (float): Maximum absolute value for node outputs; nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. This helps prevent excessive accuracy loss in operations particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in low-precision formats. If not provided, infinity will be used. Default is None.
         autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
     """
 
@@ -163,7 +163,7 @@ class CompilationSettings:
     autocast_excluded_ops: Collection[Target] = field(
         default_factory=lambda: AUTOCAST_EXCLUDED_OPS
     )
-    autocast_data_max: float = AUTOCAST_DATA_MAX
+    autocast_max_output_threshold: float = AUTOCAST_MAX_OUTPUT_THRESHOLD
     autocast_max_depth_of_reduction: Optional[int] = AUTOCAST_MAX_DEPTH_OF_REDUCTION
     autocast_calibration_dataloader: Optional[torch.utils.data.DataLoader] = (
         AUTOCAST_CALIBRATION_DATALOADER
```
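For reference, a small sketch of the renamed dataclass field. Constructing `CompilationSettings` directly like this is only for illustration (the compiler normally builds it from `compile()` kwargs), but it shows which knob callers touch after this rename:

```python
from torch_tensorrt.dynamo._settings import CompilationSettings

# All other fields keep their _defaults values; only the renamed knob is set.
settings = CompilationSettings(
    enable_autocast=True,
    autocast_max_output_threshold=512.0,  # formerly autocast_data_max
)
assert settings.autocast_max_output_threshold == 512.0
```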

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 9 additions & 27 deletions

```diff
@@ -1,9 +1,13 @@
 import logging
+import operator
 from typing import Any, Callable, Optional, Sequence, Union
 
 import torch
 from torch_tensorrt._utils import is_tegra_platform
 from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
+    trace_intermediate_node_outputs,
+)
 
 from .complex_graph_rewrite import complex_graph_detection
 from .constant_folding import constant_fold
@@ -141,33 +145,11 @@ def pre_export_lowering(
 
     # Only for rule-based autocast to collect the intermediate node outputs
     if settings.enable_autocast:
-        autocast_intermediate_node_outputs: dict[str, torch.Tensor] = {}
-
-        class IntermediateNodeTracer(torch.fx.Interpreter):  # type: ignore[misc]
-            def run_node(self, n: torch.fx.Node) -> Any:
-                out = super().run_node(n)
-                if (
-                    n.op == "call_function"
-                    and n.target != torch.ops.higher_order.wrap_with_autocast
-                ):
-                    if not isinstance(out, torch.Tensor):
-                        raise ValueError(
-                            f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
-                        )
-                    if n.name in autocast_intermediate_node_outputs:
-                        autocast_intermediate_node_outputs[n.name] = torch.cat(
-                            [autocast_intermediate_node_outputs[n.name], out], dim=0
-                        )
-                    else:
-                        autocast_intermediate_node_outputs[n.name] = out
-                return out
-
-        if settings.autocast_calibration_dataloader is not None:
-            tracer = IntermediateNodeTracer(ep.module())
-            for batch in settings.autocast_calibration_dataloader:
-                tracer.run(tuple(batch))
-            settings.autocast_intermediate_node_outputs = autocast_intermediate_node_outputs
-
+        settings.autocast_intermediate_node_outputs = trace_intermediate_node_outputs(
+            ep.module(),
+            settings.autocast_calibration_dataloader,
+            [torch.ops.higher_order.wrap_with_autocast, operator.getitem],
+        )
     gm = ep.graph_module
     gm = ATEN_PRE_LOWERING_PASSES(gm, settings)
     return ep
```

py/torch_tensorrt/dynamo/lowering/passes/nodeclassifier.py

Lines changed: 20 additions & 15 deletions

```diff
@@ -29,7 +29,7 @@ def check(self, node):
         """Check if a node should be skipped based on the rule.
 
         Args:
-            node: The ONNX node to check.
+            node: The torch.fx.Node to check.
 
         Returns:
             bool: True if the node should be kept in high precision, False otherwise.
@@ -42,13 +42,13 @@ def check(self, node):
 
 
 class DisabledNodeNameRegexRule(NodeRuleBase):
-    """Rule for keeping nodes with matching names in high precision."""
+    """Rule for keeping nodes with matching user-specified names in high precision."""
 
     def __init__(self, disabled_node_name_regex):
         """Initialize the rule.
 
         Args:
-            disabled_node_name_regex: List of regex patterns for node names to keep in high precision.
+            disabled_node_name_regex: List of regex patterns for user-specified node names to keep in high precision.
         """
         self.disabled_node_name_regex = disabled_node_name_regex
 
@@ -63,13 +63,13 @@ def _check_inner(self, node):
 
 
 class DisabledOpTypes(NodeRuleBase):
-    """Rule for keeping nodes with specific operation types in high precision."""
+    """Rule for keeping nodes with specific ATen ops in high precision."""
 
     def __init__(self, excluded_ops):
         """Initialize the rule.
 
         Args:
-            excluded_ops: List of operation types to keep in high precision.
+            excluded_ops: List of ATen ops that should remain in FP32.
         """
         self.excluded_ops = excluded_ops
 
@@ -80,14 +80,14 @@ def _check_inner(self, node):
 class IORangeRule(NodeRuleBase):
     """Rule for keeping nodes with out-of-range inputs/outputs in high precision."""
 
-    def __init__(self, data_max, reference_data):
+    def __init__(self, max_output_threshold, reference_data):
         """Initialize the rule.
 
         Args:
-            data_max: Maximum absolute value allowed for node I/O.
+            max_output_threshold: Maximum absolute value allowed for node I/O.
             reference_data: Reference data for checking I/O ranges.
         """
-        self.data_max = data_max
+        self.max_output_threshold = max_output_threshold
         self.reference_data = reference_data
         self.output_data = None
 
@@ -108,7 +108,7 @@ def is_io_out_of_range(node):
             logger.debug(
                 f"Node {node.name}: reference data: min={ref_data.min()}, max={ref_data.max()}"
             )
-            if torch.any(torch.abs(ref_data) > self.data_max):
+            if torch.any(torch.abs(ref_data) > self.max_output_threshold):
                 self.output_data = ref_data
                 return True
 
@@ -126,14 +126,17 @@ def _log_skipped(self, node, **kwargs):
         if self.output_data is not None:
             logger.info(
                 f"Skipping node {node.name}: reference IO out of range: min={torch.min(self.output_data)}, "
-                f"max={torch.max(self.output_data)}, range=[{-self.data_max}, {self.data_max}]"
+                f"max={torch.max(self.output_data)}, range=[{-self.max_output_threshold}, {self.max_output_threshold}]"
             )
         else:
             super()._log_skipped(node, **kwargs)
 
 
 class DepthOfReductionRule(NodeRuleBase):
-    """Rule for keeping nodes with high depth of reduction in high precision."""
+    """
+    Rule for keeping nodes with a high depth of reduction in high precision. This helps prevent excessive accuracy loss in operations particularly sensitive to reduced precision, as higher-depth reductions may amplify computation errors in low-precision formats.
+    Reduction ops are those that aggregate data across one or more axes, decreasing the dimensionality of the input tensor, such as convolution and GEMM.
+    """
 
     def __init__(self, max_depth_of_reduction, reference_data):
         """Initialize the rule.
@@ -226,7 +229,7 @@ def __init__(
         excluded_nodes: Collection[str] | None = None,
         excluded_ops: Collection[torch.fx.node.Target] | None = None,
         custom_rule: NodeRuleBase | None = None,
-        data_max: float | None = 1000.0,
+        max_output_threshold: float | None = 512,
         max_depth_of_reduction: int | None = None,
     ):
         """Initialize the node classifier.
@@ -236,14 +239,14 @@
             nodes_to_exclude: Collection of regex patterns for node names to keep in high precision.
             targets_to_exclude: Collection of targets to keep in high precision.
             custom_rule: Optional custom classification rule.
-            data_max: Maximum absolute value allowed for node I/O.
+            max_output_threshold: Maximum absolute value allowed for node I/O.
             max_depth_of_reduction: Maximum depth of reduction allowed in low precision.
         """
         self.nodes = nodes
         self.excluded_nodes = excluded_nodes
         self.excluded_ops = excluded_ops
         self.custom_rule = custom_rule
-        self.data_max = data_max
+        self.max_output_threshold = max_output_threshold
         self.max_depth_of_reduction = max_depth_of_reduction
 
     def _gen_block_node_rules(self, reference_data):
@@ -261,7 +264,9 @@ def _gen_block_node_rules(self, reference_data):
         if self.excluded_ops:
             block_node_rules.append(DisabledOpTypes(self.excluded_ops))
         if reference_data:
-            block_node_rules.append(IORangeRule(self.data_max, reference_data))
+            block_node_rules.append(
+                IORangeRule(self.max_output_threshold, reference_data)
+            )
         if self.max_depth_of_reduction is not None:
             block_node_rules.append(
                 DepthOfReductionRule(
```
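The `custom_rule` parameter suggests user-defined rules can plug into the classifier alongside the built-in ones. A hypothetical sketch following the `_check_inner` pattern visible above; the class name `NodeClassifier`, the import path, and any `NodeRuleBase` contract beyond `check()`/`_check_inner()` are assumptions:

```python
import torch
from torch_tensorrt.dynamo.lowering.passes.nodeclassifier import (  # assumed import path
    NodeClassifier,
    NodeRuleBase,
)


class DisabledSoftmaxRule(NodeRuleBase):
    """Hypothetical custom rule: keep softmax nodes in high precision."""

    def _check_inner(self, node):
        # Returning True keeps the node in FP32.
        return node.target == torch.ops.aten._softmax.default


# `graph_nodes` is assumed to be the list of torch.fx.Node objects to classify.
classifier = NodeClassifier(
    graph_nodes,
    custom_rule=DisabledSoftmaxRule(),
    max_output_threshold=512,
)
```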

py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py

Lines changed: 42 additions & 1 deletion

```diff
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Any, Dict, List, Sequence
 
 import torch
 
@@ -68,3 +68,44 @@ def is_node_complex(node: torch.fx.Node, complexNodes):
             complexNodes[node.name] = True
             return True
     return False
+
+
+def trace_intermediate_node_outputs(
+    gm: torch.fx.GraphModule,
+    calibration_dataloader: torch.utils.data.DataLoader,
+    excluded_ops: Sequence[torch.fx.node.Target] = [],
+) -> Dict[str, torch.Tensor]:
+    """Trace the intermediate node outputs of a graph module.
+
+    Args:
+        gm (torch.fx.GraphModule): The graph module whose intermediate node outputs are traced.
+        calibration_dataloader (torch.utils.data.DataLoader): The dataloader to use for tracing.
+        excluded_ops (Sequence[torch.fx.node.Target]): The ops that should be excluded from the trace. For example, `[torch.ops.higher_order.wrap_with_autocast, operator.getitem]`. Default is an empty sequence.
+
+    Returns:
+        Dict[str, torch.Tensor]: A dictionary of intermediate node outputs. The key is the node name and the value is the tensor.
+    """
+
+    intermediate_node_outputs: Dict[str, torch.Tensor] = {}
+
+    class IntermediateNodeTracer(torch.fx.Interpreter):  # type: ignore[misc]
+        def run_node(self, n: torch.fx.Node) -> Any:
+            out = super().run_node(n)
+            if n.op == "call_function" and n.target not in excluded_ops:
+                if not isinstance(out, torch.Tensor):
+                    raise ValueError(
+                        f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
+                    )
+                if n.name in intermediate_node_outputs:
+                    intermediate_node_outputs[n.name] = torch.cat(
+                        [intermediate_node_outputs[n.name], out], dim=0
+                    )
+                else:
+                    intermediate_node_outputs[n.name] = out
+            return out
+
+    if calibration_dataloader is not None:
+        tracer = IntermediateNodeTracer(gm)
+        for batch in calibration_dataloader:
+            tracer.run(tuple(batch))
+    return intermediate_node_outputs
```
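A usage sketch of the extracted helper, mirroring the call site in `_aten_lowering_pass.py`; `ep` and `calibration_dataloader` are assumed to be an exported program and a dataloader whose batches match the module's input signature:

```python
import operator

import torch
from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
    trace_intermediate_node_outputs,
)

# Collect per-node reference outputs, skipping autocast wrappers and getitem
# nodes; batches from the dataloader are concatenated along dim 0 per node.
outputs = trace_intermediate_node_outputs(
    ep.module(),
    calibration_dataloader,
    [torch.ops.higher_order.wrap_with_autocast, operator.getitem],
)
for name, tensor in outputs.items():
    print(f"{name}: shape={tuple(tensor.shape)}, max_abs={tensor.abs().max():.3f}")
```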
