support dataloader for calibration

zewenli98 · zewenli98 · commit 94757d28d746 · 2025-11-06T10:47:33.000-08:00
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -444,6 +444,9 @@ def compile(
     autocast_max_depth_of_reduction: Optional[
         int
     ] = _defaults.AUTOCAST_MAX_DEPTH_OF_REDUCTION,
+    autocast_calibration_dataloader: Optional[
+        torch.utils.data.DataLoader
+    ] = _defaults.AUTOCAST_CALIBRATION_DATALOADER,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -527,6 +530,7 @@ def compile(
         autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
         autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
         autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -655,38 +659,6 @@ def compile(
     if not isinstance(arg_inputs, collections.abc.Sequence):
         arg_inputs = [arg_inputs]  # type: ignore
 
-    # save intermediate outputs of each node for Autocast
-    autocast_intermediate_node_outputs = {}
-    if not use_explicit_typing:
-
-        class DumpInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
-            """Dump intermediate outputs of each node"""
-
-            def run_node(self, n: torch.fx.Node) -> Any:
-                if (
-                    n.op == "call_function"
-                    and n.target != torch.ops.higher_order.wrap_with_autocast
-                ):
-                    out = super().run_node(n)
-                    if not isinstance(out, torch.Tensor):
-                        raise ValueError(
-                            f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
-                        )
-                    autocast_intermediate_node_outputs[n.name] = out
-                    return out
-                return super().run_node(n)
-
-        def _materialize(x: Input | torch.Tensor) -> torch.Tensor:
-            """Materialize an Input object to a tensor"""
-            if isinstance(x, Input):
-                return x.torch_tensor
-            return x
-
-        with torch.no_grad():
-            mat_args = tuple(_materialize(a) for a in arg_inputs)
-            mat_kwargs = {k: _materialize(v) for k, v in kwarg_inputs.items()}
-            DumpInterpreter(exported_program.module()).run(*mat_args, **mat_kwargs)
-
     # Prepare torch_trt inputs
     trt_arg_inputs: Sequence[Input] = prepare_inputs(arg_inputs)
     trt_kwarg_inputs: Optional[dict[Any, Any]] = prepare_inputs(kwarg_inputs)
@@ -751,7 +723,7 @@ def _materialize(x: Input | torch.Tensor) -> torch.Tensor:
         "autocast_excluded_ops": autocast_excluded_ops,
         "autocast_data_max": autocast_data_max,
         "autocast_max_depth_of_reduction": autocast_max_depth_of_reduction,
-        "autocast_intermediate_node_outputs": autocast_intermediate_node_outputs,
+        "autocast_calibration_dataloader": autocast_calibration_dataloader,
     }
 
     settings = CompilationSettings(**compilation_options)
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -63,6 +63,7 @@
 AUTOCAST_EXCLUDED_OPS = set[torch.fx.node.Target]()
 AUTOCAST_DATA_MAX = 512
 AUTOCAST_MAX_DEPTH_OF_REDUCTION = None
+AUTOCAST_CALIBRATION_DATALOADER = None
 
 if platform.system() == "Linux":
     import pwd
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -7,6 +7,7 @@
 from torch_tensorrt._enums import EngineCapability, dtype
 from torch_tensorrt.dynamo._defaults import (
     ASSUME_DYNAMIC_SHAPE_SUPPORT,
+    AUTOCAST_CALIBRATION_DATALOADER,
     AUTOCAST_DATA_MAX,
     AUTOCAST_EXCLUDED_NODES,
     AUTOCAST_EXCLUDED_OPS,
@@ -110,7 +111,7 @@ class CompilationSettings:
         autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
         autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
         autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
-        autocast_intermediate_node_outputs (dict[str, torch.Tensor]): The intermediate node outputs of the graph. Default is {}.
+        autocast_calibration_dataloader (Optional[torch.utils.data.DataLoader]): The dataloader to use for autocast calibration. Default is None.
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -164,8 +165,8 @@ class CompilationSettings:
     )
     autocast_data_max: float = AUTOCAST_DATA_MAX
     autocast_max_depth_of_reduction: Optional[int] = AUTOCAST_MAX_DEPTH_OF_REDUCTION
-    autocast_intermediate_node_outputs: dict[str, torch.Tensor] = field(
-        default_factory=lambda: {}
+    autocast_calibration_dataloader: Optional[torch.utils.data.DataLoader] = (
+        AUTOCAST_CALIBRATION_DATALOADER
     )
 
     def __getstate__(self) -> dict[str, Any]:
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py b/py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py
@@ -138,6 +138,43 @@ def pre_export_lowering(
     logging.debug(
         f"Invoking DynamoPassManager and applying lowering passes: {ATEN_PRE_LOWERING_PASSES}"
     )
+
+    # Rule-based autocast only: run the calibration batches through the exported
+    # module and record the output of every call_function node, keyed by node name.
+    if settings.enable_autocast:
+        autocast_intermediate_node_outputs: dict[str, torch.Tensor] = {}
+
+        class IntermediateNodeTracer(torch.fx.Interpreter):  # type: ignore[misc]
+            """Interpreter that records the output of each call_function node."""
+
+            def run_node(self, n: torch.fx.Node) -> Any:
+                out = super().run_node(n)
+                if (
+                    n.op == "call_function"
+                    and n.target != torch.ops.higher_order.wrap_with_autocast
+                ):
+                    if not isinstance(out, torch.Tensor):
+                        raise ValueError(
+                            f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
+                        )
+                    if n.name in autocast_intermediate_node_outputs:
+                        # Outputs of later batches are concatenated along dim 0.
+                        autocast_intermediate_node_outputs[n.name] = torch.cat(
+                            [autocast_intermediate_node_outputs[n.name], out], dim=0
+                        )
+                    else:
+                        autocast_intermediate_node_outputs[n.name] = out
+                return out
+
+        if settings.autocast_calibration_dataloader is not None:
+            tracer = IntermediateNodeTracer(ep.module())
+            # Calibration only needs forward values; without no_grad the recorded
+            # tensors may keep their autograd graphs alive for every batch.
+            with torch.no_grad():
+                for batch in settings.autocast_calibration_dataloader:
+                    tracer.run(tuple(batch))
+        settings.autocast_intermediate_node_outputs = autocast_intermediate_node_outputs
+
     gm = ep.graph_module
     gm = ATEN_PRE_LOWERING_PASSES(gm, settings)
     return ep