
Commit f7d8068

add arg enable_autocast

1 parent f6c7c7c

7 files changed: 33 additions & 12 deletions

examples/dynamo/autocast_example.py

Lines changed: 6 additions & 1 deletion

@@ -85,9 +85,14 @@ def forward(self, x):
 trt_mod = torch_tensorrt.compile(
     ep.module(),
     arg_inputs=inputs,
-    use_explicit_typing=False,
     min_block_size=1,
     use_python_runtime=True,
+    ##### weak typing #####
+    # use_explicit_typing=False,
+    # enabled_precisions={torch.float16},
+    ##### strong typing + autocast #####
+    use_explicit_typing=True,
+    enable_autocast=True,
     low_precision_type=torch.float16,
     # nodes_to_exclude={"^conv2d$"},
     targets_to_exclude={},
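
For context, a minimal end-to-end sketch of the new path. The ToyModel, its shapes, and the export step are illustrative assumptions; only the compile() keywords come from this commit:

import torch
import torch_tensorrt


class ToyModel(torch.nn.Module):  # hypothetical stand-in for the example's model
    def __init__(self) -> None:
        super().__init__()
        self.conv2d = torch.nn.Conv2d(3, 8, 3)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.conv2d(x))


inputs = [torch.randn(1, 3, 32, 32).cuda()]
ep = torch.export.export(ToyModel().eval().cuda(), (inputs[0],))

# Strong typing + autocast: enable_autocast forces use_explicit_typing=True
# inside compile(), and low_precision_type selects the reduced precision.
trt_mod = torch_tensorrt.compile(
    ep.module(),
    arg_inputs=inputs,
    min_block_size=1,
    use_python_runtime=True,
    enable_autocast=True,
    low_precision_type=torch.float16,
)
out = trt_mod(*inputs)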

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 9 additions & 2 deletions

@@ -141,7 +141,7 @@ def cross_compile_for_windows(
         disable_tf32 (bool): Force FP32 layers to use traditional as FP32 format vs the default behavior of rounding the inputs to 10-bit mantissas before multiplying, but accumulates the sum using 23-bit mantissas
         assume_dynamic_shape_support (bool): Setting this to true enables the converters work for both dynamic and static shapes. Default: False
         sparse_weights (bool): Enable sparsity for convolution and fully connected layers.
-        enabled_precision (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
+        enabled_precisions (Set(Union(torch.dtype, torch_tensorrt.dtype))): The set of datatypes that TensorRT can use when selecting kernels
         capability (torch_tensorrt.EngineCapability): Restrict kernel selection to safe gpu kernels or safe dla kernels
         num_avg_timing_iters (int): Number of averaging timing iterations used to select kernels
         workspace_size (int): Maximum size of workspace given to TensorRT

@@ -434,6 +434,7 @@ def compile(
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
+    enable_autocast: bool = _defaults.ENABLE_AUTOCAST,
     low_precision_type: Optional[
         Union[torch.dtype, dtype]
     ] = _defaults.LOW_PRECISION_TYPE,

@@ -518,6 +519,7 @@ def compile(
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
         low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
         nodes_to_exclude (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
         targets_to_exclude (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].

@@ -596,6 +598,10 @@ def compile(
             "\nThis feature is unimplemented in Torch-TRT Dynamo currently."
         )
 
+    if enable_autocast:
+        use_explicit_typing = True
+        logger.debug("Autocast is enabled, setting use_explicit_typing to True.")
+
     if use_explicit_typing:
         if len(enabled_precisions) != 1 or not any(
             x in enabled_precisions

@@ -608,7 +614,7 @@ def compile(
     if low_precision_type is not None:
         if not isinstance(low_precision_type, (torch.dtype, dtype)):
             raise ValueError(
-                f"low_precision_type must be a torch.dtype or dtype, got {type(low_precision_type)}"
+                f"low_precision_type must be a torch.dtype or torch_tensorrt._enums.dtype, got {type(low_precision_type)}"
             )
         if low_precision_type not in {
             torch.float16,

@@ -737,6 +743,7 @@ def _materialize(x: Input | torch.Tensor) -> torch.Tensor:
         "l2_limit_for_tiling": l2_limit_for_tiling,
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
+        "enable_autocast": enable_autocast,
         "low_precision_type": low_precision_type,
         "nodes_to_exclude": nodes_to_exclude,
         "targets_to_exclude": targets_to_exclude,

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 1 addition & 0 deletions

@@ -57,6 +57,7 @@
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
+ENABLE_AUTOCAST = False
 LOW_PRECISION_TYPE = None
 NODES_TO_EXCLUDE = set[str]()
 TARGETS_TO_EXCLUDE = set[torch.fx.node.Target]()

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 4 additions & 0 deletions

@@ -14,6 +14,7 @@
     DLA_LOCAL_DRAM_SIZE,
     DLA_SRAM_SIZE,
     DRYRUN,
+    ENABLE_AUTOCAST,
     ENABLE_CROSS_COMPILE_FOR_WINDOWS,
     ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
     ENABLE_WEIGHT_STREAMING,

@@ -103,6 +104,7 @@ class CompilationSettings:
         tiling_optimization_level (str): The optimization level of tiling strategies. A higher level allows TensorRT to spend more time searching for better tiling strategy. We currently support ["none", "fast", "moderate", "full"].
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
+        enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
         low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
         nodes_to_exclude (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
         targets_to_exclude (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].

@@ -152,6 +154,7 @@ class CompilationSettings:
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
+    enable_autocast: bool = ENABLE_AUTOCAST
     low_precision_type: Optional[dtype] = LOW_PRECISION_TYPE
     nodes_to_exclude: Collection[str] = field(default_factory=lambda: NODES_TO_EXCLUDE)
     targets_to_exclude: Collection[Target] = field(

@@ -179,6 +182,7 @@ def __setstate__(self, state: dict[str, Any]) -> None:
         self.__dict__.update(state)
 
 
+# If any of the following settings is changed, the engine should be rebuilt.
 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
     "enabled_precisions",
     "max_aux_streams",

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 11 additions & 3 deletions

@@ -292,9 +292,17 @@ def _populate_trt_builder_config(
         )
 
         if not self.compilation_settings.use_explicit_typing:
-            _LOGGER.info(
-                "Torch-TensorRT uses Autocast to determine the precision of the graph, because weak typing has been deprecated in TensorRT 10.12."
-            )
+            if dtype.float16 in self.compilation_settings.enabled_precisions:
+                builder_config.set_flag(trt.BuilderFlag.FP16)
+
+            if dtype.int8 in self.compilation_settings.enabled_precisions:
+                builder_config.set_flag(trt.BuilderFlag.INT8)
+
+            if dtype.fp8 in self.compilation_settings.enabled_precisions:
+                builder_config.set_flag(trt.BuilderFlag.FP8)
+
+            if dtype.bfloat16 in self.compilation_settings.enabled_precisions:
+                builder_config.set_flag(trt.BuilderFlag.BF16)
 
         if self.compilation_settings.sparse_weights:
             builder_config.set_flag(trt.BuilderFlag.SPARSE_WEIGHTS)
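
The restored weak-typing branch is a straight precision-to-builder-flag mapping. A standalone sketch of the same logic, with builder and config setup elided and the enum members taken from the hunk above:

import tensorrt as trt
from torch_tensorrt._enums import dtype

# One TensorRT builder flag per allowed reduced precision.
_PRECISION_FLAGS = {
    dtype.float16: trt.BuilderFlag.FP16,
    dtype.int8: trt.BuilderFlag.INT8,
    dtype.fp8: trt.BuilderFlag.FP8,
    dtype.bfloat16: trt.BuilderFlag.BF16,
}


def apply_precision_flags(
    builder_config: trt.IBuilderConfig, enabled_precisions: set
) -> None:
    # Only meaningful under weak typing; with use_explicit_typing the
    # network's explicit tensor dtypes drive kernel selection instead.
    for precision, flag in _PRECISION_FLAGS.items():
        if precision in enabled_precisions:
            builder_config.set_flag(flag)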

py/torch_tensorrt/dynamo/lowering/passes/rule_based_autocast.py

Lines changed: 2 additions & 2 deletions

@@ -26,8 +26,8 @@ def rule_based_autocast(
     gm: torch.fx.GraphModule, settings: CompilationSettings
 ) -> torch.fx.GraphModule:
     """Rule-based autocast"""
-    if settings.use_explicit_typing:
-        logger.debug("Strong typing is enabled, skipping rule-based autocast.")
+    if not settings.enable_autocast:
+        logger.debug("Autocast is not enabled, skipping rule-based autocast.")
         return gm
 
     # nodes = list(gm.graph.nodes)
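
In context, the gate sits at the top of the pass. A skeletal sketch follows; the real rewrite body is elided in this diff, so everything past the guard here is a hypothetical placeholder:

import logging

import torch
from torch_tensorrt.dynamo._settings import CompilationSettings

logger = logging.getLogger(__name__)


def rule_based_autocast(
    gm: torch.fx.GraphModule, settings: CompilationSettings
) -> torch.fx.GraphModule:
    """Rule-based autocast"""
    if not settings.enable_autocast:
        logger.debug("Autocast is not enabled, skipping rule-based autocast.")
        return gm

    for node in gm.graph.nodes:
        # Hypothetical body: insert casts around nodes that are safe to run
        # in settings.low_precision_type, honoring nodes_to_exclude and
        # targets_to_exclude.
        pass

    gm.recompile()
    return gm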

py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py

Lines changed: 0 additions & 4 deletions

@@ -154,10 +154,6 @@ def forward(
                 + contiguous_inputs[i + 1 :]
             )
 
-            assert (
-                contiguous_inputs[i].dtype == inputs[i].dtype
-            ), f"Dtype mismatch for {i}th input. Expect {inputs[i].dtype}, got {contiguous_inputs[i].dtype}."
-
             if need_cudagraphs_record:
                 # If cudagraphs is enabled, this memory is reserved for future cudagraph runs
                 # Clone is required to avoid re-using user-provided GPU memory
