
Commit e15ce94

committed
Change API names and add support for user-specified node names
1 parent f7d8068 commit e15ce94

File tree

6 files changed: +111, -147 lines
Lines changed: 24 additions & 52 deletions
@@ -1,22 +1,6 @@
 import torch
 import torch.nn as nn
 import torch_tensorrt
-import torchvision
-
-
-class MyModule(torch.nn.Module):
-    def forward(self, a_float32, b_float32, c_float32, d_float32):
-        with torch.autocast(device_type="cuda"):
-            e_float16 = torch.mm(a_float32, b_float32)
-            with torch.autocast(device_type="cuda", enabled=False):
-                # Calls e_float16.float() to ensure float32 execution
-                # (necessary because e_float16 was created in an autocasted region)
-                f_float32 = torch.mm(c_float32, e_float16.float())
-
-            # No manual casts are required when re-entering the autocast-enabled region.
-            # torch.mm again runs in float16 and produces float16 output, regardless of input types.
-            g_float16 = torch.mm(d_float32, f_float32)
-        return g_float16
 
 
 class AutocastExample(nn.Module):
@@ -36,44 +20,32 @@ def __init__(self):
         self.fc1 = nn.Linear(16 * 8 * 8, 10)
 
     def forward(self, x, y):
-        out = self.pool1(self.relu1(self.conv1(x)))  # fp16
-        x = self.pool2(self.relu2(self.conv2(out)))  # fp16
-        x = self.flatten(x)
+        x = self.conv1(x)  # fp32 because of "^conv1$" in `autocast_excluded_nodes`
+        x = self.relu1(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
+        out = self.pool1(x)  # fp16
+        x = self.conv2(out)  # fp16
+        x = self.relu2(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
+        x = self.pool2(x)  # fp16
+        x = self.flatten(
+            x
+        )  # fp32 because of `torch.ops.aten.flatten.using_ints` in `autocast_excluded_ops`
+        # Respect the precisions in the pytorch autocast context
         with torch.autocast(x.device.type, enabled=True, dtype=torch.float32):
-            x = self.fc1(x)  # fp32
+            x = self.fc1(x)
         with torch.autocast(x.device.type, enabled=False):
-            x = torch.sub(x.half(), y)  # fp16
-            out2 = torch.add(x, x)  # fp16
+            x = torch.sub(x.half(), y)
+            out2 = torch.add(x, x)
         with torch.autocast(x.device.type, enabled=True, dtype=torch.float16):
-            out2 = torch.log(out2)  # fp32
+            out2 = torch.log(out2)
         return x, out, out2
 
 
-class MyResNet18Wrapper(torch.nn.Module):
-    def __init__(self, num_classes=1000, pretrained=True):
-        super(MyResNet18Wrapper, self).__init__()
-        self.resnet = torchvision.models.resnet18(
-            num_classes=num_classes, weights="IMAGENET1K_V1" if pretrained else None
-        )
-
-    def forward(self, x):
-        x = self.resnet(x)
-        return x
-
-
 if __name__ == "__main__":
-    # model = MyModule().cuda().eval()
-    # inputs = (torch.randn((8, 8), device="cuda"),
-    #           torch.randn((8, 8), device="cuda"),
-    #           torch.randn((8, 8), device="cuda"),
-    #           torch.randn((8, 8), device="cuda"),)
-
-    # model = AutocastExample().cuda().eval()
-    # inputs = (torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),
-    #           torch.randn((1,), dtype=torch.float16, device="cuda"),)
-
-    model = MyResNet18Wrapper().cuda().eval()
-    inputs = (torch.randn((1, 3, 224, 224), dtype=torch.float32, device="cuda"),)
+    model = AutocastExample().cuda().eval()
+    inputs = (
+        torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),
+        torch.randn((1,), dtype=torch.float16, device="cuda"),
+    )
 
     ep = torch.export.export(model, inputs)
 
@@ -93,11 +65,11 @@ def forward(self, x):
         ##### strong typing + autocast #####
         use_explicit_typing=True,
         enable_autocast=True,
-        low_precision_type=torch.float16,
-        # nodes_to_exclude={"^conv2d$"},
-        targets_to_exclude={},
-        data_max=512,
-        max_depth_of_reduction=None,
+        autocast_low_precision_type=torch.float16,
+        autocast_excluded_nodes={"^conv1$", "relu"},
+        autocast_excluded_ops={torch.ops.aten.flatten.using_ints},
+        autocast_data_max=512,
+        autocast_max_depth_of_reduction=None,
     )
 
     trt_out = trt_mod(*inputs)
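Note: the last hunk only shows the changed lines inside the compile call. A minimal sketch of how the renamed options fit together, continuing from the `ep` and `inputs` defined above — the call shape is reconstructed from the context lines and is an assumption; only the autocast_* keywords are confirmed by this commit:

trt_mod = torch_tensorrt.dynamo.compile(
    ep,  # the torch.export.export(...) program from above
    arg_inputs=inputs,
    use_explicit_typing=True,
    enable_autocast=True,
    autocast_low_precision_type=torch.float16,
    autocast_excluded_nodes={"^conv1$", "relu"},
    autocast_excluded_ops={torch.ops.aten.flatten.using_ints},
    autocast_data_max=512,
    autocast_max_depth_of_reduction=None,
)
trt_out = trt_mod(*inputs)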

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 27 additions & 25 deletions
@@ -435,13 +435,15 @@ def compile(
     offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     use_distributed_mode_trace: bool = _defaults.USE_DISTRIBUTED_MODE_TRACE,
     enable_autocast: bool = _defaults.ENABLE_AUTOCAST,
-    low_precision_type: Optional[
+    autocast_low_precision_type: Optional[
         Union[torch.dtype, dtype]
-    ] = _defaults.LOW_PRECISION_TYPE,
-    nodes_to_exclude: Collection[str] = _defaults.NODES_TO_EXCLUDE,
-    targets_to_exclude: Collection[Target] = _defaults.TARGETS_TO_EXCLUDE,
-    data_max: float = _defaults.DATA_MAX,
-    max_depth_of_reduction: Optional[int] = _defaults.MAX_DEPTH_OF_REDUCTION,
+    ] = _defaults.AUTOCAST_LOW_PRECISION_TYPE,
+    autocast_excluded_nodes: Collection[str] = _defaults.AUTOCAST_EXCLUDED_NODES,
+    autocast_excluded_ops: Collection[Target] = _defaults.AUTOCAST_EXCLUDED_OPS,
+    autocast_data_max: float = _defaults.AUTOCAST_DATA_MAX,
+    autocast_max_depth_of_reduction: Optional[
+        int
+    ] = _defaults.AUTOCAST_MAX_DEPTH_OF_REDUCTION,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -520,11 +522,11 @@ def compile(
         offload_module_to_cpu (bool): Offload the module to CPU. This is useful when we need to minimize GPU memory usage.
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
-        low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
-        nodes_to_exclude (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
-        targets_to_exclude (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
-        data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
-        max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
+        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
+        autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -611,17 +613,17 @@ def compile(
             f"use_explicit_typing was set to True, however found that enabled_precisions was also specified (saw: {enabled_precisions}, expected: dtype.f32, dtype.f4). enabled_precisions should not be used when use_explicit_typing=True"
         )
 
-    if low_precision_type is not None:
-        if not isinstance(low_precision_type, (torch.dtype, dtype)):
+    if autocast_low_precision_type is not None:
+        if not isinstance(autocast_low_precision_type, (torch.dtype, dtype)):
             raise ValueError(
-                f"low_precision_type must be a torch.dtype or torch_tensorrt._enums.dtype, got {type(low_precision_type)}"
+                f"autocast_low_precision_type must be a torch.dtype or torch_tensorrt._enums.dtype, got {type(autocast_low_precision_type)}"
             )
-        if low_precision_type not in {
+        if autocast_low_precision_type not in {
             torch.float16,
             torch.bfloat16,
-        } and low_precision_type not in {dtype.f16, dtype.bf16}:
+        } and autocast_low_precision_type not in {dtype.f16, dtype.bf16}:
             raise ValueError(
-                f"low_precision_type must be one of torch.float16, torch.bfloat16, dtype.f16, dtype.bf16, got {low_precision_type}"
+                f"autocast_low_precision_type must be one of torch.float16, torch.bfloat16, dtype.f16, dtype.bf16, got {autocast_low_precision_type}"
            )
 
     if use_fp32_acc:
@@ -654,7 +656,7 @@ def compile(
         arg_inputs = [arg_inputs]  # type: ignore
 
     # save intermediate outputs of each node for Autocast
-    intermediate_node_outputs = {}
+    autocast_intermediate_node_outputs = {}
    if not use_explicit_typing:
 
         class DumpInterpreter(torch.fx.Interpreter):  # type: ignore[misc]
@@ -670,7 +672,7 @@ def run_node(self, n: torch.fx.Node) -> Any:
                     raise ValueError(
                         f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
                     )
-                intermediate_node_outputs[n.name] = out
+                autocast_intermediate_node_outputs[n.name] = out
                 return out
             return super().run_node(n)
@@ -744,12 +746,12 @@ def _materialize(x: Input | torch.Tensor) -> torch.Tensor:
         "offload_module_to_cpu": offload_module_to_cpu,
         "use_distributed_mode_trace": use_distributed_mode_trace,
         "enable_autocast": enable_autocast,
-        "low_precision_type": low_precision_type,
-        "nodes_to_exclude": nodes_to_exclude,
-        "targets_to_exclude": targets_to_exclude,
-        "data_max": data_max,
-        "max_depth_of_reduction": max_depth_of_reduction,
-        "intermediate_node_outputs": intermediate_node_outputs,
+        "autocast_low_precision_type": autocast_low_precision_type,
+        "autocast_excluded_nodes": autocast_excluded_nodes,
+        "autocast_excluded_ops": autocast_excluded_ops,
+        "autocast_data_max": autocast_data_max,
+        "autocast_max_depth_of_reduction": autocast_max_depth_of_reduction,
+        "autocast_intermediate_node_outputs": autocast_intermediate_node_outputs,
     }
 
     settings = CompilationSettings(**compilation_options)
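Note: `DumpInterpreter` appears only in fragments above. As a standalone illustration of the pattern it implements — a `torch.fx.Interpreter` subclass that records each node's concrete output so the autocast classifier can later check value ranges — here is a hedged sketch; the traced module and shapes are invented for the example:

import torch

class RecordingInterpreter(torch.fx.Interpreter):
    """Record every tensor-valued node output during one interpreted run."""

    def __init__(self, gm):
        super().__init__(gm)
        self.node_outputs = {}

    def run_node(self, n):
        out = super().run_node(n)
        if isinstance(out, torch.Tensor):
            # Keyed by FX node name, mirroring the dict the diff renames.
            self.node_outputs[n.name] = out
        return out

gm = torch.fx.symbolic_trace(torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU()))
interp = RecordingInterpreter(gm)
interp.run(torch.randn(2, 4))
print(sorted(interp.node_outputs))  # node names depend on the traced graph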

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 5 additions & 5 deletions
@@ -58,11 +58,11 @@
 USE_DISTRIBUTED_MODE_TRACE = False
 OFFLOAD_MODULE_TO_CPU = False
 ENABLE_AUTOCAST = False
-LOW_PRECISION_TYPE = None
-NODES_TO_EXCLUDE = set[str]()
-TARGETS_TO_EXCLUDE = set[torch.fx.node.Target]()
-DATA_MAX = 512
-MAX_DEPTH_OF_REDUCTION = None
+AUTOCAST_LOW_PRECISION_TYPE = None
+AUTOCAST_EXCLUDED_NODES = set[str]()
+AUTOCAST_EXCLUDED_OPS = set[torch.fx.node.Target]()
+AUTOCAST_DATA_MAX = 512
+AUTOCAST_MAX_DEPTH_OF_REDUCTION = None
 
 if platform.system() == "Linux":
     import pwd
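Side note on the `set[str]()` / `set[torch.fx.node.Target]()` defaults carried over here: subscripting the builtin produces a `types.GenericAlias`, and calling it constructs a plain empty set, so the subscript only matters to type checkers. A quick demonstration:

import torch

# Same idiom as the defaults above: typed-looking, but a plain set() at runtime.
AUTOCAST_EXCLUDED_NODES = set[str]()
AUTOCAST_EXCLUDED_OPS = set[torch.fx.node.Target]()

print(AUTOCAST_EXCLUDED_NODES == set(), len(AUTOCAST_EXCLUDED_OPS))  # True 0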

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 20 additions & 18 deletions
@@ -7,8 +7,12 @@
 from torch_tensorrt._enums import EngineCapability, dtype
 from torch_tensorrt.dynamo._defaults import (
     ASSUME_DYNAMIC_SHAPE_SUPPORT,
+    AUTOCAST_DATA_MAX,
+    AUTOCAST_EXCLUDED_NODES,
+    AUTOCAST_EXCLUDED_OPS,
+    AUTOCAST_LOW_PRECISION_TYPE,
+    AUTOCAST_MAX_DEPTH_OF_REDUCTION,
     CACHE_BUILT_ENGINES,
-    DATA_MAX,
     DISABLE_TF32,
     DLA_GLOBAL_DRAM_SIZE,
     DLA_LOCAL_DRAM_SIZE,
@@ -24,11 +28,8 @@
     IMMUTABLE_WEIGHTS,
     L2_LIMIT_FOR_TILING,
     LAZY_ENGINE_INIT,
-    LOW_PRECISION_TYPE,
     MAX_AUX_STREAMS,
-    MAX_DEPTH_OF_REDUCTION,
     MIN_BLOCK_SIZE,
-    NODES_TO_EXCLUDE,
     NUM_AVG_TIMING_ITERS,
     OFFLOAD_MODULE_TO_CPU,
     OPTIMIZATION_LEVEL,
@@ -38,7 +39,6 @@
     REUSE_CACHED_ENGINES,
     SPARSE_WEIGHTS,
     STRIP_ENGINE_WEIGHTS,
-    TARGETS_TO_EXCLUDE,
     TILING_OPTIMIZATION_LEVEL,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
@@ -105,12 +105,12 @@ class CompilationSettings:
         l2_limit_for_tiling (int): The target L2 cache usage limit (in bytes) for tiling optimization (default is -1 which means no limit).
         use_distributed_mode_trace (bool): Using aot_autograd to trace the graph. This is enabled when DTensors or distributed tensors are present in distributed model
         enable_autocast (bool): Whether to enable autocast. If enabled, use_explicit_typing will be set to True.
-        low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
-        nodes_to_exclude (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
-        targets_to_exclude (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
-        data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
-        max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
-        intermediate_node_outputs (dict[str, torch.Tensor]): The intermediate node outputs of the graph. Default is {}.
+        autocast_low_precision_type (Optional[Union[torch.dtype, dtype]]): The precision to reduce to. We currently support torch.float16 and torch.bfloat16. Default is None, which means no low precision is used.
+        autocast_excluded_nodes (Collection[str]): The set of regex patterns to match node names that should remain in FP32. Default is [].
+        autocast_excluded_ops (Collection[Target]): The set of targets (ATen ops) that should remain in FP32. Default is [].
+        autocast_data_max (float): Maximum absolute value for node outputs, nodes with outputs greater than this value will remain in FP32. Default is 512.
+        autocast_max_depth_of_reduction (Optional[int]): Maximum depth of reduction allowed in low precision. Nodes with higher reduction depths will remain in FP32. If not provided, infinity will be used. Default is None.
+        autocast_intermediate_node_outputs (dict[str, torch.Tensor]): The intermediate node outputs of the graph. Default is {}.
     """
 
     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -155,14 +155,16 @@ class CompilationSettings:
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
     offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
     enable_autocast: bool = ENABLE_AUTOCAST
-    low_precision_type: Optional[dtype] = LOW_PRECISION_TYPE
-    nodes_to_exclude: Collection[str] = field(default_factory=lambda: NODES_TO_EXCLUDE)
-    targets_to_exclude: Collection[Target] = field(
-        default_factory=lambda: TARGETS_TO_EXCLUDE
+    autocast_low_precision_type: Optional[dtype] = AUTOCAST_LOW_PRECISION_TYPE
+    autocast_excluded_nodes: Collection[str] = field(
+        default_factory=lambda: AUTOCAST_EXCLUDED_NODES
     )
-    data_max: float = DATA_MAX
-    max_depth_of_reduction: Optional[int] = MAX_DEPTH_OF_REDUCTION
-    intermediate_node_outputs: dict[str, torch.Tensor] = field(
+    autocast_excluded_ops: Collection[Target] = field(
+        default_factory=lambda: AUTOCAST_EXCLUDED_OPS
+    )
+    autocast_data_max: float = AUTOCAST_DATA_MAX
+    autocast_max_depth_of_reduction: Optional[int] = AUTOCAST_MAX_DEPTH_OF_REDUCTION
+    autocast_intermediate_node_outputs: dict[str, torch.Tensor] = field(
         default_factory=lambda: {}
     )
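Note: every renamed dataclass field keeps a default, so existing `CompilationSettings(...)` call sites break only if they passed the old keyword names. A hedged construction example using the new fields (values illustrative):

from torch_tensorrt.dynamo._settings import CompilationSettings

settings = CompilationSettings(
    enable_autocast=True,
    autocast_excluded_nodes={"^conv1$", "relu"},
    autocast_data_max=512.0,
)
assert settings.autocast_max_depth_of_reduction is None  # default preserved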

py/torch_tensorrt/dynamo/lowering/passes/nodeclassifier.py

Lines changed: 18 additions & 14 deletions
@@ -53,24 +53,28 @@ def __init__(self, disabled_node_name_regex):
         self.disabled_node_name_regex = disabled_node_name_regex
 
     def _check_inner(self, node):
+        stack = node.meta.get("nn_module_stack")
+        node_name = next(reversed(stack), "").split("__")[
+            -1
+        ]  # get the user specified name of the node
         return any(
-            re.match(regex, node.name) for regex in self.disabled_node_name_regex
+            re.match(regex, node_name) for regex in self.disabled_node_name_regex
         )
 
 
-class DisabledTargets(NodeRuleBase):
+class DisabledOpTypes(NodeRuleBase):
     """Rule for keeping nodes with specific operation types in high precision."""
 
-    def __init__(self, targets_to_exclude):
+    def __init__(self, excluded_ops):
         """Initialize the rule.
 
         Args:
-            targets_to_exclude: List of operation types to keep in high precision.
+            excluded_ops: List of operation types to keep in high precision.
         """
-        self.targets_to_exclude = targets_to_exclude
+        self.excluded_ops = excluded_ops
 
     def _check_inner(self, node):
-        return node.target in self.targets_to_exclude
+        return node.target in self.excluded_ops
 
 
 class IORangeRule(NodeRuleBase):
@@ -219,8 +223,8 @@ class NodeClassifier:
     def __init__(
         self,
         nodes,
-        nodes_to_exclude: Collection[str] | None = None,
-        targets_to_exclude: Collection[torch.fx.node.Target] | None = None,
+        excluded_nodes: Collection[str] | None = None,
+        excluded_ops: Collection[torch.fx.node.Target] | None = None,
         custom_rule: NodeRuleBase | None = None,
         data_max: float | None = 1000.0,
         max_depth_of_reduction: int | None = None,
@@ -236,8 +240,8 @@ def __init__(
             max_depth_of_reduction: Maximum depth of reduction allowed in low precision.
         """
         self.nodes = nodes
-        self.nodes_to_exclude = nodes_to_exclude
-        self.targets_to_exclude = targets_to_exclude
+        self.excluded_nodes = excluded_nodes
+        self.excluded_ops = excluded_ops
         self.custom_rule = custom_rule
         self.data_max = data_max
         self.max_depth_of_reduction = max_depth_of_reduction
@@ -252,10 +256,10 @@ def _gen_block_node_rules(self, reference_data):
             list[NodeRuleBase]: List of rules to apply.
         """
         block_node_rules: list[NodeRuleBase] = []
-        if self.nodes_to_exclude:
-            block_node_rules.append(DisabledNodeNameRegexRule(self.nodes_to_exclude))
-        if self.targets_to_exclude:
-            block_node_rules.append(DisabledTargets(self.targets_to_exclude))
+        if self.excluded_nodes:
+            block_node_rules.append(DisabledNodeNameRegexRule(self.excluded_nodes))
+        if self.excluded_ops:
+            block_node_rules.append(DisabledOpTypes(self.excluded_ops))
         if reference_data:
             block_node_rules.append(IORangeRule(self.data_max, reference_data))
         if self.max_depth_of_reduction is not None:
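Note: the rewritten `_check_inner` is what makes user-specified module names matchable — it takes the innermost entry of the node's `nn_module_stack` metadata rather than FX's generated `node.name`, and keeps the text after the final `__`. A hedged illustration on a hand-built dict; the exact key mangling `torch.export` records can vary by version, so the keys below are hypothetical:

import re

# Hypothetical nn_module_stack metadata: innermost module last, with the
# user-specified attribute name ("conv1") after the final "__" in the key.
stack = {"l__self": ("", "AutocastExample"), "l__self__conv1": ("conv1", "Conv2d")}

node_name = next(reversed(stack), "").split("__")[-1]  # -> "conv1"
print(any(re.match(p, node_name) for p in {"^conv1$", "relu"}))  # True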
