
Commit 0a62149

optimize Cast insertion logic, fix io dtype issue and comments, and add tests
1 parent 4bf12e7 commit 0a62149

File tree

9 files changed: +432 -47 lines changed


core/runtime/execute_engine.cpp

Lines changed: 6 additions & 0 deletions
@@ -107,6 +107,12 @@ void setup_input_tensors(
     TORCHTRT_CHECK(
         inputs[i].is_cuda(), "Expected input tensors to have device cuda, found device " << inputs[i].device());
 
+    auto expected_type =
+        util::TRTDataTypeToScalarType(compiled_engine->exec_ctx->getEngine().getTensorDataType(name.c_str()));
+    TORCHTRT_CHECK(
+        inputs[i].dtype() == expected_type,
+        "Expected input tensors to have type " << expected_type << ", found type " << inputs[i].dtype());
+
     auto dims = core::util::toDims(inputs[i].sizes());
     auto shape = core::util::toVec(dims);
     LOG_DEBUG("Input Name: " << name << " Shape: " << dims);
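
Note: for readers following along in Python, a minimal sketch of the validation the C++ runtime now performs; validate_inputs and expected_dtypes are illustrative names, not part of the actual runtime API:

    import torch

    def validate_inputs(inputs, expected_dtypes):
        # Mirrors the TORCHTRT_CHECKs above: every input must already be a CUDA
        # tensor whose dtype matches what the engine reports for that binding.
        for i, (t, expected) in enumerate(zip(inputs, expected_dtypes)):
            if not t.is_cuda:
                raise RuntimeError(f"Expected input tensors to have device cuda, found device {t.device}")
            if t.dtype != expected:
                raise RuntimeError(f"Expected input tensors to have type {expected}, found type {t.dtype}")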

examples/dynamo/autocast_example.py

Lines changed: 21 additions & 32 deletions
@@ -3,9 +3,9 @@
 import torch_tensorrt
 
 
-class AutocastExample(nn.Module):
+class MixedPytorchAutocastModel(nn.Module):
     def __init__(self):
-        super(AutocastExample, self).__init__()
+        super(MixedPytorchAutocastModel, self).__init__()
         self.conv1 = nn.Conv2d(
             in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=1
         )
@@ -19,47 +19,36 @@ def __init__(self):
         self.flatten = nn.Flatten()
         self.fc1 = nn.Linear(16 * 8 * 8, 10)
 
-    def forward(self, x, y):
-        x = self.conv1(x)  # fp32 because of "^conv1$" in `autocast_excluded_nodes`
-        x = self.relu1(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
-        out = self.pool1(x)  # fp16
-        x = self.conv2(out)  # fp16
-        x = self.relu2(x)  # fp32 because of "relu" in `autocast_excluded_nodes`
-        x = self.pool2(x)  # fp16
-        x = self.flatten(
-            x
-        )  # fp32 because of `torch.ops.aten.flatten.using_ints` in `autocast_excluded_ops`
-        # Respect the precisions in the pytorch autocast context
-        with torch.autocast(x.device.type, enabled=True, dtype=torch.float32):
-            x = self.fc1(x)  # fp32
-        with torch.autocast(x.device.type, enabled=False):
-            x = torch.sub(x.half(), y)  # fp16
-            out2 = torch.add(x, x)  # fp16
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.relu1(x)
+        x = self.pool1(x)
+        x = self.conv2(x)
+        x = self.relu2(x)
+        x = self.pool2(x)
+        x = self.flatten(x)
         with torch.autocast(x.device.type, enabled=True, dtype=torch.float16):
-            out2 = torch.log(
-                out2
-            )  # fp32 because Pytorch Autocast requires `log` to be in fp32
-        return x, out, out2
+            x = self.fc1(x)
+            out = torch.log(
+                torch.abs(x) + 1
+            )  # log is fp32 due to Pytorch Autocast requirements
+        return out
 
 
 if __name__ == "__main__":
-    model = AutocastExample().cuda().eval()
-    inputs = (
-        torch.randn((1, 3, 32, 32), dtype=torch.float32, device="cuda"),
-        torch.randn((1,), dtype=torch.float16, device="cuda"),
-    )
+    model = MixedPytorchAutocastModel().cuda().eval()
+    inputs = (torch.randn((8, 3, 32, 32), dtype=torch.float32, device="cuda"),)
+    ep = torch.export.export(model, inputs)
     calibration_dataloader = torch.utils.data.DataLoader(
-        torch.utils.data.TensorDataset(*inputs), batch_size=1, shuffle=False
+        torch.utils.data.TensorDataset(*inputs), batch_size=2, shuffle=False
     )
 
-    ep = torch.export.export(model, inputs)
-
     with torch_tensorrt.dynamo.Debugger(
         "graphs",
         logging_dir=".",
         engine_builder_monitor=False,
     ):
-        trt_mod = torch_tensorrt.compile(
+        trt_autocast_mod = torch_tensorrt.compile(
             ep.module(),
             arg_inputs=inputs,
             min_block_size=1,
@@ -78,4 +67,4 @@ def forward(self, x, y):
             autocast_calibration_dataloader=calibration_dataloader,
         )
 
-    trt_out = trt_mod(*inputs)
+    autocast_outs = trt_autocast_mod(*inputs)
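
Note: a quick, hedged way to sanity-check the compiled module against eager execution; the tolerances below are illustrative, and fp16 kernels warrant loose bounds:

    import torch

    with torch.no_grad():
        eager_outs = model(*inputs)
    # The TRT module should track the eager outputs within fp16-level error.
    torch.testing.assert_close(autocast_outs, eager_outs, rtol=5e-2, atol=5e-2)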

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 1 addition & 1 deletion
@@ -23,8 +23,8 @@
 
 pre_lowering_pass_list = [
     remove_detach,
+    remove_assert_nodes,
     rule_based_autocast,
-    remove_assert_nodes,  # rule_based_autocast might insert assert nodes
 ]
 
 post_lowering_pass_list = [
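
Note: pre-lowering passes run in list order, so this swap means assert nodes are stripped before rule_based_autocast classifies the graph and inserts casts. A simplified sketch of the dispatch; the real pass manager also threads compilation settings through:

    import torch

    def run_pre_lowering(gm: torch.fx.GraphModule, passes) -> torch.fx.GraphModule:
        # Each pass rewrites and returns the GraphModule; ordering is significant.
        for lowering_pass in passes:
            gm = lowering_pass(gm)
        return gm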

py/torch_tensorrt/dynamo/lowering/passes/nodeclassifier.py

Lines changed: 7 additions & 3 deletions
@@ -54,9 +54,13 @@ def __init__(self, disabled_node_name_regex):
 
     def _check_inner(self, node):
         stack = node.meta.get("nn_module_stack")
-        node_name = next(reversed(stack), "").split("__")[
-            -1
-        ]  # get the user specified name of the node
+        try:
+            # get the user specified name of the node
+            node_name = stack.get(next(reversed(stack)), [""])[0]
+        except Exception as e:
+            raise ValueError(
+                f"Failed to get the user specified name of the node {node.name} because {e}. Please file a bug with Torch-TensorRT."
+            )
         return any(
             re.match(regex, node_name) for regex in self.disabled_node_name_regex
         )
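
Note: nn_module_stack maps generated keys to (qualified_name, module_type) pairs, with the innermost module last, which is why the new lookup takes the first tuple element rather than splitting the key. A hedged illustration; the sample stack below is made up:

    stack = {
        "L__self__": ("", "MyModel"),
        "L__self___conv1": ("conv1", "torch.nn.Conv2d"),
    }
    # The innermost entry's qualified name is the user-visible node name
    # matched against disabled_node_name_regex.
    node_name = stack.get(next(reversed(stack)), [""])[0]
    assert node_name == "conv1"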

py/torch_tensorrt/dynamo/lowering/passes/pass_utils.py

Lines changed: 1 addition & 3 deletions
@@ -93,9 +93,7 @@ def run_node(self, n: torch.fx.Node) -> Any:
         out = super().run_node(n)
         if n.op == "call_function" and n.target not in excluded_ops:
             if not isinstance(out, torch.Tensor):
-                raise ValueError(
-                    f"Please file a bug with Torch-TensorRT because it expects a torch.Tensor but got {type(out)} for node {n.name}."
-                )
+                return out
             if n.name in intermediate_node_outputs:
                 intermediate_node_outputs[n.name] = torch.cat(
                     [intermediate_node_outputs[n.name], out], dim=0
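
Note: graphs routinely contain call_function nodes whose outputs are not tensors (shapes, plain ints), so passing them through is safer than raising. A minimal illustration with a made-up function:

    import torch

    def f(x):
        n = x.shape[0]  # traces to call_function nodes returning a Size and an int
        return x.reshape(n, -1)

    gm = torch.fx.symbolic_trace(f)
    # Interpreter.run_node yields non-Tensor values for the shape nodes;
    # the calibration hook above now skips those instead of erroring.
    out = torch.fx.Interpreter(gm).run(torch.randn(2, 3, 4))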

py/torch_tensorrt/dynamo/lowering/passes/rule_based_autocast.py

Lines changed: 14 additions & 8 deletions
@@ -66,15 +66,19 @@ def _cast_all_tensor_args_to_dtype(
     """
     if isinstance(arg, torch.fx.Node) and is_tensor_node(arg):
         val = arg.meta.get("val", None)
-        with gm.graph.inserting_before(node):
-            cast = gm.graph.call_function(
-                torch.ops.aten.to.dtype, args=(arg, dtype)
-            )
-
         if isinstance(val, torch.Tensor):
-            arg.meta["val"] = val.to(dtype)
-        cast.meta.update(arg.meta)
-        return cast
+            if val.dtype == dtype:
+                return arg
+            else:
+                with gm.graph.inserting_before(node):
+                    cast = gm.graph.call_function(
+                        torch.ops.aten.to.dtype, args=(arg, dtype)
+                    )
+                # copy the meta of the original tensor to the casted tensor
+                cast.meta.update(arg.meta)
+                # update the dtype of the casted tensor
+                cast.meta["val"] = cast.meta["val"].to(dtype)
+                return cast
     elif isinstance(arg, (tuple, list)):
         return type(arg)(
             _cast_all_tensor_args_to_dtype(node, a, dtype) for a in arg
@@ -102,13 +106,15 @@ def _cast_all_tensor_args_to_dtype(
             node.kwargs = _cast_all_tensor_args_to_dtype(
                 node, node.kwargs, autocast_low_precision_type
             )
+            node.meta["val"] = node.meta["val"].to(autocast_low_precision_type)
         elif node.name in high_precision_nodes:
             node.args = _cast_all_tensor_args_to_dtype(
                 node, node.args, autocast_high_precision_type
             )
             node.kwargs = _cast_all_tensor_args_to_dtype(
                 node, node.kwargs, autocast_high_precision_type
             )
+            node.meta["val"] = node.meta["val"].to(autocast_high_precision_type)
 
     gm = clean_up_graph_after_modifications(gm)
     logger.debug("Graph after Autocast based on the rules:\n%s", gm.graph)
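
Note: the rewrite only emits aten.to.dtype when the argument's faked value actually has a different dtype, so already-matching args no longer get no-op cast nodes, and a real cast's meta["val"] now reflects the post-cast dtype. Eager semantics show why the skipped node was pure overhead:

    import torch

    x = torch.randn(4, dtype=torch.float16)
    # .to() with a matching dtype returns the same tensor object;
    # the graph equivalent was a dead aten.to.dtype node.
    assert x.to(torch.float16) is x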

py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py

Lines changed: 4 additions & 0 deletions
@@ -154,6 +154,10 @@ def forward(
                     + contiguous_inputs[i + 1 :]
                 )
 
+            assert (
+                contiguous_inputs[i].dtype == inputs[i].dtype
+            ), f"Dtype mismatch for {i}th input. Expect {inputs[i].dtype}, got {contiguous_inputs[i].dtype}."
+
             if need_cudagraphs_record:
                 # If cudagraphs is enabled, this memory is reserved for future cudagraph runs
                 # Clone is required to avoid re-using user-provided GPU memory
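
Note: under CUDA Graphs, inputs are copied into static buffers recorded at capture time, and copy_ converts dtypes silently rather than erroring, so a mismatch would replay with quietly degraded precision. A small sketch of the hazard the assert catches; the buffers are illustrative:

    import torch

    static_input = torch.empty(4, dtype=torch.float16, device="cuda")  # captured buffer
    fresh_input = torch.randn(4, dtype=torch.float32, device="cuda")
    static_input.copy_(fresh_input)  # silent downcast, no error raised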

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 9 additions & 0 deletions
@@ -275,6 +275,11 @@ def setup_engine(self) -> None:
             len(self.input_names) + len(self.output_names)
         )
 
+        self.input_dtypes = [
+            dtype._from(self.engine.get_tensor_dtype(input_name))
+            for input_name in self.input_names
+        ]
+
         self.input_shapes = [
             self.engine.get_tensor_shape(input_name) for input_name in self.input_names
         ]
@@ -367,6 +372,10 @@ def setup_input_tensors(
                     + contiguous_inputs[i + 1 :]
                 )
 
+            assert (
+                contiguous_inputs[i].dtype == self.input_dtypes[i]
+            ), f"Dtype mismatch for {i}th input({input_name}). Expect {self.input_dtypes[i]}, got {contiguous_inputs[i].dtype}."
+
             if need_cudagraphs_record:
                 # If cudagraphs is enabled, this memory is reserved for future cudagraph runs
                 # Clone is required to avoid re-using user-provided GPU memory
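
Note: querying the engine once in setup_engine keeps the per-call assert cheap. A hedged standalone sketch of the same query, using the TensorRT tensor API already used above; engine_input_dtypes is an illustrative helper, not part of the module:

    import tensorrt as trt
    from torch_tensorrt import dtype

    def engine_input_dtypes(engine: trt.ICudaEngine, input_names):
        # Same calls as setup_engine: map each named input binding to a
        # torch_tensorrt.dtype via the engine's reported tensor dtype.
        return [dtype._from(engine.get_tensor_dtype(name)) for name in input_names]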
