
Commit e7b925a

fix TE issue that reports an UNSUPPORTED DTYPE error when calling to(bfloat16) (#910)
1 parent fb66cfa commit e7b925a

File tree

5 files changed: +163 -0 lines changed
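Before the per-file diffs, a note on what the bug looked like from the user side. The snippet below is a minimal Python sketch of the scenario named in the commit message (the module, shapes, and warm-up count are illustrative, not taken from the commit): a traced model whose forward casts a float result to bfloat16. Before this fix, the TensorExpr (NNC) fuser would pull the aten::to node into a fusion group and abort with an UNSUPPORTED DTYPE error; after it, the cast is kept outside the group.

import torch

# Illustrative module: float arithmetic followed by a bfloat16 cast.
class CastToBF16(torch.nn.Module):
    def forward(self, x):
        return (x + 1).to(torch.bfloat16)

x = torch.randn(5, 5)
with torch.no_grad():
    m = torch.jit.trace(CastToBF16().eval(), x)
    m = torch.jit.freeze(m)
    for _ in range(3):  # profiling runs give the JIT a chance to apply TE fusion
        out = m(x)
    print(m.graph_for(x))  # with the fix, the cast stays out of prim::TensorExprGroup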

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.h

Lines changed: 6 additions & 0 deletions
@@ -71,6 +71,12 @@ void insertPrePackedConvTransposeOp(std::shared_ptr<Graph>& graph);
 void fuseConvTransposeWithEltwise(std::shared_ptr<Graph>& graph);
 
 void FusedEinsumPost(std::shared_ptr<Graph>& graph);
+
+// This code will be removed after the official PyTorch NNC fully supports
+// BFloat16.
+void replaceAtenToWithIPEXTo(std::shared_ptr<Graph>& graph);
+void replaceIPEXToWithAtenTo(std::shared_ptr<Graph>& graph);
+
 } // namespace graph_rewrite
 } // namespace jit
 } // namespace torch
Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
+#include <ATen/code_template.h>
+#include "graph_rewrite.h"
+
+namespace torch {
+namespace jit {
+namespace graph_rewrite {
+
+using namespace at::jit;
+
+// This code will be removed after the official PyTorch NNC fully supports
+// BFloat16.
+
+void replaceAtenToWithIPEXTo(Block* b) {
+  for (Node* n : b->nodes()) {
+    for (Block* block : n->blocks()) {
+      replaceAtenToWithIPEXTo(block);
+    }
+    if (n->kind() == aten::to) {
+      // skip aten::to.other
+      if (n->inputs().at(1)->type()->kind() == TypeKind::TensorType) {
+        continue;
+      }
+      if (n->inputs().size() == 5 || n->inputs().size() == 4) {
+        auto const& input_dtype =
+            n->inputs().at(0)->type()->cast<TensorType>()->scalarType();
+        auto const& output_dtype =
+            n->outputs().at(0)->type()->cast<TensorType>()->scalarType();
+        if (!input_dtype || !output_dtype) {
+          continue;
+        }
+        if (!(*input_dtype == c10::ScalarType::Float &&
+              *output_dtype == c10::ScalarType::BFloat16)) {
+          continue;
+        }
+        // device check?
+        WithInsertPoint guard(n);
+        auto graph = n->owningGraph();
+        Node* ipex_to_node =
+            graph->create(Symbol::fromQualString("ipex::to_dtype"));
+        for (auto i = 0; i < n->inputs().size(); ++i) {
+          Value* v = n->inputs().at(i);
+          ipex_to_node->addInput(v);
+        }
+        graph->insertNode(ipex_to_node);
+        n->output()->replaceAllUsesWith(ipex_to_node->output());
+      } else {
+        continue;
+      }
+    }
+  }
+  EliminateDeadCode(b);
+}
+
+void replaceIPEXToWithAtenTo(Block* b) {
+  for (Node* n : b->nodes()) {
+    for (Block* block : n->blocks()) {
+      replaceIPEXToWithAtenTo(block);
+    }
+    if (n->kind() == Symbol::fromQualString("ipex::to_dtype")) {
+      WithInsertPoint guard(n);
+      auto graph = n->owningGraph();
+      Node* aten_to_node = graph->create(aten::to);
+      for (auto i = 0; i < n->inputs().size(); ++i) {
+        Value* v = n->inputs().at(i);
+        aten_to_node->addInput(v);
+      }
+      graph->insertNode(aten_to_node);
+      n->output()->replaceAllUsesWith(aten_to_node->output());
+    }
+  }
+  EliminateDeadCode(b);
+}
+
+void replaceAtenToWithIPEXTo(std::shared_ptr<Graph>& graph) {
+  replaceAtenToWithIPEXTo(graph->block());
+  EliminateDeadCode(graph);
+}
+
+void replaceIPEXToWithAtenTo(std::shared_ptr<Graph>& graph) {
+  replaceIPEXToWithAtenTo(graph->block());
+  EliminateDeadCode(graph);
+}
+
+} // namespace graph_rewrite
+} // namespace jit
+} // namespace torch
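The pass above is deliberately narrow: it only rewrites aten::to nodes whose input dtype is Float and whose output dtype is BFloat16, only for the dtype overloads (4 or 5 inputs), and it skips aten::to.other (tensor second argument). Renaming the node to ipex::to_dtype hides the cast from NNC because the fuser only claims operators it recognizes; replaceIPEXToWithAtenTo then restores the original node so downstream passes see standard IR. A Python-side sketch of the overload the pass targets (shapes illustrative):

import torch

def f(x):
    return x.to(torch.bfloat16)

g = torch.jit.trace(f, torch.randn(2, 2)).graph
# Tracing produces the 5-input aten::to.dtype overload:
# (self, dtype, non_blocking, copy, memory_format)
print(g)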

intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp

Lines changed: 40 additions & 0 deletions
@@ -1154,6 +1154,46 @@ RegisterOperators op({
           };
         },
         aliasAnalysisFromSchema()),
+    Operator(
+        "ipex::to_dtype(Tensor(a) self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor(a)",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto result = at::native::to(
+                (std::move(peek(stack, 0, 5))).toTensor(),
+                (std::move(peek(stack, 1, 5))).toScalarType(),
+                (std::move(peek(stack, 2, 5))).toBool(),
+                (std::move(peek(stack, 3, 5))).toBool(),
+                (std::move(peek(stack, 4, 5))).toOptional<at::MemoryFormat>());
+            drop(stack, 5);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+    Operator(
+        "ipex::to_dtype(Tensor(a) self, int? dtype, bool non_blocking=False, bool copy=False) -> Tensor(a|b)",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            const auto& input = (std::move(peek(stack, 0, 4))).toTensor();
+            const auto dtype =
+                (std::move(peek(stack, 1, 4))).toOptional<at::ScalarType>();
+            const auto copy = (std::move(peek(stack, 3, 4))).toBool();
+            at::Tensor result;
+            if (!dtype && !copy) {
+              result = input;
+            } else {
+              TORCH_CHECK(
+                  dtype,
+                  "dtype cannot be None when copy is True for ipex::to_dtype");
+              result = at::native::to(
+                  input, *dtype, (std::move(peek(stack, 2, 4))).toBool(), copy);
+            }
+            drop(stack, 4);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
 });
 } // namespace jit
 } // namespace torch
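The two schemas mirror aten::to.dtype (five inputs, with memory_format) and aten::to.prim_dtype (four inputs, optional dtype), so the renamed node keeps the same semantics if it ever executes outside a fusion group. JIT custom ops registered this way are normally reachable through the torch.ops namespace; a direct call would look roughly like this (a sketch, assuming the extension is installed so the registration above runs on import):

import torch
import intel_extension_for_pytorch  # assumed import; registers the ipex::* JIT ops

x = torch.randn(4, 4)
# Matches the first schema: (self, dtype, non_blocking, copy, memory_format)
y = torch.ops.ipex.to_dtype(x, torch.bfloat16, False, False, None)
assert y.dtype == torch.bfloat16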

intel_extension_for_pytorch/csrc/jit/fusion_pass.cpp

Lines changed: 2 additions & 0 deletions
@@ -284,6 +284,7 @@ void FusionPass(std::shared_ptr<Graph>& graph) {
   BatchMM(graph);
 
   if (tensorExprFuserEnabled()) {
+    graph_rewrite::replaceAtenToWithIPEXTo(graph);
     auto min_size = getFusionGroupInlining() ? 2 : 1;
     // Here we always get the first valid behavior per the global fusion
     // strategies configured by PyTorch (`getInstantiatedBailoutDepth` always
@@ -293,6 +294,7 @@ void FusionPass(std::shared_ptr<Graph>& graph) {
     bool dyn_shapes = getCurrentBehavior(getInstantiatedBailoutDepth()) ==
        FusionBehavior::DYNAMIC;
     FuseTensorExprs(graph, min_size, /* composed op */ false, dyn_shapes);
+    graph_rewrite::replaceIPEXToWithAtenTo(graph);
   }
 
   // Apply IPEX inplace optimization/replacement
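The placement is the whole fix: replaceAtenToWithIPEXTo runs just before FuseTensorExprs so NNC never sees a Float-to-BFloat16 aten::to, and replaceIPEXToWithAtenTo runs just after so the rest of the pipeline sees ordinary IR again. Both are gated on tensorExprFuserEnabled(), which corresponds to a Python-side toggle. A sketch of exercising that gate (the passes themselves are internal C++ and not callable from Python):

import torch

was_enabled = torch._C._jit_texpr_fuser_enabled()
torch._C._jit_set_texpr_fuser_enabled(True)  # FusionPass only runs the rewrites under this gate
try:
    m = torch.jit.trace(lambda x: (x * 2 + 1).to(torch.bfloat16), torch.randn(8, 8))
    x = torch.randn(8, 8)
    for _ in range(3):  # profiling runs so the fuser can specialize
        m(x)
    print(m.graph_for(x))
finally:
    torch._C._jit_set_texpr_fuser_enabled(was_enabled)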

tests/cpu/test_jit.py

Lines changed: 29 additions & 0 deletions
@@ -67,6 +67,7 @@
 
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.testing import FileCheck
 
 from common_utils import TestCase
 
@@ -3172,6 +3173,34 @@ def forward(self, x):
                           kind_in_graph="ipex::hardsigmoid",
                           kind_not_in_graph="aten::hardsigmoid")
 
+    # This test case will be removed after official PyTorch NNC supports bfloat16.
+    def test_TEfusion_with_to_dtype(self):
+        class TestTo(torch.nn.Module):
+            def __init__(self, dtype):
+                super(TestTo, self).__init__()
+                self.dtype = dtype
+
+            def forward(self, x):
+                return (x + 1).to(self.dtype)
+
+        X = torch.randn((5, 5))
+        with torch.no_grad():
+            # to(torch.bfloat16)
+            m = TestTo(torch.bfloat16).eval()
+            m = torch.jit.trace(m, X)
+            torch.jit.freeze(m)
+            out = m(X)
+            graph = m.graph_for(X)
+            FileCheck().check_not("prim::TensorExprGroup").run(graph)
+            # to(torch.long)
+            m = TestTo(torch.long).eval()
+            m = torch.jit.trace(m, X)
+            torch.jit.freeze(m)
+            out = m(X)
+            graph = m.graph_for(X)
+            FileCheck().check("prim::TensorExprGroup").run(graph)
+
 if __name__ == '__main__':
     torch.manual_seed(2020)
     test = unittest.main()
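To run just this case after applying the commit, something like python -m pytest tests/cpu/test_jit.py -k test_TEfusion_with_to_dtype should work (assuming pytest is available; the file also runs standalone through unittest.main()). The bfloat16 case expects no prim::TensorExprGroup in the graph, since the hidden cast leaves too little behind to fuse, while the torch.long case expects fusion to proceed as before.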
