
Commit bde089d

Fix matmul post scalar op fusion (#1257)
* fix matmul post scalar fusion
* rename related code from div to mul
1 parent ac8b947 commit bde089d
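
The change teaches the fusion passes to fold a scalar multiply that follows a matmul into a single ipex::matmul_mul op, and to skip the rewrite when the multiplier is not a true scalar. A minimal eager-mode sketch of the two cases (module names and shapes below are illustrative, not from the repository; the fused node would only appear after JIT tracing with the extension loaded):

import torch
import torch.nn as nn

class ScaledMatmul(nn.Module):  # hypothetical helper, mirrors the new MatmulMul test module
    def forward(self, x, y):
        # aten::matmul followed by aten::mul with a Python scalar:
        # this is the graph shape the ipex::matmul_mul rewrite targets.
        return torch.matmul(x, y) * 0.125

class TensorScaledMatmul(nn.Module):  # multiplier is a 1-D tensor, not a scalar
    def forward(self, x, y):
        # the new is_scalar filter rejects this multiplier
        # (1-element 1-D tensor rather than a Scalar or 0-dim tensor), so no fusion.
        return torch.matmul(x, y) * torch.ones(1, dtype=x.dtype)

x = torch.randn(10, 3, 4)
y = torch.randn(10, 4, 5)
print(ScaledMatmul()(x, y).shape)        # torch.Size([10, 3, 5])
print(TensorScaledMatmul()(x, y).shape)  # torch.Size([10, 3, 5])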

8 files changed, +182 −21 lines changed

csrc/jit/cpu/kernels/Mha.cpp

Lines changed: 1 addition & 1 deletion
@@ -266,7 +266,7 @@ at::Tensor dil_transfree_vit_mha(
   value.resize_({batchSize, sequenceSize, head_num, head_size})
       .transpose_(1, 2);
 
-  bmm_impl(query, key, qk, ideep::attr_t(), {}, 1.f / dim_per_head);
+  bmm_impl(query, key, qk, ideep::attr_t(), {}, dim_per_head);
   qk = dil_softmax_(qk, softmax_dim, dtype);
 
   auto output = dil_mha_matmul_trans(qk, value);

csrc/jit/passes/graph_rewrite.cpp

Lines changed: 19 additions & 10 deletions
@@ -251,16 +251,19 @@ void FuseMatmulDivOrMul(std::shared_ptr<Graph>& graph) {
         return (%r) )";
   std::string fused_matmul_mul = R"(
       graph(%x, %y, %z):
-        %ones : float = prim::Constant[value=1.0]()
-        %z_ = aten::div(%ones, %z)
-        %r = ipex::matmul_div(%x, %y, %z_)
+        %r = ipex::matmul_mul(%x, %y, %z)
         return (%r) )";
   std::string fused_matmul_mul_with_out = R"(
       graph(%x, %y, %z, %out):
-        %ones : float = prim::Constant[value=1.0]()
-        %z_ = aten::div(%ones, %z)
-        %r = ipex::matmul_div(%x, %y, %out, %z_)
+        %r = ipex::matmul_mul(%x, %y, %out, %z)
         return (%r) )";
+  auto filter_scalar = [](const Match& match,
+                          const std::unordered_map<std::string, Value*>& vmap) {
+    Node* node = match.anchor;
+    auto target_value = node->input(1);
+    return utils::is_scalar(target_value);
+  };
+
   for (auto const& it : div_ops) {
     at::jit::TemplateEnv env;
     env.s("div_op", it);

@@ -281,7 +284,7 @@ void FuseMatmulDivOrMul(std::shared_ptr<Graph>& graph) {
         aten_mul_pattern.format(env), fused_matmul_mul);
     rewriter.RegisterRewritePattern(
         aten_mul_pattern_with_out.format(env), fused_matmul_mul_with_out);
-    rewriter.runOnGraph(graph);
+    rewriter.runOnGraph(graph, filter_scalar);
   }
 }
 

@@ -309,11 +312,16 @@ void PostScalarDivOrMul(std::shared_ptr<Graph>& graph) {
         %qk = aten::matmul(%q, %k)
         %r = aten::mul(%qk, %scale)
         return (%r) )";
-
+  auto filter_scalar = [](const Match& match,
+                          const std::unordered_map<std::string, Value*>& vmap) {
+    Node* node = match.anchor;
+    auto target_value = node->input(0)->node()->input(1);
+    return utils::is_scalar(target_value);
+  };
   SubgraphRewriter rewriter;
   rewriter.RegisterRewritePattern(div_matmul, matmul_div);
   rewriter.RegisterRewritePattern(mul_matmul, matmul_mul);
-  rewriter.runOnGraph(graph);
+  rewriter.runOnGraph(graph, filter_scalar);
 }
 
 // MHA fusion covers aten::softmax, ipex::softmax and ipex::softmax_:

@@ -495,7 +503,8 @@ void FuseMHAScoreCalc(std::shared_ptr<Graph>& graph) {
         // This constant fill value could be either 0-dim tensor or just a
         // scalar
         auto fill_value_node = qk_node->input(2)->node();
-        if (fill_value_node->kind() != prim::Constant) {
+        if (fill_value_node->kind() != prim::Constant ||
+            !utils::is_scalar(qk_node->input(2))) {
          return false;
         }
 
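The filter added to PostScalarDivOrMul guards the rewrite that moves a scalar multiply or divide from before the matmul to after it. That move is only an identity for a genuine scalar (or 0-dim tensor); a shaped tensor multiplier broadcasts against the matmul input rather than its output, so the pass must skip it. A small sketch of the invariant (assumed shapes, not taken from the tests):

import torch

q = torch.randn(2, 3, 4)
k = torch.randn(2, 4, 5)

s = 0.125                 # Python scalar: accepted by the filter
s0 = torch.tensor(0.125)  # 0-dim tensor: also accepted

# scalar scaling commutes with matmul, so pre- and post-matmul forms agree
assert torch.allclose(torch.matmul(q * s, k), torch.matmul(q, k) * s)
assert torch.allclose(torch.matmul(q * s0, k), torch.matmul(q, k) * s0)

# a shaped tensor multiplier (e.g. torch.ones(1) or a per-element scale) does
# not pass utils::is_scalar, so the rewrite is conservatively skipped for it.
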
csrc/jit/passes/graph_rewrite_mha.cpp

Lines changed: 1 addition & 1 deletion
@@ -299,7 +299,7 @@ void FusedTransFreeMha(std::shared_ptr<Graph>& graph) {
         %key_ = aten::select(%qkv2, %select_dim, %key_select)
         %value = aten::select(%qkv2, %select_dim, %value_select)
         %key = aten::transpose(%key_, %trans_a, %trans_b)
-        %bmm1 = ipex::matmul_div(%query, %key, %scale)
+        %bmm1 = ipex::matmul_mul(%query, %key, %scale)
         %smx = ipex::softmax(%bmm1, %trans_b, %dtype)
         %bmm2 = aten::matmul(%smx, %value)
         %context_layer = aten::transpose(%bmm2, %key_select, %value_select)

csrc/jit/passes/register_dnnl_jit_ops.cpp

Lines changed: 84 additions & 2 deletions
@@ -873,6 +873,88 @@ torch::jit::RegisterOperators op({
         },
         aliasAnalysisFromSchema()),
 
+    Operator(
+        "ipex::matmul_mul(Tensor left, Tensor right, Tensor(a!) out_opt, Tensor "
+        "mul_input) -> Tensor(a!)",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto mul_tensor = std::move(peek(stack, 3, 4).toTensor());
+            auto mul_input_data = mul_tensor.item();
+            // divide mul_input to reuse dil_matmul_div function
+            auto div_input_data = 1.0f / mul_input_data.to<float>();
+            auto result = dil_matmul_div(
+                (std::move(peek(stack, 0, 4))).toTensor(),
+                (std::move(peek(stack, 1, 4))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 2, 4))),
+                div_input_data);
+            drop(stack, 4);
+            torch::jit::pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+
+    Operator(
+        "ipex::matmul_mul(Tensor left, Tensor right, Tensor(a!) out_opt, Scalar "
+        "mul_input) -> Tensor(a!)",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            // divide mul_input to reuse dil_matmul_div function
+            auto div_input_data =
+                1.0f / (std::move(peek(stack, 3, 4))).toScalar().to<float>();
+            auto result = dil_matmul_div(
+                (std::move(peek(stack, 0, 4))).toTensor(),
+                (std::move(peek(stack, 1, 4))).toTensor(),
+                toOptionalTensor(std::move(peek(stack, 2, 4))),
+                div_input_data);
+            drop(stack, 4);
+            torch::jit::pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+
+    Operator(
+        "ipex::matmul_mul(Tensor left, Tensor right, Tensor mul_input) -> "
+        "Tensor",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            auto mul_tensor = (std::move(peek(stack, 2, 3))).toTensor();
+            auto mul_input_data = mul_tensor.item();
+            // divide mul_input to reuse dil_matmul_div function
+            auto div_input_data = 1.0f / mul_input_data.to<float>();
+            auto result = dil_matmul_div(
+                (std::move(peek(stack, 0, 3))).toTensor(),
+                (std::move(peek(stack, 1, 3))).toTensor(),
+                at::Tensor(),
+                div_input_data);
+            drop(stack, 3);
+            torch::jit::pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+
+    Operator(
+        "ipex::matmul_mul(Tensor left, Tensor right, Scalar mul_input) -> "
+        "Tensor",
+        [](const Node* node) -> Operation {
+          return [](Stack* stack) {
+            // divide mul_input to reuse dil_matmul_div function
+            auto div_input_data =
+                1.0f / (std::move(peek(stack, 2, 3))).toScalar().to<float>();
+            auto result = dil_matmul_div(
+                (std::move(peek(stack, 0, 3))).toTensor(),
+                (std::move(peek(stack, 1, 3))).toTensor(),
+                at::Tensor(),
+                div_input_data);
+            drop(stack, 3);
+            torch::jit::pack(stack, std::move(result));
+            return 0;
+          };
+        },
+        aliasAnalysisFromSchema()),
+
     Operator(
         "ipex::bmm_add(Tensor input, Tensor batch1, Tensor batch2, Scalar alpha) -> "
         "Tensor",

@@ -944,7 +1026,7 @@ torch::jit::RegisterOperators op({
            auto scale_tensor = std::move(peek(stack, 4, 7).toTensor());
            auto scale_data = scale_tensor.item();
            // divide scale to reuse dil_mha_scores_calc function
-           auto div_scale_data = 1 / scale_data.to<float>();
+           auto div_scale_data = 1.0f / scale_data.to<float>();
            auto result = dil_mha_scores_calc(
                peek(stack, 0, 7).toTensor(),
                peek(stack, 1, 7).toTensor(),

@@ -964,7 +1046,7 @@ torch::jit::RegisterOperators op({
         "Scalar scale, int softmax_dim, ScalarType ? dtype) -> Tensor",
         [](Stack& stack) {
           // divide scale to reuse dil_mha_scores_calc function
-          auto div_scale_data = 1 / peek(stack, 4, 7).toScalar().to<float>();
+          auto div_scale_data = 1.0f / peek(stack, 4, 7).toScalar().to<float>();
           auto result = dil_mha_scores_calc(
               peek(stack, 0, 7).toTensor(),
               peek(stack, 1, 7).toTensor(),
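
Each ipex::matmul_mul overload reuses the existing dil_matmul_div kernel by passing the reciprocal of the multiplier, as the in-code comments note. A quick numeric check of that identity (a sketch with assumed shapes, independent of the extension):

import torch

x = torch.randn(8, 16)
y = torch.randn(16, 32)
s = 0.125

out_mul = torch.matmul(x, y) * s
out_div = torch.matmul(x, y) / (1.0 / s)
# Matches up to float rounding; exact here because 0.125 and its reciprocal
# are powers of two.
assert torch.allclose(out_mul, out_div)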

csrc/jit/passes/utils.cpp

Lines changed: 19 additions & 0 deletions
@@ -171,6 +171,25 @@ bool is_contiguous(c10::TensorTypePtr tensor) {
   return is_contiguous;
 }
 
+// Check if the target IValue is a scalar or a 0-dim scalar tensor
+bool is_scalar(torch::jit::Value* target_value) {
+  if (!toIValue(target_value).has_value()) {
+    return false;
+  }
+  if (toIValue(target_value).value().isScalar()) {
+    return true;
+  } else if (toIValue(target_value).value().isTensor()) {
+    auto target_tensor_dim = target_value->type()->cast<TensorType>()->dim();
+    if (!target_tensor_dim.has_value()) {
+      return false;
+    } else {
+      return target_tensor_dim.value() == 0 ? true : false;
+    }
+  } else {
+    return false;
+  }
+}
+
 } // namespace utils
 } // namespace graph_rewrite
 } // namespace jit

csrc/jit/passes/utils.h

Lines changed: 2 additions & 1 deletion
@@ -30,7 +30,8 @@ supported_non_unary_post_op_fusion_set();
 bool is_channelslast(c10::TensorType tensor);
 // Check if the memory format of the tensor is Contiguous
 bool is_contiguous(c10::TensorTypePtr tensor);
-
+// Check if the target IValue is a scalar or a 0-dim scalar tensor
+bool is_scalar(torch::jit::Value* target_value);
 } // namespace utils
 } // namespace graph_rewrite
 } // namespace jit

tests/cpu/test_jit.py

Lines changed: 53 additions & 4 deletions
@@ -846,6 +846,28 @@ def forward(self, x):
         else:
             return mm_res.div_(torch.ones(mm_res_shape,dtype=x.dtype)+1)
 
+class MatmulMul(nn.Module):
+    def __init__(self, mul_scalar=False, with_out=False):
+        super(MatmulMul, self).__init__()
+        self.with_out = with_out
+        self.mul_scalar = mul_scalar
+    def forward(self, x):
+        mm_res = None
+        y = torch.transpose(x, -1, -2).contiguous()
+        mm_res_shape = x.size()[:-1] + (y.size()[-1:])
+        if not self.mul_scalar:
+            x = x * (torch.ones([1],dtype=x.dtype) + 1)
+        if self.with_out:
+            mm_res = torch.randn(mm_res_shape, dtype=x.dtype)
+            mm_res = torch.matmul(x, y, out=mm_res)
+        else:
+            mm_res = torch.matmul(x, y)
+        if self.mul_scalar:
+            mm_res = mm_res * 0.125
+        else:
+            mm_res = mm_res * (torch.ones([1],dtype=x.dtype) + 1)
+        return mm_res
+
 class TransposedMatmulDiv(nn.Module):
     def __init__(self):
         super(TransposedMatmulDiv, self).__init__()

@@ -1282,7 +1304,6 @@ def _test_output_bf16(self, base_model, x, kind_in_graph=None, kind_not_in_graph
         #bf16, jit trace path
         trace_graph = trace_fused_model.graph_for(x3)
         fused_tresult = trace_fused_model(x3)
-
         self.assertEqual(fused_tresult, result, prec=prec)
         self.assertEqual(fused_tresult.dtype, torch.bfloat16)
 

@@ -3337,9 +3358,37 @@ def fn(input, weight, bias):
         self.assertEqual(scripted_fn(input, weight, bias), result)
         self.assertEqual(traced_fn(input, weight, bias), result)
 
-    def test_matmul_div(self):
+    def test_matmul_div_or_mul(self):
         inputs = [torch.randn(10, 3, 4), torch.randn(3, 4)]
         for x in inputs:
+            self._test_output(
+                MatmulMul(mul_scalar=True, with_out=False),
+                x,
+                kind_in_graph="ipex::matmul_mul",
+                kind_not_in_graph=None)
+            self._test_output(
+                MatmulMul(mul_scalar=True, with_out=True),
+                x,
+                kind_in_graph="ipex::matmul_mul",
+                kind_not_in_graph=None)
+            self._test_output(
+                MatmulMul(mul_scalar=False, with_out=True),
+                x,
+                kind_in_graph=None,
+                kind_not_in_graph="ipex::matmul_mul")
+            self._test_output_bf16(
+                MatmulMul(mul_scalar=True, with_out=False),
+                x.to(torch.bfloat16),
+                kind_in_graph="ipex::matmul_mul",
+                kind_not_in_graph=None,
+                prec=5e-2)
+            self._test_output_bf16(
+                MatmulMul(mul_scalar=True, with_out=True),
+                x.to(torch.bfloat16),
+                kind_in_graph="ipex::matmul_mul",
+                kind_not_in_graph=None,
+                prec=5e-2)
+
             self._test_output(
                 MatmulDivOutplace(div_scalar=True, with_out=True),
                 x,

@@ -3466,14 +3515,14 @@ def test_transposed_matmuldiv(self):
                 fused_mod = traced_mod.graph_for(x1[i], y1[j])
                 out = traced_mod(x1[i], y1[j])
                 expected = model(x1[i], y1[j])
-                self.assertTrue(any(n.kind() == "ipex::matmul_div" for n in fused_mod.nodes()))
+                self.assertTrue(any(n.kind() == "ipex::matmul_mul" for n in fused_mod.nodes()))
                 self.assertEqual(out, expected, prec=1e-4)
                 with torch.cpu.amp.autocast(), torch.no_grad():
                     traced_mod = torch.jit.trace(model, (x1[i].bfloat16(), y1[j].bfloat16()))
                     fused_mod = traced_mod.graph_for(x1[i].bfloat16(), y1[j].bfloat16())
                     out = traced_mod(x1[i].bfloat16(), y1[j].bfloat16())
                     expected = model(x1[i].bfloat16(), y1[j].bfloat16())
-                    self.assertTrue(any(n.kind() == "ipex::matmul_div" for n in fused_mod.nodes()))
+                    self.assertTrue(any(n.kind() == "ipex::matmul_mul" for n in fused_mod.nodes()))
                     self.assertEqual(out, expected, prec=1e-1)
 
     def test_bmm_add(self):

tests/cpu/test_mha.py

Lines changed: 3 additions & 2 deletions
@@ -131,7 +131,7 @@ def test_transfree_mha_bf16(self):
             for _ in range(2):
                 mha_jit = mha_ipex(mat, mask_base)
                 vit_mha_jit = vit_mha_ipex(mat)
-
+
             mha_ref = mha_model(mat, mask_base)
             vit_mha_ref = vit_mha_model(mat)
 

@@ -141,6 +141,7 @@ def test_transfree_mha_bf16(self):
             mha_graph = mha_ipex.graph_for(mat, mask_base)
             vit_mha_graph = vit_mha_ipex.graph_for(mat)
 
+
             self.assertTrue(any(n.kind() == "ipex::transfree_mha" for n in mha_graph.nodes()))
             self.assertTrue(any(n.kind() == "ipex::transfree_vit_mha" for n in vit_mha_graph.nodes()))
 

@@ -336,7 +337,7 @@ def test_fake_mha_fp32(self):
             fake_mha_jit.append(fake_mha_ipex[i](mat))
             fake_mha_ref.append(fake_mha_model[i](mat))
             fake_mha_graph = fake_mha_ipex[i].graph_for(mat)
-            self.assertTrue(any(n.kind() == "ipex::matmul_div" for n in fake_mha_graph.nodes()))
+            self.assertTrue(any(n.kind() == "ipex::matmul_mul" for n in fake_mha_graph.nodes()))
             with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CPU]) as p:
                 fake_mha_ipex[i](mat)
                 if i == 6:
