
Commit b7b9359

Authored by jianan-gu and Jiong Gong
Add add broadcast checks for einsum+add fusion kernel (#885)
* add runtime check
* refine code
* refine code
* refine code
* add corner case from alphafold2
* Update intel_extension_for_pytorch/csrc/jit/cpu/kernels/Einsum.cpp

  Co-authored-by: Jiong Gong <jiong.gong@intel.com>

* refine code

Co-authored-by: Jiong Gong <jiong.gong@intel.com>
1 parent 1717c44 commit b7b9359

File tree

4 files changed: +112 −9 lines changed

intel_extension_for_pytorch/csrc/jit/cpu/kernels/Einsum.cpp

Lines changed: 75 additions & 8 deletions
```diff
@@ -20,6 +20,67 @@ namespace cpu {
 using at::IntArrayRef;
 using at::Tensor;
 
+//! function: is_add_broadcast_supported_by_onednn
+/*!
+ * This is a workaround check, since oneDNN does not fully support
+ * matmul+binary_add fusion for all kinds of add-input broadcast dims;
+ * depending on the add-input broadcast dims, oneDNN matmul+binary_add
+ * falls into a reference path in some cases. This function maps the
+ * verified supported cases and lets us fall back for the unsupported ones.
+ *
+ * The verified supported cases use the following oneDNN non_broadcast_mask:
+ * 2D: oneDNN non_broadcast_mask = {0, 2, 3}
+ * 3D: oneDNN non_broadcast_mask = {0, 2, 4, 5, 7}
+ * 4D: oneDNN non_broadcast_mask = {0, 2, 8, 9, 13, 15}
+ *
+ * For example:
+ * For 4D tensors, left has shape [8, 2, 4, 6] and right has shape
+ * [8, 2, 6, 4], so the matmul output shape is [8, 2, 4, 4], and
+ * post_add_tensor has shape [8, 1, 1, 4]. The corresponding
+ * non_broadcast_mask is 9 (bit 0 for dim 0 plus bit 3 for dim 3),
+ * which is supported.
+ *
+ * \param left: the left operand of matmul
+ * \param right: the right operand of matmul
+ * \param post_add_tensor: the post-add input tensor
+ * \return: whether the post-add input broadcast is supported by oneDNN for
+ * matmul+binary_add fusion
+ */
+bool is_add_broadcast_supported_by_onednn(
+    const at::Tensor& left,
+    const at::Tensor& right,
+    const at::Tensor& post_add_tensor) {
+  auto non_broadcast_mask = 0;
+  for (int i = 0; i < left.dim(); i++) {
+    if (post_add_tensor.size(i) != 1) {
+      if (i == left.dim() - 1) {
+        // the last output dim comes from `right`
+        non_broadcast_mask +=
+            post_add_tensor.size(i) == right.size(i) ? 1 << i : 0;
+      } else {
+        // all other output dims come from `left`
+        non_broadcast_mask +=
+            post_add_tensor.size(i) == left.size(i) ? 1 << i : 0;
+      }
+    }
+  }
+  if (left.dim() == 4) {
+    if (non_broadcast_mask == 0 || non_broadcast_mask == 2 ||
+        non_broadcast_mask == 8 || non_broadcast_mask == 9 ||
+        non_broadcast_mask == 13 || non_broadcast_mask == 15) {
+      return true;
+    }
+  } else if (left.dim() == 3) {
+    if (non_broadcast_mask == 0 || non_broadcast_mask == 2 ||
+        non_broadcast_mask == 4 || non_broadcast_mask == 5 ||
+        non_broadcast_mask == 7) {
+      return true;
+    }
+  } else if (left.dim() == 2) {
+    if (non_broadcast_mask == 0 || non_broadcast_mask == 2 ||
+        non_broadcast_mask == 3) {
+      return true;
+    }
+  }
+
+  return false;
+}
 //! function: sumproduct_pair
 /*!
  *
```
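To make the mask arithmetic concrete, here is a minimal Python sketch of the same computation; the function name and shapes are for illustration only (the real check is the C++ above). It reproduces the worked example from the doc comment:

```python
import torch

def non_broadcast_mask(left, right, post_add):
    # Bit i is set when post_add keeps (does not broadcast over)
    # dim i of the matmul output, mirroring the C++ loop above.
    mask = 0
    for i in range(left.dim()):
        if post_add.size(i) != 1:
            # the last output dim comes from `right`, the rest from `left`
            ref = right if i == left.dim() - 1 else left
            if post_add.size(i) == ref.size(i):
                mask += 1 << i
    return mask

left = torch.randn(8, 2, 4, 6)
right = torch.randn(8, 2, 6, 4)
post_add = torch.randn(8, 1, 1, 4)  # broadcasts over dims 1 and 2
print(non_broadcast_mask(left, right, post_add))  # 9, a supported 4D mask
```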
```diff
@@ -266,14 +327,19 @@ static Tensor sumproduct_pair(
   left = left.permute(lpermutation).reshape(left_shape);
   right = right.permute(rpermutation).reshape(right_shape);
 
-  // Tensor result = at::bmm(left, right);
-  auto _input = arg.is_contiguous() ? arg : arg.contiguous();
-  ideep::tensor onednn_input = itensor_view_from_dense(_input);
-  auto op_attr = ideep::attr_t::fuse_binary(
-      dnnl::algorithm::binary_add, onednn_input.get_desc());
-  Tensor result =
-      bmm_impl(left, right, at::Tensor(), op_attr, {onednn_input}, 1.0f);
-
+  // now we do the computation
+  Tensor result;
+  if (is_add_broadcast_supported_by_onednn(left, right, arg)) {
+    auto _input = arg.is_contiguous() ? arg : arg.contiguous();
+    ideep::tensor onednn_input = itensor_view_from_dense(_input);
+    auto op_attr = ideep::attr_t::fuse_binary(
+        dnnl::algorithm::binary_add, onednn_input.get_desc());
+    result = bmm_impl(left, right, at::Tensor(), op_attr, {onednn_input}, 1.0f);
+  } else {
+    result = at::matmul(left, right);
+    auto f_alpha = alpha.to<float>();
+    result = result + f_alpha * arg;
+  }
   result = result.view(out_size).permute(opermutation);
 
   // finally squeeze summed dimensions if desired
```
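In PyTorch terms, the new fallback branch computes the same result without fusion; a self-contained sketch of that unfused path (shapes arbitrary, names matching the C++ locals):

```python
import torch

# Unfused equivalent of the fallback branch (sketch):
left = torch.randn(8, 2, 4, 6)
right = torch.randn(8, 2, 6, 4)
arg = torch.randn(8, 1, 1, 4)  # the post-add input, broadcast as needed
alpha = 1.0                    # alpha.to<float>() in the C++
result = torch.matmul(left, right) + alpha * arg  # shape [8, 2, 4, 4]
```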
```diff
@@ -666,6 +732,7 @@ at::Tensor einsum_binary(
     const c10::List<at::Tensor>& operands,
     const at::Tensor& add_arg,
     const c10::Scalar& alpha) {
+  IPEX_RECORD_FUNCTION("dil_einsum_binary", c10::ArrayRef<c10::IValue>({}));
   auto prepare_res = einsum_prepare(equation, operands);
   bool has_zero_size_dim = std::get<0>(prepare_res);
   auto out_size = std::get<1>(prepare_res);
```

intel_extension_for_pytorch/csrc/jit/cpu/kernels/Einsum.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -30,5 +30,9 @@ at::Tensor einsum_binary(
     const at::Tensor& input,
     const c10::Scalar& alpha);
 
+bool is_add_broadcast_supported_by_onednn(
+    const at::Tensor& left,
+    const at::Tensor& right,
+    const at::Tensor& post_add_tensor);
 } // namespace cpu
 } // namespace torch_ipex
```

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_einsum.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -16,8 +16,9 @@ auto ipex_einsum_filter =
       auto equation =
           getIValue("equation", match_vmap, vmap).value().toStringView();
       int num_ops = std::count(equation.begin(), equation.end(), ',') + 1;
-      if (num_ops != 2)
+      if (num_ops != 2) {
         return false; // only process the 2 operands
+      }
       return true;
     };
```
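The filter's operand count is simply the number of comma-separated inputs in the equation string; a quick illustration of the condition in plain Python:

```python
equation = "bhid,bhjd->bhij"
num_ops = equation.count(",") + 1
assert num_ops == 2  # eligible: the rewrite only handles two operands
```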

tests/cpu/test_jit.py

Lines changed: 31 additions & 0 deletions
```diff
@@ -2813,6 +2813,37 @@ def _test_fp32(model_test, input1, input2, bias=None, kind_in_graph='ipex::einsu
         model = EinsumAdd(("ij,j"))
         _test_fp32(model, input1, input2, bias)
 
+        bias = torch.randn(1, 4, 49, 49)
+        input1 = torch.randn(8, 4, 49, 32)
+        input2 = torch.randn(8, 4, 49, 32)
+        model_from_vit = EinsumAdd('bhid,bhjd->bhij')
+        _test_fp32(model_from_vit, input1, input2, bias)
+
+        bias = torch.randn(1, 1, 49, 49)
+        input1 = torch.randn(8, 6, 49, 32)
+        input2 = torch.randn(8, 6, 49, 32)
+        model_from_vit_v2 = EinsumAdd('bhid,bhjd->bhij')
+        _test_fp32(model_from_vit_v2, input1, input2, bias)
+
+        bias = torch.randn(8, 1, 1, 49)
+        input1 = torch.randn(8, 6, 49, 32)
+        input2 = torch.randn(8, 6, 49, 32)
+        model_from_vit_alphafold2_v1 = EinsumAdd('bhid,bhjd->bhij')
+        _test_fp32(model_from_vit_alphafold2_v1, input1, input2, bias)
+
+        bias = torch.randn(1, 1, 32)
+        input1 = torch.randn(6, 50, 32)
+        input2 = torch.randn(32, 32)
+        model_from_vit_alphafold2_v2 = EinsumAdd('bsh,ho->bso')
+        _test_fp32(model_from_vit_alphafold2_v2, input1, input2, bias)
+
+        bias = torch.randn(6, 1, 50)
+        input1 = torch.randn(6, 50, 32)
+        input2 = torch.randn(6, 32, 50)
+        model_from_vit_alphafold2_v3 = EinsumAdd('bsh,bho->bso')
+        _test_fp32(model_from_vit_alphafold2_v3, input1, input2, bias)
+
     def test_ipex_softmax(self):
         self._test_output(
             AtenSoftmaxRepalce(),
```
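`EinsumAdd` is defined elsewhere in tests/cpu/test_jit.py and is not part of this diff; a plausible minimal sketch of such a helper (hypothetical, for orientation only) would be:

```python
import torch

class EinsumAdd(torch.nn.Module):
    # Hypothetical sketch; the real helper lives elsewhere in test_jit.py
    # and is expected to fuse into ipex::einsum_binary after tracing.
    def __init__(self, equation):
        super().__init__()
        self.equation = equation

    def forward(self, input1, input2, bias):
        return torch.einsum(self.equation, input1, input2) + bias
```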
