jit: support yolo mish, yolo mish add on release branch (#1669)

ganyi1996ppo · web-flow · commit 88e2f09b7d23 · 2022-10-25T13:42:57.000+08:00
* jit: support yolo mish, yolo mish add on release branch
diff --git a/csrc/aten/operators/Conv.cpp b/csrc/aten/operators/Conv.cpp
@@ -856,6 +856,81 @@ Tensor convolution_silu(
       attr);
 }
 
+Tensor convolution_mish(
+    const Tensor& input_r,
+    const Tensor& weight_r,
+    const Tensor& bias_r,
+    IntArrayRef stride_,
+    IntArrayRef padding_,
+    IntArrayRef dilation_,
+    bool transposed_,
+    IntArrayRef output_padding_,
+    int64_t groups_,
+    Scalar scale,
+    Scalar alpha,
+    Scalar beta) { // beta is neither validated nor read below
+  // Conv + fused mish post-op. oneDNN only supports scale = 1.0f for the non-quantized case, so reject anything else up front.
+  TORCH_CHECK(
+      scale.to<float>() == 1.f && alpha.to<float>() == 1.f,
+      "only support convolution mish fusion with mish scale equals to 1, alpha equal to 1");
+  Attr attr;
+  attr.append_post_eltwise(
+      /* scale */ 1.0,
+      /* alpha */ 1.f,
+      /* beta */ 0.f,
+      attr.kind_with_mish);
+  return _convolution(
+      input_r,
+      weight_r,
+      bias_r,
+      stride_,
+      padding_,
+      dilation_,
+      transposed_,
+      output_padding_,
+      groups_,
+      attr);
+}
+
+Tensor convolution_mish_add(
+    const Tensor& input_r,
+    const Tensor& weight_r,
+    const Tensor& bias_r,
+    IntArrayRef stride_,
+    IntArrayRef padding_,
+    IntArrayRef dilation_,
+    bool transposed_,
+    IntArrayRef output_padding_,
+    int64_t groups_,
+    Tensor& accumu,
+    Scalar scale,
+    Scalar alpha,
+    Scalar beta) { // beta is neither validated nor read below
+  // Conv + fused mish + post-sum into accumu. Only scale = 1.0f and alpha = 1.0f are accepted; scale is also used as the sum scale.
+  TORCH_CHECK(
+      scale.to<float>() == 1.f && alpha.to<float>() == 1.f,
+      "only support convolution mish add fusion with sum scale equals to 1, alpha equal to 1");
+  Attr attr;
+  attr.append_post_eltwise(
+          /* scale */ 1.0,
+          /* alpha */ 1.f,
+          /* beta */ 0.f,
+          attr.kind_with_mish)
+      .append_post_sum(/* sum_scale */ scale.to<float>()); // append post op sum
+  return _convolution_out(
+      accumu,
+      input_r,
+      weight_r,
+      bias_r,
+      stride_,
+      padding_,
+      dilation_,
+      transposed_,
+      output_padding_,
+      groups_,
+      attr);
+}
+
 Tensor convolution_sigmoid(
     const Tensor& input_r,
     const Tensor& weight_r,
diff --git a/csrc/intrinsic/intrinsic.h b/csrc/intrinsic/intrinsic.h
@@ -346,6 +346,35 @@ at::Tensor dequantize_tensor_per_channel_affine(
     const at::Tensor& zero_points,
     int64_t axis);
 
+Tensor convolution_mish( // convolution with a mish eltwise post-op attached
+    const Tensor& input_r,
+    const Tensor& weight_r,
+    const Tensor& bias_r,
+    IntArrayRef stride_,
+    IntArrayRef padding_,
+    IntArrayRef dilation_,
+    bool transposed_,
+    IntArrayRef output_padding_,
+    int64_t groups_,
+    Scalar scale, // must equal 1 (TORCH_CHECK in the definition)
+    Scalar alpha, // must equal 1 (TORCH_CHECK in the definition)
+    Scalar beta); // not read by the definition in csrc/aten/operators/Conv.cpp
+
+Tensor convolution_mish_add( // convolution with mish eltwise + sum post-ops attached
+    const Tensor& input_r,
+    const Tensor& weight_r,
+    const Tensor& bias_r,
+    IntArrayRef stride_,
+    IntArrayRef padding_,
+    IntArrayRef dilation_,
+    bool transposed_,
+    IntArrayRef output_padding_,
+    int64_t groups_,
+    Tensor& accumu, // forwarded as the first argument of _convolution_out in the definition
+    Scalar scale, // must equal 1 (TORCH_CHECK); also used as the post-sum scale
+    Scalar alpha, // must equal 1 (TORCH_CHECK in the definition)
+    Scalar beta); // not read by the definition in csrc/aten/operators/Conv.cpp
+
 } // namespace AtenIpexTypeXPU
 } // namespace at
 
diff --git a/csrc/jit/accelerated_ops.h b/csrc/jit/accelerated_ops.h
@@ -75,7 +75,10 @@ static auto permute_contiguous_sym =
     Symbol::fromQualString("xpu::permute_contiguous");
 static auto convolution_silu_sym =
     Symbol::fromQualString("xpu::_convolution_silu");
-
+static auto _convolution_mish_sym = // result of the aten::_convolution + softplus_tanh_mul rule in fusion_pass.cpp
+    Symbol::fromQualString("xpu::_convolution_mish");
+static auto _convolution_mish_add_sym = // result of the _convolution_mish + aten::add_ rule in fusion_pass.cpp
+    Symbol::fromQualString("xpu::_convolution_mish_add");
 // Fold weights of batch_norm with conv2d's
 static auto fold_weight_sym = Symbol::fromQualString("xpu::fold_weight");
 static auto fold_bias_sym = Symbol::fromQualString("xpu::fold_bias");
diff --git a/csrc/jit/dpcpp_ops.cpp b/csrc/jit/dpcpp_ops.cpp
@@ -1126,6 +1126,77 @@ at::Tensor _convolution_silu(
       0.0);
 }
 
+at::Tensor _convolution_mish( // JIT entry point: forwards to at::AtenIpexTypeXPU::convolution_mish
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& bias,
+    at::IntArrayRef stride_,
+    at::IntArrayRef padding_,
+    at::IntArrayRef dilation_,
+    bool transposed_,
+    at::IntArrayRef output_padding_,
+    int64_t groups_,
+    bool benchmark, // benchmark .. allow_tf32 are not read below
+    bool deterministic,
+    bool cudnn_enabled,
+    bool allow_tf32,
+    Scalar beta, // beta and threshold are not read below
+    Scalar threshold) { // NOTE(review): presumably the softplus args of the fused pattern; non-default values would be silently ignored - confirm
+  RECORD_FUNCTION(
+      "_convolution_mish", std::vector<c10::IValue>({input, weight, bias}));
+  const OptionalDeviceGuard device_guard(device_of(input)); // guard on the input tensor's device
+  return at::AtenIpexTypeXPU::convolution_mish(
+      input,
+      weight,
+      bias,
+      stride_,
+      padding_,
+      dilation_,
+      transposed_,
+      output_padding_,
+      groups_,
+      1.0, // scale
+      1.0, // alpha
+      0.0); // beta
+}
+
+at::Tensor _convolution_mish_add( // JIT entry point: forwards to at::AtenIpexTypeXPU::convolution_mish_add
+    const at::Tensor& input,
+    const at::Tensor& weight,
+    const at::Tensor& bias,
+    at::IntArrayRef stride_,
+    at::IntArrayRef padding_,
+    at::IntArrayRef dilation_,
+    bool transposed_,
+    at::IntArrayRef output_padding_,
+    int64_t groups_,
+    bool benchmark, // benchmark .. allow_tf32 are not read below
+    bool deterministic,
+    bool cudnn_enabled,
+    bool allow_tf32,
+    Scalar beta, // beta and threshold are not read below
+    Scalar threshold,
+    Tensor accumu,
+    Scalar scale) {
+  RECORD_FUNCTION( // label must match this op so profiler traces are not attributed to _convolution_silu
+      "_convolution_mish_add", std::vector<c10::IValue>({input, weight, bias}));
+  const OptionalDeviceGuard device_guard(device_of(input)); // guard on the input tensor's device
+  return at::AtenIpexTypeXPU::convolution_mish_add(
+      input,
+      weight,
+      bias,
+      stride_,
+      padding_,
+      dilation_,
+      transposed_,
+      output_padding_,
+      groups_,
+      accumu,
+      scale,
+      1.0, // alpha
+      0.0); // beta
+}
+
 } // namespace xpu
 } // namespace jit
 } // namespace torch
diff --git a/csrc/jit/dpcpp_ops.h b/csrc/jit/dpcpp_ops.h
@@ -324,6 +324,42 @@ at::Tensor _convolution_silu(
     bool cudnn_enabled,
     bool allow_tf32);
 
+at::Tensor _convolution_mish( // JIT-side wrapper; defined in csrc/jit/dpcpp_ops.cpp
+    const at::Tensor& input_r,
+    const at::Tensor& weight_r,
+    const at::Tensor& bias_r,
+    at::IntArrayRef stride_,
+    at::IntArrayRef padding_,
+    at::IntArrayRef dilation_,
+    bool transposed_,
+    at::IntArrayRef output_padding_,
+    int64_t groups_,
+    bool benchmark, // benchmark .. allow_tf32 are not read by the definition
+    bool deterministic,
+    bool cudnn_enabled,
+    bool allow_tf32,
+    Scalar beta, // not read by the definition
+    Scalar threshold); // not read by the definition
+
+at::Tensor _convolution_mish_add( // JIT-side wrapper; defined in csrc/jit/dpcpp_ops.cpp
+    const at::Tensor& input_r,
+    const at::Tensor& weight_r,
+    const at::Tensor& bias_r,
+    at::IntArrayRef stride_,
+    at::IntArrayRef padding_,
+    at::IntArrayRef dilation_,
+    bool transposed_,
+    at::IntArrayRef output_padding_,
+    int64_t groups_,
+    bool benchmark, // benchmark .. allow_tf32 are not read by the definition
+    bool deterministic,
+    bool cudnn_enabled,
+    bool allow_tf32,
+    Scalar beta, // not read by the definition
+    Scalar threshold, // not read by the definition
+    Tensor accumu,
+    Scalar alpha); // named `scale` in the definition, where it is forwarded as the post-sum scale
+
 } // namespace xpu
 } // namespace jit
 } // namespace torch
diff --git a/csrc/jit/fusion_pass.cpp b/csrc/jit/fusion_pass.cpp
@@ -506,7 +506,10 @@ OpFuser::RuleTab OpFuser::dnnlRules = {
      xpu::_convolution_sum_relu_sym},
     {{Symbol::fromQualString("aten::_convolution"),
       Symbol::fromQualString("aten::silu_")},
-     xpu::convolution_silu_sym}};
+     xpu::convolution_silu_sym},
+    {{Symbol::fromQualString("aten::_convolution"), xpu::softplus_tanh_mul_sym},
+     xpu::_convolution_mish_sym},
+    {{xpu::_convolution_mish_sym, aten::add_}, xpu::_convolution_mish_add_sym}};
 
 void FusionPass(std::shared_ptr<Graph>& graph) {
   // Pattern based fusion was lack of alias analysis
@@ -531,4 +534,4 @@ static RegisterPreFusionPass pass_3([](std::shared_ptr<Graph>& g) {
 });
 
 } // namespace jit
-} // namespace torch
+} // namespace torch
diff --git a/csrc/jit/register_dnnl_jit_ops.cpp b/csrc/jit/register_dnnl_jit_ops.cpp
@@ -831,6 +831,62 @@ RegisterOperators op(
             },
             aliasAnalysisFromSchema()),
 
+        Operator( // registers the fused conv + mish op; arguments are read from the stack in schema order
+            "xpu::_convolution_mish(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, Scalar beta, Scalar threshold) -> Tensor",
+            [](const Node* node) -> Operation {
+              return [](Stack& stack) {
+                at::Tensor input = std::move(peek(stack, 0, 15)).toTensor();
+                auto result = torch::jit::xpu::_convolution_mish(
+                    input,
+                    (std::move(peek(stack, 1, 15))).toTensor(), // weight
+                    toOptionalTensor(std::move(peek(stack, 2, 15))), // bias (Tensor?)
+                    (std::move(peek(stack, 3, 15))).toIntVector(), // stride
+                    (std::move(peek(stack, 4, 15))).toIntVector(), // padding
+                    (std::move(peek(stack, 5, 15))).toIntVector(), // dilation
+                    (std::move(peek(stack, 6, 15))).toBool(), // transposed
+                    (std::move(peek(stack, 7, 15))).toIntVector(), // output_padding
+                    (std::move(peek(stack, 8, 15))).toInt(), // groups
+                    (std::move(peek(stack, 9, 15))).toBool(), // benchmark
+                    (std::move(peek(stack, 10, 15))).toBool(), // deterministic
+                    (std::move(peek(stack, 11, 15))).toBool(), // cudnn_enabled
+                    (std::move(peek(stack, 12, 15))).toBool(), // allow_tf32
+                    (std::move(peek(stack, 13, 15))).toScalar(), // beta
+                    (std::move(peek(stack, 14, 15))).toScalar()); // threshold
+                drop(stack, 15); // remove the 15 consumed arguments
+                pack(stack, std::move(result)); // push the output tensor
+              };
+            },
+            aliasAnalysisFromSchema()),
+
+        Operator( // registers the fused conv + mish + add op; arguments are read from the stack in schema order
+            "xpu::_convolution_mish_add(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32, Scalar beta, Scalar threshold, Tensor(a!) accumu, *, Scalar alpha) -> Tensor",
+            [](const Node* node) -> Operation {
+              return [](Stack& stack) {
+                at::Tensor input = std::move(peek(stack, 0, 17)).toTensor();
+                auto result = torch::jit::xpu::_convolution_mish_add(
+                    input,
+                    (std::move(peek(stack, 1, 17))).toTensor(), // weight
+                    toOptionalTensor(std::move(peek(stack, 2, 17))), // bias (Tensor?)
+                    (std::move(peek(stack, 3, 17))).toIntVector(), // stride
+                    (std::move(peek(stack, 4, 17))).toIntVector(), // padding
+                    (std::move(peek(stack, 5, 17))).toIntVector(), // dilation
+                    (std::move(peek(stack, 6, 17))).toBool(), // transposed
+                    (std::move(peek(stack, 7, 17))).toIntVector(), // output_padding
+                    (std::move(peek(stack, 8, 17))).toInt(), // groups
+                    (std::move(peek(stack, 9, 17))).toBool(), // benchmark
+                    (std::move(peek(stack, 10, 17))).toBool(), // deterministic
+                    (std::move(peek(stack, 11, 17))).toBool(), // cudnn_enabled
+                    (std::move(peek(stack, 12, 17))).toBool(), // allow_tf32
+                    (std::move(peek(stack, 13, 17))).toScalar(), // beta
+                    (std::move(peek(stack, 14, 17))).toScalar(), // threshold
+                    (std::move(peek(stack, 15, 17))).toTensor(), // accumu (Tensor(a!) in the schema)
+                    (std::move(peek(stack, 16, 17))).toScalar()); // alpha (keyword-only in the schema)
+                drop(stack, 17); // remove the 17 consumed arguments
+                pack(stack, std::move(result)); // push the output tensor
+              };
+            },
+            aliasAnalysisFromSchema()),
+
     });
 } // namespace jit
 } // namespace torch
diff --git a/csrc/oneDNN/Conv.h b/csrc/oneDNN/Conv.h
@@ -537,16 +537,16 @@ static at::Tensor convolution(
       bia_m = dpcpp_onednn_memory(bia_md, engine, bia_.data_ptr());
       xpu::oneDNN::reorder(bia, bia_, reorder_attr);
 
-      // Following is for saving bias correctly.
-      // TODO: Need a general solution for bias caching
-      #ifndef BUILD_JIT_QUANTIZATION_SAVE
-        if (weight_cache_optimization) {
-          strm.wait();
-          // FIXME: thread safty
-          auto bia_opt_ctx = DPCPPTensorContext::release_tensor_ctx(bia_);
-          DPCPPTensorContext::set_tensor_ctx(bia, std::move(bia_opt_ctx));
-        }
-      #endif
+// Following is for saving bias correctly.
+// TODO: Need a general solution for bias caching
+#ifndef BUILD_JIT_QUANTIZATION_SAVE
+      if (weight_cache_optimization) {
+        strm.wait();
+        // FIXME: thread safety
+        auto bia_opt_ctx = DPCPPTensorContext::release_tensor_ctx(bia_);
+        DPCPPTensorContext::set_tensor_ctx(bia, std::move(bia_opt_ctx));
+      }
+#endif
     }
   }
 
diff --git a/tests/gpu/examples/test_fusion.py b/tests/gpu/examples/test_fusion.py