Commit 5d14f80

Replace at::layer_norm with ipex::layernorm (#129)
* Replace at::layer_norm with ipex::layernorm
* Add torch_ipex/csrc/cpu/LayerNorm.h and torch_ipex/csrc/cpu/LayerNorm.cpp
* Add comment
* Fix conflict build issue
* Fix clang-format issue
* Update CustomOPs.cpp: add the reason for the layer_norm performance regression and the condition for removing the layer_norm workaround
* Update CustomOPs.cpp
* Fix clang issue
1 parent c4bc4f4 commit 5d14f80

9 files changed: +238 −4 lines changed


tests/cpu/test_jit.py

Lines changed: 19 additions & 0 deletions
@@ -436,6 +436,14 @@ def __init__(self, dim=-1):
     def forward(self, x):
         return self.softmax(x)

+class IPEXLayerNorm(torch.nn.Module):
+    def __init__(self):
+        super(IPEXLayerNorm, self).__init__()
+        self.layernorm = torch.nn.LayerNorm(4)
+    def forward(self, x):
+        return self.layernorm(x)
+
+

 class Tester(TestCase):

@@ -947,6 +955,17 @@ def test_ipex_softmax(self):
             torch.rand(3, 4, 4, dtype=torch.bfloat16),
             kind_in_graph="ipex::softmax",
             prec=5e-3)
+    def test_ipex_layernorm(self):
+        self._test_output(
+            IPEXLayerNorm(),
+            torch.rand(8, 3, 4),
+            kind_in_graph="ipex::layernorm")
+        self._test_output_bf16(
+            IPEXLayerNorm(),
+            torch.rand(8, 3, 4, dtype=torch.bfloat16),
+            kind_in_graph="ipex::layernorm",
+            prec=5e-2)
+

 if __name__ == '__main__':
     torch.manual_seed(2020)
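
In effect, the new test asserts that a traced IPEXLayerNorm module ends up with an ipex::layernorm node in its optimized graph. A rough standalone equivalent of that check, as a sketch only: the helper name below is hypothetical, the real semantics of _test_output live in the test harness, and the ipex::layernorm node only appears when the IPEX extension is loaded so its fusion pass actually runs.

import torch

def contains_node_kind(model, example_input, kind):
    # Trace the module, run it once so the JIT profiling/optimization passes
    # (including the IPEX fusion pass, when the extension is loaded) kick in,
    # then scan the optimized graph for a node of the requested kind.
    traced = torch.jit.trace(model.eval(), example_input)
    with torch.no_grad():
        traced(example_input)
    graph = traced.graph_for(example_input)
    return any(node.kind() == kind for node in graph.nodes())

# e.g. contains_node_kind(IPEXLayerNorm(), torch.rand(8, 3, 4), "ipex::layernorm")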

torch_ipex/csrc/cpu/CustomOPs.cpp

Lines changed: 113 additions & 2 deletions
@@ -1,10 +1,11 @@
 #include "torch_ipex/csrc/cpu/CustomOPs.h"
-#include "torch_ipex/csrc/utils.h"
 #include "Conv.h"
+#include "LayerNorm.h"
 #include "Linear.h"
-#include "Pooling.h"
 #include "Matmul.h"
+#include "Pooling.h"
 #include "Softmax.h"
+#include "torch_ipex/csrc/utils.h"

 #include <ATen/Context.h>
 #include <ATen/InferSize.h>
@@ -357,5 +358,115 @@ at::Tensor AtenIpexJITDev::dil_softmax(
   return softmax_impl(input, dim);
 }

+/**
+ * Prepare inputs for dil_layernorm.
+ *
+ * @param input: the source tensor for layernorm
+ * @param normalized_shape: input shape from an expected input of size
+ * @param weight: scale tensor for layernorm
+ * @param bias: shift tensor for layernorm
+ *
+ * @return inputs for dil_layernorm.
+ **/
+std::tuple<at::Tensor, at::Tensor, at::Tensor, int64_t, int64_t>
+_prepare_layer_norm_inputs(const at::Tensor &input,
+                           at::IntArrayRef normalized_shape,
+                           const at::Tensor &weight /* optional */,
+                           const at::Tensor &bias /* optional */) {
+
+  const int normalized_ndim = normalized_shape.size();
+  TORCH_CHECK(normalized_ndim >= 1,
+              "Expected normalized_shape to be at least 1-dimensional, i.e., ",
+              "containing at least one element, but got normalized_shape = ",
+              normalized_shape);
+  TORCH_CHECK(
+      !weight.defined() || weight.sizes().equals(normalized_shape),
+      "Expected weight to be of same shape as normalized_shape, but got ",
+      "weight of shape ", weight.sizes(),
+      " and normalized_shape = ", normalized_shape);
+  TORCH_CHECK(!bias.defined() || bias.sizes().equals(normalized_shape),
+              "Expected bias to be of same shape as normalized_shape, but got ",
+              "bias of shape ", bias.sizes(),
+              " and normalized_shape = ", normalized_shape);
+
+  const auto input_shape = input.sizes();
+  const auto input_ndim = input.dim();
+
+  if (input_ndim < normalized_ndim ||
+      !input_shape.slice(input_ndim - normalized_ndim)
+           .equals(normalized_shape)) {
+    std::stringstream ss;
+    ss << "Given normalized_shape=" << normalized_shape
+       << ", expected input with shape [*";
+    for (auto size : normalized_shape) {
+      ss << ", " << size;
+    }
+    ss << "], but got input of size" << input_shape;
+    AT_ERROR(ss.str());
+  }
+
+  // M: product of the leading (non-normalized) dims; N: product of the
+  // normalized dims.
+  const int axis = input_ndim - normalized_ndim;
+  const int64_t M =
+      std::accumulate(input_shape.cbegin(), input_shape.cbegin() + axis,
+                      static_cast<int64_t>(1), std::multiplies<int64_t>());
+  const int64_t N =
+      std::accumulate(input_shape.cbegin() + axis, input_shape.cend(),
+                      static_cast<int64_t>(1), std::multiplies<int64_t>());
+
+  const auto &X = input.is_contiguous() ? input : input.contiguous();
+  const auto &gamma = weight.is_contiguous() ? weight : weight.contiguous();
+  const auto &beta = bias.is_contiguous() ? bias : bias.contiguous();
+  return std::make_tuple(X, gamma, beta, M, N);
+}
+
+/**
+ * at::layer_norm performance dropped due to
+ * PR https://github.com/pytorch/pytorch/pull/59987.
+ * This is a workaround for the layernorm regression:
+ * replace at::layer_norm with ipex::layernorm in the JIT pass for inference.
+ * For now, we only use the oneDNN kernel when both weight and bias are
+ * provided.
+ * TODO: cover more scenarios with oneDNN, or remove this pass once
+ * at::layer_norm performance is back to where it was before
+ * https://github.com/pytorch/pytorch/pull/59987 was merged.
+ *
+ * @param input: the source tensor for layernorm
+ * @param normalized_shape: input shape from an expected input of size
+ * @param weight_opt: scale tensor for layernorm
+ * @param bias_opt: shift tensor for layernorm
+ * @param eps: a value added to the denominator for numerical stability.
+ *             Default: 1e-5
+ *
+ * @return output of layernorm
+ */
+at::Tensor AtenIpexJITDev::dil_layernorm(
+    const at::Tensor &input, at::IntArrayRef normalized_shape,
+    const c10::optional<at::Tensor> &weight_opt,
+    const c10::optional<at::Tensor> &bias_opt, float eps, bool cudnn_enable) {
+
+  if (weight_opt.has_value() && bias_opt.has_value()) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("AtenIpexJITDev::dil_layernorm",
+                    std::vector<c10::IValue>({}));
+#endif
+    auto inputs = _prepare_layer_norm_inputs(
+        input, normalized_shape, weight_opt.value(), bias_opt.value());
+    auto X = std::get<0>(inputs);
+    auto gamma = std::get<1>(inputs);
+    auto beta = std::get<2>(inputs);
+    auto M = std::get<3>(inputs);
+    auto N = std::get<4>(inputs);
+    return std::get<0>(dil_native_layer_norm_impl(X, gamma, beta, M, N, eps));
+  }
+  // Fall back to the stock kernel when weight or bias is missing.
+  c10::MaybeOwned<at::Tensor> weight_maybe_owned =
+      at::borrow_from_optional_tensor(weight_opt);
+  const at::Tensor &weight = *weight_maybe_owned;
+  c10::MaybeOwned<at::Tensor> bias_maybe_owned =
+      at::borrow_from_optional_tensor(bias_opt);
+  const at::Tensor &bias = *bias_maybe_owned;
+  return std::get<0>(
+      at::native_layer_norm(input, normalized_shape, weight, bias, eps));
+}
+
 } // namespace cpu
 } // namespace torch_ipex
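
To make the M/N split concrete, here is a small Python sketch (illustrative only, not part of the commit) that mirrors what _prepare_layer_norm_inputs computes for the shapes used in the new test:

import math

def split_outer_inner(input_shape, normalized_shape):
    # M = product of the leading (non-normalized) dims,
    # N = product of the normalized (trailing) dims.
    axis = len(input_shape) - len(normalized_shape)
    assert list(input_shape[axis:]) == list(normalized_shape)
    M = math.prod(input_shape[:axis])
    N = math.prod(input_shape[axis:])
    return M, N

# Input (8, 3, 4) with LayerNorm(4), i.e. normalized_shape = (4,):
print(split_outer_inner((8, 3, 4), (4,)))  # (24, 4): 24 rows, each normalized over 4 elements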

torch_ipex/csrc/cpu/CustomOPs.h

Lines changed: 6 additions & 0 deletions
@@ -31,6 +31,7 @@ namespace ipex {

 static auto max_pool2d = Symbol::fromQualString("ipex::max_pool2d");
 static auto softmax = Symbol::fromQualString("ipex::softmax");
+static auto layernorm = Symbol::fromQualString("ipex::layernorm");

 // n-dims tensor op.
 static auto convolution_nd_weight_base =
@@ -186,6 +187,11 @@ class AtenIpexJITDev {
       at::IntArrayRef kernel_size, int64_t groups, int64_t output_channel,
       bool weight_channels_last, bool weight_prepacked, at::Tensor &accumu,
       at::Scalar alpha);
+  static at::Tensor dil_layernorm(const at::Tensor &input,
+                                  at::IntArrayRef normalized_shape,
+                                  const c10::optional<at::Tensor> &weight_opt,
+                                  const c10::optional<at::Tensor> &bias_opt,
+                                  float eps, bool cudnn_enable);
 };

 } // namespace cpu

torch_ipex/csrc/cpu/LayerNorm.cpp

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+#include "LayerNorm.h"
+#include "mkldnn/MKLDNNCommon.h"
+
+namespace torch_ipex {
+namespace cpu {
+
+/**
+ * layer_norm kernel for inference mode with the oneDNN implementation
+ *
+ * @param X: input tensor for layernorm
+ * @param gamma: scale for layernorm
+ * @param beta: shift for layernorm
+ * @param M: product of the leading (non-normalized) dims of X
+ * @param N: product of the normalized dims of X
+ * @param eps: a value added to the denominator for numerical stability
+ **/
+std::tuple<at::Tensor, at::Tensor, at::Tensor> dil_native_layer_norm_impl(
+    const at::Tensor &X, const at::Tensor &gamma /* optional */,
+    const at::Tensor &beta /* optional */, int64_t M, int64_t N, double eps) {
+  ideep::tensor x = itensor_view_from_dense(X);
+  auto gamma_fp32 = gamma.to(at::kFloat);
+  auto beta_fp32 = beta.to(at::kFloat);
+  const ideep::tensor scale = itensor_view_from_dense(gamma_fp32);
+  const ideep::tensor shift = itensor_view_from_dense(beta_fp32);
+  // Keep the leading dims (whose product is M) and collapse the normalized
+  // dims into a single trailing dim of size N, so oneDNN normalizes over the
+  // last axis.
+  int64_t i = 0;
+  auto dim = at::maybe_wrap_dim(0, X.dim(), false);
+  auto j = X.sizes()[dim];
+  std::vector<int64_t> input_size;
+  while (j <= M) {
+    dim = at::maybe_wrap_dim(i++, X.dim(), false);
+    input_size.push_back(X.sizes()[dim]);
+    dim = at::maybe_wrap_dim(i, X.dim(), false);
+    j *= X.sizes()[dim];
+  }
+  input_size.push_back(N);
+  auto src = x.reshape(input_size);
+  at::Tensor Y = at::native::empty_like(X);
+  at::Tensor mean = at::empty({M}, X.options());
+  at::Tensor variance = at::empty({M}, X.options());
+  auto onednn_Y = itensor_view_from_dense(Y);
+  auto onednn_mean = itensor_view_from_dense(mean);
+  auto onednn_variance = itensor_view_from_dense(variance);
+  ideep::layer_normalization_forward::compute(
+      src, scale, shift, onednn_Y, onednn_mean, onednn_variance, eps);
+  return std::make_tuple(Y, mean, variance);
+}
+
+} // namespace cpu
+} // namespace torch_ipex
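
For reference, the per-row computation the oneDNN kernel performs (mean and variance over the last N elements, then scale and shift) can be cross-checked with a plain NumPy sketch; this is illustrative only and not part of the commit:

import numpy as np

def layer_norm_reference(x, gamma, beta, N, eps=1e-5):
    # Flatten to (M, N), normalize each row, then apply scale and shift.
    rows = x.reshape(-1, N)
    mean = rows.mean(axis=1, keepdims=True)
    var = rows.var(axis=1, keepdims=True)  # biased variance, as layer norm uses
    y = (rows - mean) / np.sqrt(var + eps)
    y = y * gamma.reshape(-1) + beta.reshape(-1)
    return y.reshape(x.shape), mean.ravel(), var.ravel()

# e.g. x of shape (8, 3, 4) with gamma, beta of shape (4,) and N = 4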

torch_ipex/csrc/cpu/LayerNorm.h

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <ATen/Tensor.h>
+
+#include "ideep/ideep.hpp"
+
+namespace torch_ipex {
+namespace cpu {
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> dil_native_layer_norm_impl(
+    const at::Tensor &X, const at::Tensor &gamma /* optional */,
+    const at::Tensor &beta /* optional */, int64_t M, int64_t N, double eps);
+} // namespace cpu
+} // namespace torch_ipex

torch_ipex/csrc/jit/fusion_pass.cpp

Lines changed: 1 addition & 1 deletion
@@ -345,7 +345,7 @@ void FusionPass(std::shared_ptr<Graph> &graph) {

   // replace aten::softmax with ipex::softmax
   graph_rewrite::replaceAtenLinearWithIpexSoftmax(graph);
-
+  graph_rewrite::replaceAtenLayerNormWithIpexLayerNorm(graph);
   // TODO: Some post processing?? ECS/EDC/Peephole???
   ConstantPropagation(graph);
 }

torch_ipex/csrc/jit/graph_rewrite.cpp

Lines changed: 16 additions & 0 deletions
@@ -551,6 +551,22 @@ void replaceAtenLinearWithIpexSoftmax(std::shared_ptr<Graph>& graph) {
   rewriter_aten.runOnGraph(graph);

 }
+// replace aten::layer_norm with ipex::layernorm during the jit pass;
+// this is just a workaround for the layernorm performance regression
+void replaceAtenLayerNormWithIpexLayerNorm(std::shared_ptr<Graph> &graph) {
+  std::string aten_layernorm = R"(
+    graph(%a, %shape:int[], %w, %b, %eps:float, %cudnn_enable:bool):
+      %r = aten::layer_norm(%a, %shape, %w, %b, %eps, %cudnn_enable)
+      return (%r) )";
+  std::string ipex_layernorm = R"(
+    graph(%a, %shape:int[], %w, %b, %eps:float, %cudnn_enable:bool):
+      %r = ipex::layernorm(%a, %shape, %w, %b, %eps, %cudnn_enable)
+      return (%r) )";
+  SubgraphRewriter rewriter_aten;
+  rewriter_aten.RegisterRewritePattern(aten_layernorm, ipex_layernorm);
+  rewriter_aten.runOnGraph(graph);
+}
+
 } // namespace graph_rewrite
 } // namespace jit
 } // namespace torch
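
To see the node this pattern matches, one can print the TorchScript graph of a plain LayerNorm module; the traced graph contains an aten::layer_norm call with the six inputs the pattern binds, which the SubgraphRewriter then swaps for ipex::layernorm. A sketch follows; the exact printed value names vary by PyTorch version.

import torch

traced = torch.jit.trace(torch.nn.LayerNorm(4), torch.rand(8, 3, 4))
# Expect a line along the lines of:
#   %y = aten::layer_norm(%x, %normalized_shape, %weight, %bias, %eps, %cudnn_enable)
print(traced.graph)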

torch_ipex/csrc/jit/graph_rewrite.h

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ void FuseShuffle(std::shared_ptr<Graph>& graph);
 void replaceAtenMaxPool2dWithIpexMaxPool2d(std::shared_ptr<Graph>& graph);
 void replaceAtenLinearWithIpexLinear(std::shared_ptr<Graph>& graph);
 void replaceAtenLinearWithIpexSoftmax(std::shared_ptr<Graph>& graph);
+void replaceAtenLayerNormWithIpexLayerNorm(std::shared_ptr<Graph> &graph);
 } // namespace graph_rewrite_helper
 } // namespace jit
 } // namespace torch

torch_ipex/csrc/jit/register_dnnl_jit_ops.cpp

Lines changed: 20 additions & 1 deletion
@@ -400,9 +400,28 @@ RegisterOperators op(
             return 0;
           };
         },
+        aliasAnalysisFromSchema()),
+
+    Operator(
+        "ipex::layernorm(Tensor a, int[] normalized_shape, Tensor ? "
+        "weight_opt, Tensor ? bias_opt, float eps, bool cudnn_enable) -> "
+        "Tensor",
+        [](const Node *node) -> Operation {
+          return [](Stack *stack) {
+            auto result = AtenIpexJITDev::dil_layernorm(
+                (std::move(peek(stack, 0, 6))).toTensor(),
+                (std::move(peek(stack, 1, 6))).toIntVector(),
+                toOptionalTensor(std::move(peek(stack, 2, 6))),
+                toOptionalTensor(std::move(peek(stack, 3, 6))),
+                (std::move(peek(stack, 4, 6))).toDouble(),
+                (std::move(peek(stack, 5, 6))).toBool());
+            drop(stack, 6);
+            pack(stack, std::move(result));
+            return 0;
+          };
+        },
         aliasAnalysisFromSchema())

     });
-
 } // namespace jit
 } // namespace torch
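
Once the extension's shared library is loaded and this registration has run, the operator should also be reachable from Python through the JIT operator registry. The following is an unverified sketch; the extension import name is an assumption, and the arguments follow the schema above:

import torch
# import intel_pytorch_extension  # assumed: loading the extension registers ipex::layernorm

x = torch.rand(8, 3, 4)
weight = torch.ones(4)
bias = torch.zeros(4)
# Schema: ipex::layernorm(Tensor a, int[] normalized_shape, Tensor? weight_opt,
#                         Tensor? bias_opt, float eps, bool cudnn_enable) -> Tensor
y = torch.ops.ipex.layernorm(x, [4], weight, bias, 1e-5, False)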
