Commit f77fcec

[LLGA] do not rewrite single quant and dequant node (#139)
* [LLGA] do not rewrite single quant/dequant
* [LLGA] update UTs since we don't rewrite single quant/dequant anymore
1 parent 77068a0 commit f77fcec
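In effect, a quantize or dequantize node whose oneDNN Graph partition contains only that single op is now left on the JIT graph as a plain aten op instead of being rewritten into a one-op LLGA fusion group, so the framework's own passes still get a chance to optimize it. A minimal sketch of the observable change (illustrative only, not part of this commit; the module and shapes are made up, and it assumes an IPEX build with the LLGA bridge enabled):

import torch

class SingleQuant(torch.nn.Module):
    def forward(self, x):
        # An isolated quant node: there is no neighboring op for oneDNN
        # Graph to fuse with, so its partition would hold exactly one op.
        return torch.quantize_per_tensor(x, scale=0.1, zero_point=0,
                                         dtype=torch.quint8)

m = torch.jit.trace(SingleQuant().eval(), torch.rand(1, 3, 8, 8))
# After this change the graph keeps aten::quantize_per_tensor rather than
# wrapping the lone op in an LLGA fusion group (LLGA_FUSION_GROUP in the UTs).
print(m.graph)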

File tree: 3 files changed (+68 −53 lines)


tests/cpu/test_jit_llga_quantization_fuser.py

Lines changed: 22 additions & 53 deletions
@@ -76,14 +76,13 @@ def test_conv2d_int8_in_f32_out(self):
                           bias=bias)
             x = torch.rand(1, in_channels * g, spatial, spatial)
             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"]
             ]
             #TODO: enable torch.per_tensor_symmetric case.
             for qscheme in [torch.per_tensor_affine]:
                 graph = self.checkQuantizeTrace(m, [x], x_var=[torch.rand(5, in_channels * g, spatial, spatial, requires_grad=False)], atol=2e-1, config_name="conv2d", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
-                self.assertFused(graph, ['aten::_convolution', 'aten::quantize_per_tensor', 'aten::quantize_per_channel'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+                self.assertFused(graph, ['aten::_convolution', 'aten::quantize_per_channel', 'aten::dequantize'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -93,13 +92,12 @@ def test_linear_int8_in_f32_out(self):
             m = torch.nn.Linear(in_features=28, out_features=64, bias=bias)

             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::linear"],
             ]
             for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
                 graph = self.checkQuantizeTrace(m, [x], atol=1e-1, config_name="linear", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
-                self.assertFused(graph, ['aten::linear', 'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+                self.assertFused(graph, ['aten::linear', 'aten::quantize_per_channel', 'aten::dequantize'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -121,16 +119,14 @@ def forward(self, x, y):
             m = M(bias)

             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::linear", "aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::linear"]
             ]

             for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
                 graph = self.checkQuantizeTrace(m, [x, y], atol=2e-1, config_name="linear_int8", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-                self.assertFused(graph, ['aten::linear',
-                                         'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
+                self.assertFused(graph, ['aten::linear', 'aten::quantize_per_channel', 'aten::dequantize'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -158,14 +154,12 @@ def test_max_pool2d(self):
             x = torch.rand(1, 3, spatial, spatial)

             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::dequantize", "aten::max_pool2d", "aten::quantize_per_tensor"],
-                ["aten::dequantize"]
             ]
             for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
                 graph = self.checkQuantizeTrace(m, [x], atol=1e-1, config_name="max_pool2d", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-                self.assertFused(graph, ['aten::max_pool2d', 'aten::quantize_per_tensor', 'aten::dequantize'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+                self.assertFused(graph, ['aten::max_pool2d'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -212,14 +206,13 @@ def forward(self, x):
             x = torch.rand(1, 32, 28, 28)

             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution", 'aten::' + eltwise, "aten::quantize_per_tensor"], # inplace op will become outplace op on the JIT graph
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"]
             ]
             for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
                 graph = self.checkQuantizeTrace(m, [x], atol=2e-1, config_name="conv2d_eltwise", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-                self.assertFused(graph, ['aten::_convolution', 'aten::' + eltwise, 'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
+                self.assertFused(graph, ['aten::_convolution', 'aten::' + eltwise, 'aten::quantize_per_channel', 'aten::dequantize'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -241,14 +234,13 @@ def forward(self, x):
             # x = torch.rand(1, 32, 28, 28)

             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"]
             ]
             # TODO: add torch.per_tensor_symmetric case.
             for qscheme in [torch.per_tensor_affine]:
                 graph = self.checkQuantizeTrace(m, [x], atol=1e-1, folding=True, config_name="conv2d_bn", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
-                self.assertFused(graph, ['aten::_convolution', 'aten::quantize_per_tensor', 'aten::quantize_per_channel'])
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+                self.assertFused(graph, ['aten::_convolution', 'aten::quantize_per_channel', 'aten::dequantize'])
                 self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -268,15 +260,12 @@ def forward(self, x):
         m = M().eval()
         x = torch.rand(1, 32, 28, 28)
         patterns = [
-            ["aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution", "aten::relu", "aten::quantize_per_tensor"],
-            ["aten::dequantize"]
         ]
         for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
             graph = self.checkQuantizeTrace(m, [x], atol=1e-1, folding=True, config_name="conv2d_bn_relu", qscheme=qscheme)
-            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-            self.assertFused(graph, ['aten::_convolution', 'aten::relu',
-                                     'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+            self.assertFused(graph, ['aten::_convolution', 'aten::relu', 'aten::quantize_per_channel'])
             self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -305,13 +294,11 @@ def forward(self, x):
             m = M(eltwise_fn, has_bias)
             x = torch.rand(32, 28, requires_grad=False)
             patterns = [
-                ["aten::quantize_per_tensor"],
                 ["aten::quantize_per_channel", "aten::dequantize", "aten::linear", "aten::" + eltwise, "aten::quantize_per_tensor"],
-                ["aten::dequantize"]
             ]
             for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
                 graph = self.checkQuantizeTrace(m, [x], x_var=[torch.rand(2, 28, requires_grad=False)], atol=1e-1, config_name="linear_eltwise", qscheme=qscheme)
-                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+                self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
                 self.assertFused(graph, ['aten::' + eltwise])
                 self.checkPatterns(graph, patterns)

@@ -343,15 +330,14 @@ def forward(self, x, y):
         x = torch.rand(1, 32, 16, 16, requires_grad=False)
         y = torch.rand(1, 32, 16, 16, requires_grad=False)
         patterns = [
-            ["aten::quantize_per_tensor"],
-            ["aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution", "aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution", "aten::relu", "aten::add", "aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"]
         ]
         for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
             graph = self.checkQuantizeTrace(m, [x, y], folding=True, atol=1e-1, config_name="conv2d_sum", qscheme=qscheme)
-            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 5)
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+            self.assertFused(graph, ['aten::_convolution', 'aten::relu', 'aten::add', 'aten::quantize_per_channel', 'aten::dequantize'])
             self.checkPatterns(graph, patterns)

     @llga_test_env
@@ -373,29 +359,15 @@ def forward(self, x, y):
         y = torch.randn(2, 20)
         m = M()
         patterns = [
-            ["aten::quantize_per_tensor"],
-            ["aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::linear", "aten::add", "aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::linear"]
         ]
         for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
             graph = self.checkQuantizeTrace(m, [x, y], atol=2e-1, remove_dropout=True, config_name="linear_dropout_sum", qscheme=qscheme)
-            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 4)
-            self.assertFused(graph, ['aten::linear', 'aten::add',
-                                     'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
+            self.assertFused(graph, ['aten::linear', 'aten::add', 'aten::quantize_per_channel', 'aten::dequantize'])
             self.checkPatterns(graph, patterns)

-        # TODO: check patterns when oneDNN support sum post_ops with zps
-        # patterns = [
-        #     ["aten::quantize_per_tensor"],
-        #     ["aten::quantize_per_channel"],
-        #     ["aten::dequantize", "aten::linear", "aten::add", "aten::quantize_per_tensor"],
-        #     ["aten::quantize_per_channel"],
-        #     ["aten::dequantize", "aten::linear", "aten::quantize_per_tensor"],
-        #     ["aten::dequantize"]
-        # ]
-        # self.checkPatterns(graph, patterns)
-
     @llga_test_env
     def test_defer_size(self):
         class M(nn.Module):
@@ -415,14 +387,13 @@ def forward(self, x):
         m = M()
         x = torch.rand(1, 32, 28, 28)
         patterns = [
-            ["aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution", 'aten::relu', "aten::quantize_per_tensor"],
             ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"]
         ]
         for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
             graph = self.checkQuantizeTrace(m, [x], atol=2e-1, config_name="defer_size", qscheme=qscheme)
-            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-            self.assertFused(graph, ['aten::_convolution', 'aten::relu', 'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
+            self.assertFused(graph, ['aten::_convolution', 'aten::relu', 'aten::quantize_per_channel', 'aten::dequantize'])
             self.checkPatterns(graph, patterns)

 class TestShapeFallback(JitLlgaTestCase):
@@ -486,9 +457,7 @@ def _test_vision(self, model_name):

         # TODO: aten::adaptive_avg_pool2d also need to be fused once backend supported it
         self.assertFused(graph, ['aten::_convolution', 'aten::relu',
-                                 'aten::max_pool2d', 'aten::linear'
-                                 'aten::quantize_per_tensor', 'aten::quantize_per_channel',
-                                 'aten::dequantize'])
+                                 'aten::max_pool2d', 'aten::linear', 'aten::quantize_per_channel'])


 for model_name, enabled in [

torch_ipex/csrc/jit/codegen/onednn/graph_helper.cpp

Lines changed: 44 additions & 0 deletions
@@ -488,6 +488,41 @@ bool isViewOp(Node* n) {
   return false;
 }

+void checkAndRemoveAttr(Node *n, std::string attr) {
+  TORCH_CHECK(n->hasAttributeS(attr),
+              "dequant node with numAttributes != 0 must have attr: ", attr);
+  n->removeAttributeS(attr);
+}
+
+void removeAttrOfDequant(Node *n) {
+  if (n->kind() == Symbol::aten("dequantize")) {
+    if (n->numAttributes() == 0)
+      return;
+    std::vector<std::string> common_attrs{"zps", "scales", "in_type"};
+    for (const auto &attr : common_attrs) {
+      checkAndRemoveAttr(n, attr);
+    }
+
+    if (n->s(Symbol::attr("qtype")) == std::string("per_channel")) {
+      checkAndRemoveAttr(n, std::string("axis"));
+    }
+    checkAndRemoveAttr(n, std::string("qtype"));
+  }
+}
+
+bool LlgaGraphHelper::isSingleQuantDequant(Node *n) {
+  if (n->kind() != Symbol::aten("quantize_per_tensor") &&
+      n->kind() != Symbol::aten("quantize_per_channel") &&
+      n->kind() != Symbol::aten("dequantize"))
+    return false;
+  if (!opToOwningPartition_.has(n))
+    return false;
+
+  auto partitionId = opToOwningPartition_.get(n);
+  auto OpNum = partitions_[partitionId].get_ops_num();
+  return OpNum == 1;
+}
+
 bool LlgaGraphHelper::shouldConsiderForMerge(Node* node) {
   // if we're already in the process of merging
   if (isLlgaSubgraph(node)) {
@@ -496,6 +531,15 @@ bool LlgaGraphHelper::shouldConsiderForMerge(Node* node) {
   if (isViewOp(node)) {
     return false;
   }
+  // For a partition composed of 1 single quant or 1 single dequant,
+  // do not rewrite it in the bridge, so that the FWK may have chances
+  // to optimize single int8 op that LLGA does not support
+  if (isSingleQuantDequant(node)) {
+    // We have added attr on dequant node to create LLGA dequant op.
+    // If we won't rewrite it with LLGA op, remove the attr here.
+    removeAttrOfDequant(node);
+    return false;
+  }
   return opToOwningPartition_.has(node);
 }

torch_ipex/csrc/jit/codegen/onednn/graph_helper.h

Lines changed: 2 additions & 0 deletions
@@ -63,6 +63,8 @@ class LlgaGraphHelper {
  private:
   size_t countSupportedOps(const std::shared_ptr<Graph>& graph) const;

+  bool isSingleQuantDequant(Node *node);
+
   OpPartitionMap opToOwningPartition_;
   std::vector<dnnl::graph::partition> partitions_;
 };
