
Commit 07b9ae7

enable quantize AdaptiveAvgPool2d and flatten fusion group in ipex side (#140)

* enable quantize AdaptiveAvgPool2d and flatten fusion group in ipex side
* add test case
* record flatten data flow for ipex int8 fusion path
* change code format
* refine code
* fix undefined symbol error when debug build
1 parent 40a248a commit 07b9ae7

11 files changed (+269 -44 lines)

intel_pytorch_extension_py/conf.py

Lines changed: 23 additions & 2 deletions
@@ -42,6 +42,7 @@ def save(self, configure_file, default_recipe=True):
     def get_default_recipe(self, configures):
         elt_wise = ['relu', 'sigmoid', 'gelu']
         inplace_ops = ['relu_', 'add_']
+        shape_ops = ['flatten']
         # get default recipe,
         # q+dq+conv+q+dq+relu => q+dq+conv+relu
         # q+dq+op1+q+dq+q+dq+op2+q+dq => q+dq+op1+q+dq+op2+q+dq
@@ -75,6 +76,19 @@ def get_default_recipe(self, configures):
                     default_configures[cur_id]['inputs_quantized'][i_num] = False
                     if cur_op == 'add':
                         pre_ops[i_num] = pre_op
+                    if cur_op in shape_ops:
+                        # For the pooling case, the input and output always have the same scale and
+                        # zero point; if the pooling's post op is flatten, sync flatten's input and
+                        # output scale and zero point to the pooling's.
+                        if pre_op in ['max_pool2d', 'adaptive_avg_pool2d']:
+                            default_configures[cur_id]['input_scales'][i_num] = default_configures[pre_id]['output_scales'][o_num]
+                            default_configures[cur_id]['input_zero_points'][i_num] = default_configures[pre_id]['output_zero_points'][o_num]
+                            default_configures[cur_id]['output_scales'][i_num] = default_configures[pre_id]['output_scales'][o_num]
+                            default_configures[cur_id]['output_zero_points'][i_num] = default_configures[pre_id]['output_zero_points'][o_num]
+                    if pre_op in shape_ops:
+                        # If the pre op is flatten, sync the input's scale and zero point to flatten's.
+                        default_configures[cur_id]['input_scales'][i_num] = default_configures[pre_id]['output_scales'][o_num]
+                        default_configures[cur_id]['input_zero_points'][i_num] = default_configures[pre_id]['output_zero_points'][o_num]
                 # conv   op        conv   op
                 #    \   /            \   /
                 #     q   q            \  q
@@ -98,10 +112,17 @@ def get_default_recipe(self, configures):
         # post process for add, linear: if the cur op has no post quantized op, i.e. 'outputs_quantized' is True,
         # the default recipe for good performance is:
         # int8_input -> op -> q -> dq will be converted to int8_input -> op.
-        post_process_ops = ['add', 'linear', 'conv2d']
+        ops_remove_q_dq_after = ['add', 'linear', 'conv2d']
+        # post process for flatten: if flatten's pre op and post op are fp32 ops, there is no need
+        # to add q and dq before and after it.
+        ops_remove_q_dq_before_after = ['flatten']
         for cur_id in range(num_ops):
             cur_op = default_configures[cur_id]['name']
-            if cur_op in post_process_ops and default_configures[cur_id]['outputs_quantized'][0]:
+            if cur_op in ops_remove_q_dq_after and default_configures[cur_id]['outputs_quantized'][0]:
+                default_configures[cur_id]['outputs_quantized'][0] = False
+            if cur_op in ops_remove_q_dq_before_after and default_configures[cur_id]['inputs_quantized'][0] \
+                    and default_configures[cur_id]['outputs_quantized'][0]:
+                default_configures[cur_id]['inputs_quantized'][0] = False
                 default_configures[cur_id]['outputs_quantized'][0] = False

         return default_configures
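
To make the effect of these recipe rules concrete, here is a minimal, illustrative Python sketch (not the real implementation). It operates on a hypothetical, simplified list of per-op config dicts standing in for default_configures, with single-input/single-output fields.

# Illustrative sketch only -- a stripped-down stand-in for default_configures,
# not the real IPEX data structure.
def apply_flatten_recipe(configs):
    pool_ops = ['max_pool2d', 'adaptive_avg_pool2d']
    # Rule 1: a flatten fed by a pooling op reuses the pool's output scale and zero point,
    # because pooling keeps input and output quantization parameters identical.
    for cfg in configs:
        pre = cfg.get('pre_op')          # hypothetical reference to the producer's config
        if cfg['name'] == 'flatten' and pre is not None and pre['name'] in pool_ops:
            cfg['input_scale'] = cfg['output_scale'] = pre['output_scale']
            cfg['input_zero_point'] = cfg['output_zero_point'] = pre['output_zero_point']
    # Rule 2: a flatten sitting between fp32 ops does not need q/dq inserted
    # around it, so both quantization flags are cleared.
    for cfg in configs:
        if cfg['name'] == 'flatten' and cfg['input_quantized'] and cfg['output_quantized']:
            cfg['input_quantized'] = False
            cfg['output_quantized'] = False
    return configs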
Lines changed: 85 additions & 0 deletions (new file)
@@ -0,0 +1,85 @@
+import unittest
+import itertools
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.testing import FileCheck
+
+from test_jit_llga_utils import JitLlgaTestCase, run_tests, LLGA_FUSION_GROUP, llga_test_env
+
+import intel_pytorch_extension as ipex
+
+
+class TestIpexOps(JitLlgaTestCase):
+    @llga_test_env
+    def test_adaptive_avg_pool2d(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.adaptive_avg_pool2d = nn.AdaptiveAvgPool2d((5,7))
+
+            def forward(self, x):
+                x = self.adaptive_avg_pool2d(x)
+                return x
+
+        m = M()
+        x = torch.rand(1, 32, 28, 28)
+        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
+            graph = self.checkQuantizeTrace(m, [x], atol=2e-1, config_name="adaptive_avg_pool2d", qscheme=qscheme)
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0)
+
+    @llga_test_env
+    def test_flatten_int8(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.conv1 = nn.Conv2d(3, 3, 2, padding=1, bias=True)
+                self.pool = nn.MaxPool2d(2)
+                self.flatten = nn.Flatten(1)
+                self.linear = nn.Linear(147, 32)
+
+            def forward(self, x):
+                x = self.conv1(x)
+                x = self.pool(x)
+                x = self.flatten(x)
+                x = self.linear(x)
+                return x
+
+        m = M()
+        x = torch.rand(1, 3, 14, 14)
+        patterns = [
+            ["aten::quantize_per_channel", "aten::dequantize", "aten::_convolution"],
+            ["aten::dequantize", "aten::max_pool2d", "aten::quantize_per_tensor"],
+            ["aten::quantize_per_channel", "aten::dequantize", "aten::linear"],
+        ]
+        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
+            graph = self.checkQuantizeTrace(m, [x], atol=2e-1, config_name="flatten", qscheme=qscheme)
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+            self.checkPatterns(graph, patterns)
+
+    @llga_test_env
+    def test_flatten_fp32(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.flatten = nn.Flatten(1)
+
+            def forward(self, x):
+                x = self.flatten(x)
+                return x
+
+        m = M()
+        x = torch.rand(1, 3, 14, 14)
+        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
+            graph = self.checkQuantizeTrace(m, [x], config_name="flatten", qscheme=qscheme)
+            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 0)
+            FileCheck().check_not("aten::quantize_per_tensor") \
+                .check_not("at::dequantize") \
+                .check("aten::flatten") \
+                .run(graph)
+
+
+if __name__ == '__main__':
+    run_tests()
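
The behaviour test_flatten_int8 leans on can also be seen in plain eager mode: flatten is a pure shape op on a quantized tensor, so it preserves the producer's scale and zero point. The snippet below is only an illustration with arbitrary example quantization parameters; it is not part of the test suite.

import torch
import torch.nn.functional as F

x = torch.rand(1, 3, 14, 14)
scale, zero_point = 0.05, 64                    # arbitrary example parameters
x_q = torch.quantize_per_tensor(x, scale, zero_point, torch.quint8)

pooled_q = F.max_pool2d(x_q, 2)                 # int8 in, int8 out, same scale/zero_point
flat_q = torch.flatten(pooled_q, 1)             # shape op: quantization parameters unchanged

assert flat_q.is_quantized
assert flat_q.q_scale() == pooled_q.q_scale()
assert flat_q.q_zero_point() == pooled_q.q_zero_point()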

tests/cpu/test_jit_llga_quantization_fuser.py

Lines changed: 3 additions & 39 deletions
@@ -1,11 +1,9 @@
 import unittest
 import itertools
-from functools import wraps
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from test_jit_llga_utils import JitLlgaTestCase, run_tests, LLGA_FUSION_GROUP
+from test_jit_llga_utils import JitLlgaTestCase, run_tests, LLGA_FUSION_GROUP, llga_test_env
 from torch.testing._internal.common_utils import TEST_SCIPY

 import intel_pytorch_extension as ipex
@@ -27,21 +25,6 @@ def get_eltwise_fn(name):
     else:
         raise NameError('Eltwise function %s not found' % name)

-# For LLGA UT, disable the PyTorch profiling executor and the IPEX JIT opt
-def llga_test_env(func):
-    @wraps(func)
-    def wrapTheFunction(*args):
-        # make sure that the profiling mode is turned on
-        torch._C._jit_set_profiling_mode(True)
-        torch._C._jit_set_profiling_executor(True)
-
-        ipex.core._jit_set_llga_enabled(True)
-        ipex.core.disable_jit_opt()
-        func(*args)
-        ipex.core.enable_jit_opt()
-        ipex.core._jit_set_llga_enabled(False)
-    return wrapTheFunction
-
 class TestOp(JitLlgaTestCase):
     @llga_test_env
     def test_conv2d_int8_in_f32_out(self):
@@ -162,25 +145,6 @@ def test_max_pool2d(self):
         self.assertFused(graph, ['aten::max_pool2d'])
         self.checkPatterns(graph, patterns)

-    @llga_test_env
-    @unittest.skipIf(True, 'int8 adaptive_avg_pool2d is not supported in the backend')
-    def test_adaptive_avg_pool2d(self):
-        m = nn.AdaptiveAvgPool2d((1, 1))
-        N = torch.randint(3, 10, (1,)).item()
-        C = torch.randint(3, 10, (1,)).item()
-        x = torch.randn(N, C, 224, 224, dtype=torch.float32) * 100
-
-        patterns = [
-            ["aten::quantize_per_tensor"],
-            ["aten::dequantize", "aten::adaptive_avg_pool2d", "aten::quantize_per_tensor"],
-            ["aten::dequantize"]
-        ]
-        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
-            graph = self.checkQuantizeTrace(m, [x], atol=1e-1, config_name="adaptive_avg_pool2d", qscheme=qscheme)
-            self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
-            self.assertFused(graph, ['aten::adaptive_avg_pool2d', 'aten::quantize_per_tensor', 'aten::dequantize'])
-            self.checkPatterns(graph, patterns)
-
 class TestFusionPattern(JitLlgaTestCase):
     @llga_test_env
     def test_conv2d_eltwise(self):
@@ -408,7 +372,7 @@ def forward(self, x):
                 new_x_shape = x.size()[:-1] + (3, 5)
                 x = x.view(*new_x_shape)
                 return x.permute(0, 2, 1, 3)
-
+
         x = torch.randn(5, 10, 15)
         m = M()

@@ -434,7 +398,7 @@ def forward(self, x):
             x = self.conv1(x)
            x = self.conv2(x).reshape(x.size(0), 4, -1)
             return x
-
+
         x = torch.randn(15, 4, 28, 28)
         # change the size of the input, check the fallback
         x_var = torch.randn(7, 4, 16, 16)
tests/cpu/test_jit_llga_utils.py

Lines changed: 18 additions & 2 deletions
@@ -1,8 +1,9 @@
 import os
 import copy
 import tempfile
-
 import torch
+
+from functools import wraps
 from torch.testing._internal.jit_utils import JitTestCase, warmup_backward, \
     get_execution_plan
 from torch.testing._internal.common_utils import freeze_rng_state, run_tests, \
@@ -14,6 +15,21 @@

 LLGA_FUSION_GROUP = 'ipex::LlgaFusionGroup'

+# For LLGA UT, disable the PyTorch profiling executor and the IPEX JIT opt
+def llga_test_env(func):
+    @wraps(func)
+    def wrapTheFunction(*args):
+        # make sure that the profiling mode is turned on
+        torch._C._jit_set_profiling_mode(True)
+        torch._C._jit_set_profiling_executor(True)
+
+        ipex.core._jit_set_llga_enabled(True)
+        ipex.core.disable_jit_opt()
+        func(*args)
+        ipex.core.enable_jit_opt()
+        ipex.core._jit_set_llga_enabled(False)
+    return wrapTheFunction
+
 def all_backward_graphs(module):
     ge_state = module.get_debug_state()
     fwd_plan = get_execution_plan(ge_state)
@@ -133,7 +149,7 @@ def prepareModel(self, model, x, folding=False, remove_dropout=False, config_name

         # get the graph at the second run after freezing
         graph = model.graph_for(*x)
-
+
         return graph, model, fp32_model_with_quant_dequant

     def checkPatterns(self, graph, patterns):

torch_ipex/csrc/autocast_kernel.cpp

Lines changed: 11 additions & 0 deletions
@@ -242,5 +242,16 @@ lstm_aten(const at::Tensor &_input, at::TensorList hx, at::TensorList _params,
   return at::lstm(_input, hx, _params, has_biases, num_layers, dropout_p, train, bidirectional, batch_first);
 }

+at::Tensor flatten(const at::Tensor &input, int64_t start_dim,
+                   int64_t end_dim) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocastCPU(DispatchKey::AutocastCPU);
+  auto target_type = get_autocast_dtype();
+  if (at::ScalarType::Char == target_type) {
+    return int8::flatten(input, start_dim, end_dim);
+  }
+  // Fall Through.
+  return at::flatten(input, start_dim, end_dim);
+}
+
 } // autocast
 } // torch_ipex

torch_ipex/csrc/autocast_kernel.hpp

Lines changed: 2 additions & 0 deletions
@@ -54,5 +54,7 @@ lstm_aten(const at::Tensor &_input, at::TensorList hx, at::TensorList _params,
          bool has_biases, int64_t num_layers, double dropout_p, bool train,
          bool bidirectional, bool batch_first);

+at::Tensor flatten(const at::Tensor &input, int64_t start_dim, int64_t end_dim);
+
 } // autocast
 } // torch_ipex

torch_ipex/csrc/autocast_mode.cpp

Lines changed: 2 additions & 0 deletions
@@ -772,6 +772,8 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
          TORCH_FN((&torch_ipex::autocast::gelu)));
   m.impl(TORCH_SELECTIVE_NAME("aten::lstm.input"),
          TORCH_FN((&torch_ipex::autocast::lstm_aten)));
+  m.impl(TORCH_SELECTIVE_NAME("aten::flatten.using_ints"),
+         TORCH_FN((&torch_ipex::autocast::flatten)));
 }

 } // namespace autocast

torch_ipex/csrc/jit/codegen/onednn/interface.cpp

Lines changed: 5 additions & 1 deletion
@@ -7,6 +7,8 @@
 #include "jit/codegen/onednn/layout_propagation.h"
 #include "jit/codegen/onednn/prepare_binary.h"
 #include "jit/codegen/onednn/prepare_dequant.h"
+#include "jit/codegen/onednn/quantization_patterns.h"
+
 #include <torch/csrc/jit/jit_log.h>
 #include <torch/csrc/jit/passes/common_subexpression_elimination.h>
 #include <torch/csrc/jit/passes/decompose_ops.h>
@@ -76,8 +78,10 @@ void fuseGraph(std::shared_ptr<Graph> &g) {
       g);
   RemoveTensorTypeSpecializations(g);
   GRAPH_DUMP(
-      "After RemoveTensorTypeSpecializations. End of LLGA optimization pass",
+      "After RemoveTensorTypeSpecializations. Before IPEX optimization pass",
       g);
+  IpexQuantFusion(g);
+  GRAPH_DUMP("After IpexQuantFusion. End of IPEX optimization pass", g);
 }
 }

Lines changed: 68 additions & 0 deletions (new file)
@@ -0,0 +1,68 @@
+#include <string>
+#include <torch/csrc/jit/ir/ir.h>
+#include <torch/csrc/jit/ir/subgraph_matcher.h>
+#include <torch/csrc/jit/passes/subgraph_rewrite.h>
+
+namespace torch {
+namespace jit {
+
+struct FusionInfo {
+  std::string quantized_op_name;
+  std::string pattern;
+  std::string replacement;
+  std::vector<MatchFilter> filters = {};
+};
+
+namespace {
+
+std::string getArgList(std::vector<std::string> extra_args) {
+  return std::accumulate(
+      extra_args.begin(), extra_args.end(), std::string(),
+      [](std::string acc, const std::string &arg) { return acc + ", " + arg; });
+}
+
+FusionInfo getIpexFusionInfo(const std::string &fp_op_name,
+                             const std::string &q_op_name,
+                             const std::vector<std::string> &fp_extra_args,
+                             const std::vector<std::string> &q_extra_args) {
+  const auto &fp_extra_arg_list = getArgList(fp_extra_args);
+  const auto &q_extra_arg_list = getArgList(q_extra_args);
+
+  std::string op_pattern = "graph(%a_quant" + fp_extra_arg_list +
+                           ", %r_scale, %r_zero_point, %r_dtype):" + R"(
+        %a_dequant = aten::dequantize(%a_quant)
+        %r = )" + fp_op_name + "(" + "%a_dequant" + fp_extra_arg_list + ")" + R"(
+        %r_quant = aten::quantize_per_tensor(%r, %r_scale, %r_zero_point, %r_dtype)
+        return (%r_quant) )";
+
+  std::string aten_op_pattern = "graph(%a_quant" + fp_extra_arg_list +
+                                ", %r_scale, %r_zero_point, %r_dtype):" + R"(
+        %r_quant = )" + q_op_name + "(%a_quant" + q_extra_arg_list + ")" + R"(
+        return (%r_quant) )";
+
+  return {q_op_name, op_pattern, aten_op_pattern};
+}
+
+} // namespace
+
+void IpexQuantFusion(std::shared_ptr<Graph> &graph) {
+  std::vector<FusionInfo> patterns;
+  auto adaptive_avg_pool2d_patten = getIpexFusionInfo(
+      "aten::adaptive_avg_pool2d", "aten::adaptive_avg_pool2d",
+      {"%output_size"}, {"%output_size"});
+  auto flatten_patten =
+      getIpexFusionInfo("aten::flatten", "aten::flatten",
+                        {"%start_dim, %end_dim"}, {"%start_dim, %end_dim"});
+  patterns.emplace_back(adaptive_avg_pool2d_patten);
+  patterns.emplace_back(flatten_patten);
+  for (const auto &info : patterns) {
+    SubgraphRewriter rewriter;
+    rewriter.RegisterRewritePattern(info.pattern, info.replacement);
+    rewriter.runOnGraph(graph, info.filters);
+  }
+}
+
+} // namespace jit
+} // namespace torch
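
The pass above works on the TorchScript IR: for each op it registers a dequantize -> op -> quantize_per_tensor pattern together with a replacement that runs the op directly on the quantized tensor, then lets SubgraphRewriter rewrite the graph. Purely as an illustration of the same pattern/replacement idea on the Python side (this is not what IPEX ships), an analogous rewrite for the flatten case could be sketched with torch.fx:

import torch
from torch.fx import symbolic_trace
from torch.fx.subgraph_rewriter import replace_pattern

# Pattern: dequantize -> flatten -> quantize_per_tensor
def pattern(x_q, start_dim, end_dim, scale, zero_point, dtype):
    x = torch.dequantize(x_q)
    x = torch.flatten(x, start_dim, end_dim)
    return torch.quantize_per_tensor(x, scale, zero_point, dtype)

# Replacement: run flatten directly on the quantized tensor (a pure shape op)
def replacement(x_q, start_dim, end_dim, scale, zero_point, dtype):
    return torch.flatten(x_q, start_dim, end_dim)

# A toy module containing the pattern, used only to demonstrate the rewrite
class M(torch.nn.Module):
    def forward(self, x_q, start_dim, end_dim, scale, zero_point, dtype):
        x = torch.dequantize(x_q)
        x = torch.flatten(x, start_dim, end_dim)
        return torch.quantize_per_tensor(x, scale, zero_point, dtype)

gm = symbolic_trace(M())
replace_pattern(gm, pattern, replacement)
print(gm.graph)  # the dequantize/quantize_per_tensor pair around flatten is gone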
