Commit 2779937

quantization: enable torch.Tensor.matmul quantization (#878)
1 parent 955bcff commit 2779937

3 files changed: +47 -3 lines changed

intel_extension_for_pytorch/ao/quantization/_quantization_state_utils.py

Lines changed: 3 additions & 2 deletions
@@ -37,6 +37,7 @@
     F.linear,
     torch._C._nn.linear,
     torch.matmul,
+    torch.Tensor.matmul,
     F.embedding_bag,
     torch.embedding_bag,
 ])
@@ -348,8 +349,8 @@ def iterate_and_apply_convert(
         args = torch.quantize_per_channel(args, scale, zp, ch_axis, dtype)
         args = args.dequantize()
     else:
-        # white list, conv, linear, matmul, we alsy covert it's input to bflat16 firstly, and then inser q+dq
-        if str(op) in conv_linear_ops + [str(torch.matmul)] + embedding_op or str(type(op)) in conv_linear_modules:
+        # white list, conv, linear, matmul, we always convert it's input to bflat16 firstly, and then inser q+dq
+        if str(op) in conv_linear_ops + [str(torch.matmul), str(torch.Tensor.matmul)] + embedding_op or str(type(op)) in conv_linear_modules:
             if torch.is_autocast_cpu_enabled() and core.get_autocast_dtype() == torch.bfloat16:
                 if args.dtype == torch.float32:
                     args = args.to(torch.bfloat16)
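
The new allow-list entry is needed because the method spelling x.matmul(y) dispatches through torch.Tensor.matmul, a different callable object from the functional torch.matmul, so quantization state tracking would otherwise miss it. A minimal sketch of that distinction (illustration only, not code from this commit):

    import torch

    # The functional form and the Tensor-method form are distinct callables,
    # so each needs its own entry in the quantization allow-list.
    print(torch.Tensor.matmul is torch.matmul)  # False

    x = torch.randn(2, 3, 4)
    y = torch.randn(2, 4, 5)
    # Both spellings compute the same result but are recorded as different ops.
    assert torch.equal(torch.matmul(x, y), x.matmul(y))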

intel_extension_for_pytorch/ao/quantization/_recipe.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
     str(nn.SiLU), str(F.silu), str(torch.Tensor.sigmoid), str(torch.sigmoid), str(F.sigmoid), str(nn.Sigmoid), str(F.gelu), str(nn.GELU)]
 conv_gemm_ops = [str(F.conv2d), str(nn.Conv2d), str(F.conv3d), str(nn.Conv3d), str(torch.conv2d), str(torch.conv3d), \
     str(F.conv_transpose2d), str(torch.nn.ConvTranspose2d), str(F.conv_transpose3d), str(torch.nn.ConvTranspose3d),
-    str(torch.conv_transpose2d), str(torch.conv_transpose2d), str(F.linear), str(nn.Linear), str(torch.matmul)]
+    str(torch.conv_transpose2d), str(torch.conv_transpose2d), str(F.linear), str(nn.Linear), str(torch.matmul), str(torch.Tensor.matmul)]
 rnn_ops = [str(torch.nn.LSTM)]
 
 # Those ops only support s8->s8 path, and also require the qscheme is per_tensor_symmetric.
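
_recipe.py keys its op lists on str(op), so the method form needs its own str(torch.Tensor.matmul) entry in conv_gemm_ops alongside str(torch.matmul). A rough sketch of that string-keyed lookup (the trimmed list and the is_gemm_like helper are illustrative, not the actual recipe code):

    import torch

    # Illustrative subset of the recipe's string-keyed GEMM op list.
    conv_gemm_ops = [str(torch.matmul), str(torch.Tensor.matmul)]

    def is_gemm_like(op):
        # str(op) is stable within one process, so both the functional and the
        # Tensor-method spellings must be listed to be recognized.
        return str(op) in conv_gemm_ops

    print(is_gemm_like(torch.matmul))         # True
    print(is_gemm_like(torch.Tensor.matmul))  # True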

tests/cpu/test_ao_jit_llga_quantization_fuser.py

Lines changed: 43 additions & 0 deletions
@@ -1062,6 +1062,49 @@ def forward(self, x, y):
         self.assertFused(graph, ['aten::dequantize', 'aten::matmul', 'aten::div'])
         self.checkPatterns(graph, patterns)
 
+    def test_bmm_method_bf16(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+
+            def forward(self, x, y):
+                mm_res = x.matmul(y)
+                return mm_res
+
+        x = torch.randn(1, 16, 384, 64) * 0.1
+        y = torch.randn(1, 1, 64, 384) * 0.1
+        patterns = [
+            ["aten::to", "aten::quantize_per_tensor"],
+            ["aten::to", "aten::quantize_per_tensor"],
+            ["aten::dequantize", "aten::to", "aten::matmul"],
+        ]
+        m = M()
+        graph = self.checkQuantizeTrace(m, [x, y], atol=2e-1, int8_bf16=True)
+        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
+        # single aten::to won't be rewritten by llga backend
+        self.assertFused(graph, ['aten::dequantize', 'aten::matmul'])
+        self.checkPatterns(graph, patterns)
+
+    def test_bmm_method_fp32(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+
+            def forward(self, x, y):
+                mm_res = x.matmul(y)
+                return mm_res
+
+        x = torch.randn(1, 16, 384, 64) * 0.1
+        y = torch.randn(1, 1, 64, 384) * 0.1
+        patterns = [
+            ["aten::dequantize", "aten::matmul"],
+        ]
+        m = M()
+        graph = self.checkQuantizeTrace(m, [x, y], atol=2e-1)
+        self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 1)
+        self.assertFused(graph, ['aten::dequantize', 'aten::matmul'])
+        self.checkPatterns(graph, patterns)
+
     def test_strided_bmm_div_int8_in_bf16_out(self):
         class M(nn.Module):
             def __init__(self):