
Commit ad0b753

opaque context: avoid recursive invocation in the context conversion routine by bypassing ATen dispatch. (#2566) (#2573)

* Check opaque u8 in to_plain
* Add comment for case analysis

Co-authored-by: Zhiwei <zhiwei.yan@intel.com>

1 parent de83c49 commit ad0b753

2 files changed (+85, -2 lines)


csrc/gpu/aten/tensor/Context.cpp

Lines changed: 9 additions & 2 deletions
@@ -32,7 +32,14 @@ at::Tensor DPCPPTensorConvertor::to_plain(const at::Tensor& from_original) {
   // dtype in stored context, so the reconstruction is needed for reorder's
   // correctness.
   auto is_equal = check_equality_for_meta_dtype_and_ctx_dtype(from_original);
-  if (!is_equal) {
+  auto is_opaque_u8_qtensor = is_opaque_u8(from_original);
+  // Case 1:
+  // The tensor ctx has a real dtype (like f32 or s8) but the meta has byte
+  // dtype (u8); this is for pickling the tensor. Run the following if statement.
+  // Case 2:
+  // An opaque u8 qtensor has QUInt8 meta but an s8 ctx. No need to pickle it;
+  // bypass the following if statement.
+  if (!is_equal && !is_opaque_u8_qtensor) {
     // Here use opaqueTypeToScalarType to deduce the meta dtype
     // [from] the context dtype, then reconstruct the tensor [from]
     from = at::empty_like(
@@ -59,7 +66,7 @@ at::Tensor DPCPPTensorConvertor::to_plain(const at::Tensor& from_original) {
       to, to_meta.sizes_, to_meta.strides_, c10::nullopt);
   xpu::oneDNN::reorder(from, to_);
 
-  if (!is_equal) {
+  if (!is_equal && !is_opaque_u8_qtensor) {
     // reconstruct the [to] tensor with the original tensor meta
     to = at::empty_like(from_original);
 
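To make the case analysis concrete, here is a minimal, self-contained Python toy of the control flow (all names, such as OpaqueTensor and dispatched_empty_like, are illustrative stand-ins, not IPEX code). Reconstruction through the dispatcher is what Case 1 needs, but doing the same for a Case 2 tensor re-enters the conversion routine and recurses; the added guard short-circuits exactly that path.

class OpaqueTensor:
    """Toy stand-in for a tensor backed by an opaque oneDNN context."""
    def __init__(self, meta_dtype, ctx_dtype):
        self.meta_dtype = meta_dtype  # dtype the ATen metadata reports
        self.ctx_dtype = ctx_dtype    # dtype stored in the opaque context


def dispatched_empty_like(t, depth):
    # Stand-in for at::empty_like going through ATen dispatch: for an
    # opaque u8 qtensor the dispatched path converts to plain layout
    # first, re-entering to_plain.
    if (t.meta_dtype, t.ctx_dtype) == ("quint8", "s8"):
        return to_plain(t, depth + 1)
    return OpaqueTensor(t.ctx_dtype, t.ctx_dtype)


def to_plain(t, depth=0):
    assert depth < 8, "unbounded recursion without the guard"
    is_equal = t.meta_dtype == t.ctx_dtype
    is_opaque_u8_qtensor = (t.meta_dtype, t.ctx_dtype) == ("quint8", "s8")
    # Case 1 (e.g. u8 meta over an f32 ctx, used for pickling): reconstruct.
    # Case 2 (quint8 meta over an s8 ctx): bypass, avoiding the re-entry.
    if not is_equal and not is_opaque_u8_qtensor:
        t = dispatched_empty_like(t, depth)
    return OpaqueTensor(t.ctx_dtype, t.ctx_dtype)  # "reordered" to plain


print(to_plain(OpaqueTensor("quint8", "s8")).ctx_dtype)  # s8, no re-entry

Dropping is_opaque_u8_qtensor from the condition makes the toy trip the depth assertion immediately, mirroring the recursive invocation the commit title describes.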

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
import torch
import torch.nn as nn
from torch.testing._internal.common_utils import TestCase
import intel_extension_for_pytorch  # noqa

from torch.quantization.quantize_jit import (
    convert_jit,
    prepare_jit,
)
import pytest
import time

def trace_int8_model(model, device, test_input):
    model = model.to(device)
    modelJit = torch.jit.trace(model, test_input.to(device))
    modelJit.eval()
    modelJit.to(device)
    print(modelJit)
    print("finish jit tracing...")

    print("start ", device, " calibration ...")
    qconfig_u8 = torch.quantization.QConfig(
        activation=torch.quantization.observer.MinMaxObserver.with_args(
            qscheme=torch.per_tensor_symmetric,
            reduce_range=False,
            dtype=torch.quint8
        ),
        weight=torch.quantization.default_weight_observer
    )

    modelJit = prepare_jit(modelJit, {'': qconfig_u8}, True)

    # do calibration
    test_input = test_input.to(device)
    with torch.no_grad():
        for i in range(1):
            calib_input = test_input
            modelJit(calib_input)
    print("start ", device, " convert...")
    modelJit = convert_jit(modelJit, True)
    # inference
    print("start ", device, " inference ...")
    with torch.no_grad():
        for i in range(1):
            start = time.time()
            output_cpu = modelJit(test_input)
            end = time.time()
            print("iter.{} ... {time:.3f}ms".format(i, time=(end - start) * 1000))
        print("print ", device, " jit graph ....")
        print(modelJit.graph_for(test_input))

        print("get ", device, " test input result....")
        output = modelJit(test_input)
        print("finish ", device, " testing.......")
        return output

class SimpleModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 6, 3, 1, 1)
        self.instance_norm = nn.InstanceNorm2d(6, eps=1e-5, affine=True, momentum=0.1)

    def forward(self, x):
        x = self.conv(x)
        x = self.instance_norm(x)
        return x


class TestQTensortoPlain(TestCase):
    def test_q_to_plain(self):
        mod = SimpleModule()
        test_input = torch.randn(3, 3, 16, 16)
        with torch.no_grad():
            with torch.xpu.onednn_layout():
                trace_int8_model(mod, "xpu", test_input)
