
Commit 56428f8

[block][storage save] fix storage save issue for block format (#2399) (#2539)
* Fix save acc drop error for block format
* Add include file for Context method
* Use at::AtenIpexTypeXPU::DPCPPTensorContext in Utils.h
* Change the dtype mapping: u8 <---> QUInt8, s8 <---> QInt8
* Change the deduced dtype mapping for quantized tensors

Signed-off-by: Chen, Zejun <zejun.chen@intel.com>
Co-authored-by: Jinghui <jinghui.gu@intel.com>
1 parent a7c4a16 commit 56428f8
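
For context, the user-visible scenario this commit targets is saving and reloading a checkpoint of a model whose weights torch.xpu.optimize keeps in oneDNN block (opaque) format: before the fix, the save-time reorder to plain layout could run with the wrong meta dtype, and the restored weights lost accuracy. Below is a minimal reproduction sketch, assuming an XPU device with intel_extension_for_pytorch installed; the toy model and checkpoint path are illustrative, not taken from the commit.

import torch
import torch.nn as nn
import intel_extension_for_pytorch  # noqa  (registers the "xpu" device and torch.xpu.optimize)

# illustrative toy model; any module with conv/linear weights works
model = nn.Sequential(
    nn.Conv2d(3, 8, kernel_size=3),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(8, 10),
).to("xpu").train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# torch.xpu.optimize may convert weights to oneDNN block format
model, optimizer = torch.xpu.optimize(model=model, dtype=torch.bfloat16, optimizer=optimizer)

# saving forces block tensors back to plain layout; the fix makes the reorder
# use the dtype recorded in the tensor's DPCPPTensorContext
torch.save({"model_state_dict": model.state_dict()}, "./_checkpoint_example.pth.tar")

reloaded = torch.load("./_checkpoint_example.pth.tar", map_location="xpu")
for name, saved in model.state_dict().items():
    torch.testing.assert_close(
        saved.float().cpu(), reloaded["model_state_dict"][name].float().cpu()
    )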

File tree

5 files changed: +195 additions, -73 deletions

  csrc/gpu/aten/tensor/Context.cpp
  csrc/gpu/aten/tensor/Tensor.cpp
  csrc/gpu/aten/tensor/Tensor.h
  csrc/gpu/oneDNN/Utils.h
  tests/gpu/examples/test_save_load.py

csrc/gpu/aten/tensor/Context.cpp

Lines changed: 49 additions & 3 deletions
@@ -8,9 +8,39 @@ using namespace xpu::oneDNN;
 namespace at {
 namespace AtenIpexTypeXPU {
 
-at::Tensor DPCPPTensorConvertor::to_plain(const at::Tensor& from) {
-  if (!is_opaque_tensor(from))
-    return from;
+at::Tensor DPCPPTensorConvertor::to_plain(const at::Tensor& from_original) {
+  if (!is_opaque_tensor(from_original))
+    return from_original;
+
+  auto from = from_original;
+
+  // [watch out] The dtype in from_original's meta may not be equal to the
+  // dtype stored in its context. During save, the storage is regarded as a
+  // pure u8 tensor and then pickled down (the reorder happens while
+  // pickling:
+  // https://github.com/pytorch/pytorch/blob/0f4652f4989a2d196f36fe75e5c73cb88dc0800d/torch/serialization.py#L667)
+
+  // Before saving, the block tensor should be converted to plain to ensure
+  // correctness, so the tensor MUST be reconstructed on the given storage
+  // with the CORRECT meta dtype.
+  // When saving a block tensor, the tensor [from_original] is a pure u8
+  // one-dim tensor. Thus, the [from] tensor should be reconstructed with the
+  // correct meta dtype associated with the dtype stored in [from_original]'s
+  // context. After reordering, the plain u8 tensor should be recovered.
+
+  // Here is_equal == false means the dtype in the tensor meta is not equal
+  // to the dtype in the stored context, so reconstruction is needed for the
+  // reorder's correctness.
+  auto is_equal = check_equality_for_meta_dtype_and_ctx_dtype(from_original);
+  if (!is_equal) {
+    // Use opaqueTypeToScalarType to deduce the meta dtype from the context
+    // dtype, then reconstruct the tensor [from]
+    from = at::empty_like(
+        from_original,
+        from_original.options().dtype(opaqueTypeToScalarType(from_original)));
+
+    unsafe_get_and_set_data_ptr(from_original, from);
+  }
 
   // use native API to break recursive call resulted by opaque guard in aten itf
   auto to = from.is_quantized()
@@ -29,6 +59,22 @@ at::Tensor DPCPPTensorConvertor::to_plain(const at::Tensor& from) {
       to, to_meta.sizes_, to_meta.strides_, c10::nullopt);
   xpu::oneDNN::reorder(from, to_);
 
+  if (!is_equal) {
+    // reconstruct the [to] tensor with the original tensor meta
+    to = at::empty_like(from_original);
+
+    // release [to_]'s context and set it into [to]; now the tensor [to] is
+    // plain and has the same meta as the tensor [from_original]
+    unsafe_release_and_set_data_ptr(to_, to);
+
+    // manually free [from]'s context
+    from.unsafeGetTensorImpl()
+        ->storage()
+        .unsafeGetStorageImpl()
+        ->data_ptr()
+        .release_context();
+  }
+
   return to;
 }
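
The "pure u8 / pickled down" comment above refers to the fact that torch.save serializes a tensor's underlying storage as raw, untyped bytes, while the dtype lives only in the tensor metadata. A quick CPU-only illustration with plain PyTorch 2.x (independent of this commit):

import torch

t = torch.randn(4, dtype=torch.bfloat16)
storage = t.untyped_storage()
# torch.save pickles this storage as raw bytes, which is why a block-format
# tensor must carry the correct meta dtype before the save-time reorder to
# plain layout
print(type(storage).__name__, storage.nbytes())  # UntypedStorage 8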

csrc/gpu/aten/tensor/Tensor.cpp

Lines changed: 39 additions & 0 deletions
@@ -123,5 +123,44 @@ Tensor share_storage_and_set_strided_as(
   return result;
 }
 
+// get ctx from src and set (share) it to dst
+// [watch out] After this call, the src and the dst share the same raw memory
+// through different storages and contexts, thus the src's context MUST be
+// explicitly and manually released afterwards to avoid a double free when the
+// src and the dst end their life cycles.
+void unsafe_get_and_set_data_ptr(const Tensor& src, const Tensor& dst) {
+  auto src_cptr = (DPCPPTensorContext*)src.unsafeGetTensorImpl()
+                      ->storage()
+                      .unsafeGetStorageImpl()
+                      ->data_ptr()
+                      .get_context();
+  at::DataPtr src_dptr(
+      src_cptr->data(),
+      src_cptr,
+      getDeviceAllocator()->raw_deleter(),
+      dst.device());
+  dst.unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()->set_data_ptr(
+      std::move(src_dptr));
+}
+
+// release ctx from src and set it to dst
+// [watch out] This call transfers the src's control of its raw memory to the
+// dst. After the call, the src's context has been released and cannot be used
+// any more.
+void unsafe_release_and_set_data_ptr(const Tensor& src, const Tensor& dst) {
+  auto src_cptr = (DPCPPTensorContext*)src.unsafeGetTensorImpl()
+                      ->storage()
+                      .unsafeGetStorageImpl()
+                      ->data_ptr()
+                      .release_context();
+  at::DataPtr dptr(
+      src_cptr->data(),
+      src_cptr,
+      getDeviceAllocator()->raw_deleter(),
+      src.device());
+  dst.unsafeGetTensorImpl()->storage().unsafeGetStorageImpl()->set_data_ptr(
+      std::move(dptr));
+}
+
 } // namespace AtenIpexTypeXPU
 } // namespace at

csrc/gpu/aten/tensor/Tensor.h

Lines changed: 3 additions & 0 deletions
@@ -20,5 +20,8 @@ Tensor share_storage_and_set_strided_as(
     IntArrayRef stride,
     c10::optional<int64_t> storage_offset_);
 
+void unsafe_get_and_set_data_ptr(const Tensor& src, const Tensor& dst);
+
+void unsafe_release_and_set_data_ptr(const Tensor& src, const Tensor& dst);
 } // namespace AtenIpexTypeXPU
 } // namespace at

csrc/gpu/oneDNN/Utils.h

Lines changed: 32 additions & 0 deletions
@@ -105,6 +105,38 @@ static bool is_supported_onednn_dtype(const at::Tensor& tensor) {
       : true;
 }
 
+// This function deduces the torch tensor meta dtype from the kept opaque
+// tensor context, for the case of saving a tensor
+static inline c10::ScalarType opaqueTypeToScalarType(const at::Tensor& tensor) {
+  auto is_quantized = tensor.is_quantized();
+  auto ctx = *(static_cast<at::AtenIpexTypeXPU::DPCPPTensorContext*>(
+      tensor.unsafeGetTensorImpl()->storage().data_ptr().get_context()));
+  switch (ctx.dtype()) {
+    case dnnl::memory::data_type::u8:
+      // For a quantized tensor, the meta dtype is QUInt8
+      return (is_quantized) ? at::ScalarType::QUInt8 : at::ScalarType::Byte;
+    case dnnl::memory::data_type::s8:
+      // For a quantized tensor, the meta dtype is QInt8
+      return (is_quantized) ? at::ScalarType::QInt8 : at::ScalarType::Char;
+    case dnnl::memory::data_type::f16:
+      return at::ScalarType::Half;
+    case dnnl::memory::data_type::f32:
+      return at::ScalarType::Float;
+    case dnnl::memory::data_type::bf16:
+      return at::ScalarType::BFloat16;
+    case dnnl::memory::data_type::f64:
+      return at::ScalarType::Double;
+    default:
+      TORCH_CHECK(false, "Cannot be translated to torch dtype");
+  };
+}
+
+static inline bool check_equality_for_meta_dtype_and_ctx_dtype(
+    const at::Tensor& tensor) {
+  auto ctx_dtype = opaqueTypeToScalarType(tensor);
+  return bool(ctx_dtype == tensor.scalar_type());
+}
+
 static inline fpmath_mode get_onednn_fpmath_mode() {
   auto math_mode = Settings::I().get_fp32_math_mode();
   switch (math_mode) {
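
For reference, the dtype deduction performed by the new switch can be summarized with the following Python sketch; it mirrors opaqueTypeToScalarType for illustration only and is not an API exposed by the extension.

import torch

# keys: oneDNN (dnnl::memory::data_type) names; values: (plain, quantized) torch dtypes
ONEDNN_TO_TORCH = {
    "u8": (torch.uint8, torch.quint8),
    "s8": (torch.int8, torch.qint8),
    "f16": (torch.float16, None),
    "f32": (torch.float32, None),
    "bf16": (torch.bfloat16, None),
    "f64": (torch.float64, None),
}

def deduce_meta_dtype(onednn_dtype: str, is_quantized: bool) -> torch.dtype:
    # a KeyError here plays the role of the TORCH_CHECK(false, ...) default case
    plain, quantized = ONEDNN_TO_TORCH[onednn_dtype]
    if is_quantized:
        if quantized is None:
            raise ValueError(f"no quantized torch dtype for oneDNN {onednn_dtype}")
        return quantized
    return plain

assert deduce_meta_dtype("u8", is_quantized=True) is torch.quint8   # quantized u8 -> QUInt8
assert deduce_meta_dtype("s8", is_quantized=False) is torch.int8    # plain s8 -> Char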

tests/gpu/examples/test_save_load.py

Lines changed: 72 additions & 70 deletions
@@ -5,39 +5,20 @@
 import torch.nn as nn
 from torch.testing._internal.common_utils import TestCase
 import intel_extension_for_pytorch # noqa
+import torchvision.models as models
 import pytest
 import os
 
 cpu_device = torch.device("cpu")
 xpu_device = torch.device("xpu")
 
 batch_size = 128
-class_num = 1000
-input_channel = 512
-hidden_channel = 2048
-num_iter = 10
+input_channel = 3
+train_num_iter = 5
+eval_num_iter = 3
 lr = 0.01
 checkpoint_path_str = './_checkpoint.test.case.test_xpu_checkpoint_save_load_integrity_and_accuracy.pth.tar'
 
-class TrainingModel(nn.Module):
-    def __init__(self):
-        super(TrainingModel, self).__init__()
-        self.m = nn.Sequential(
-            nn.Conv2d(input_channel, hidden_channel, kernel_size=(1, 1), stride=(1, 1), bias=False),
-            nn.BatchNorm2d(hidden_channel, eps=1e-05, momentum=0.1),
-            nn.ReLU(inplace=True),
-            nn.AvgPool2d(kernel_size=7, stride=1, padding=0),
-        )
-        self.fc = nn.Linear(in_features=hidden_channel, out_features=class_num, bias=True)
-
-    def forward(self, x, indentity_for_mul, indentity_for_add):
-        x = self.m(x)
-        x = x * indentity_for_mul
-        x = x.view(x.size(0), -1)
-        x = self.fc(x)
-        x = x + indentity_for_add
-        return x
-
 class TestTorchMethod(TestCase):
     @pytest.mark.skipif(not torch.xpu.utils.has_fp64_dtype(), reason="fp64 not support by this device")
     def test_save_load(self):
@@ -65,31 +46,17 @@ def test_serialization_multi_map_location(self):
         self.assertEqual(b.device.__str__(), 'xpu:1')
 
     @pytest.mark.skipif(not torch.xpu.utils.has_fp64_dtype(), reason="fp64 not support by this device")
-    def test_xpu_checkpoint_save_load_integrity_and_accuracy(self, dtype=torch.bfloat16):
-        # create model
+    def test_xpu_checkpoint_save_load_integrity_and_accuracy(self):
         device = 'xpu'
-        model_xpu = TrainingModel()
-        model_xpu = model_xpu.to(device=device).train()
-        optimizer_xpu = torch.optim.SGD(model_xpu.parameters(), lr=lr)
-        criterion = nn.CrossEntropyLoss()
-
-        if os.path.exists(checkpoint_path_str):
-            os.remove(checkpoint_path_str)
-
-        # process torch.xpu.optimize
-        model_xpu, optimizer_xpu = torch.xpu.optimize(model=model_xpu, dtype=dtype, optimizer=optimizer_xpu)
-
-        def training_step(model_xpu, optimizer_xpu, criterion):
-            input = torch.randn(batch_size, input_channel, 7, 7)
-            target = torch.empty(batch_size, dtype=torch.long).random_(class_num)
+        def training_step(model_xpu, optimizer_xpu, criterion, dtype):
+            input = torch.randn(batch_size, input_channel, 224, 224)
+            target = torch.empty(batch_size, dtype=torch.long).random_(1000)
             input_xpu = input.clone().to(device=device).requires_grad_()
             target_xpu = target.to(device)
-            indentity_for_mul = torch.randn(batch_size, hidden_channel, 1, 1).to(device=device)
-            indentity_for_add = torch.randn(batch_size, class_num).to(device=device)
 
             # forward
             with torch.xpu.amp.autocast(enabled=True, dtype=dtype):
-                output_xpu = model_xpu(input_xpu, indentity_for_mul, indentity_for_add)
+                output_xpu = model_xpu(input_xpu)
                 loss_xpu = criterion(output_xpu, target_xpu)
 
             # optimizer
@@ -103,35 +70,70 @@ def training_step(model_xpu, optimizer_xpu, criterion):
             loss_xpu = loss_xpu.cpu()
             output_xpu = output_xpu.cpu()
 
-        def save_checkpoint(state, filename=checkpoint_path_str):
-            torch.save(state, filename)
+        def eval_step(model_xpu, dtype):
+            input = torch.randn(batch_size, input_channel, 224, 224)
+            target = torch.empty(batch_size, dtype=torch.long).random_(1000)
+            input_xpu = input.clone().to(device=device).requires_grad_()
+            target_xpu = target.to(device)
 
-        for _ in range(num_iter):
-            training_step(model_xpu, optimizer_xpu, criterion)
+            # forward
+            with torch.xpu.amp.autocast(enabled=True, dtype=dtype):
+                output_xpu = model_xpu(input_xpu)
+                loss_xpu = criterion(output_xpu, target_xpu)
+
+            loss_xpu = loss_xpu.cpu()
+            output_xpu = output_xpu.cpu()
 
-        save_checkpoint({'model_state_dict': model_xpu.state_dict(), 'optimizer_state_dict': optimizer_xpu.state_dict()})
-        if os.path.isfile(checkpoint_path_str):
-            # load checkpoint
-            checkpoint = torch.load(checkpoint_path_str, map_location='xpu')
-            print('load checkpoint')
+        def save_checkpoint(state, filename=checkpoint_path_str):
+            torch.save(state, filename)
 
+        for dtype in [torch.float32, torch.bfloat16]:
+            print('dtype = ', dtype)
             # create model
-            new_model = TrainingModel()
-            new_model = new_model.to(device=device).train()
-            print('create model')
-
-            # create optimizer
-            new_optimizer = torch.optim.SGD(new_model.parameters(), lr=lr)
-            print('create model')
-
-            # load state dict
-            new_model.load_state_dict(checkpoint['model_state_dict'])
-            new_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
-            print('load state dict')
-
-            # check
-            print('checking...')
-            self.assertEqual(model_xpu.state_dict(), new_model.state_dict(), atol=1e-6, rtol=1e-6)
-            self.assertEqual(optimizer_xpu.state_dict(), new_optimizer.state_dict(), atol=1e-6, rtol=1e-6)
-        else:
-            assert False, "save checkpoint failed for xpu model" # noqa B011
+            model_xpu = models.__dict__['resnet18'](pretrained=True).to(device=device).train()
+            optimizer_xpu = torch.optim.SGD(model_xpu.parameters(), lr=lr)
+            criterion = nn.CrossEntropyLoss()
+
+            if os.path.exists(checkpoint_path_str):
+                os.remove(checkpoint_path_str)
+
+            # process torch.xpu.optimize
+            model_xpu, optimizer_xpu = torch.xpu.optimize(model=model_xpu, dtype=dtype, optimizer=optimizer_xpu)
+
+            # mimic model train, then eval
+            for _ in range(train_num_iter):
+                training_step(model_xpu, optimizer_xpu, criterion, dtype)
+            model_xpu.eval()
+            for _ in range(eval_num_iter):
+                eval_step(model_xpu, dtype)
+            torch.xpu.synchronize()
+
+            save_checkpoint({'model_state_dict': model_xpu.state_dict(), 'optimizer_state_dict': optimizer_xpu.state_dict()})
+            if os.path.isfile(checkpoint_path_str):
+                # load checkpoint
+                checkpoint = torch.load(checkpoint_path_str, map_location=device)
+                print('load checkpoint')
+
+                # create model
+                new_model = models.__dict__['resnet18'](pretrained=False).to(device=device).train()
+                print('create model')
+
+                # create optimizer
+                new_optimizer = torch.optim.SGD(new_model.parameters(), lr=lr)
+                print('create optimizer')
+
+                # optimize
+                new_model, new_optimizer = torch.xpu.optimize(model=new_model, dtype=dtype, optimizer=new_optimizer)
+
+                # load state dict
+                new_model.load_state_dict(checkpoint['model_state_dict'])
+                new_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+                print('load state dict')
+
+                # check
+                print('checking...')
+                self.assertEqual(model_xpu.state_dict(), new_model.state_dict(), atol=1e-6, rtol=1e-6)
+                self.assertEqual(optimizer_xpu.state_dict(), new_optimizer.state_dict(), atol=1e-6, rtol=1e-6)
+                os.remove(checkpoint_path_str)
+            else:
+                assert False, "save checkpoint failed for xpu model" # noqa B011
