autocast: register mul_add and roialign to autocast on low precision policy (#2511) (#2540)

ganyi1996ppo · 1pikachu · weishi-deng · web-flow · commit 4f0598bc9d19 · 2023-04-12T15:40:13.000+08:00
* add mul_add to autocast support
* register mul_add with autocast and run it in low precision
* add roialign to autocast
* use the promote policy for mul_add in the autocast scenario
* use the accumulate type as the compute type in mul_add
* use the retrieved device as the autocast device
* add bf16 and fp16 tests with strict tolerance
* add roialign to torchvision's autocast ops

---------

Co-authored-by: Du, Jun <jun.du@intel.com>
Co-authored-by: Deng, Weishi <weishi.deng@intel.com>
Co-authored-by: Jinghui <jinghui.gu@intel.com>
diff --git a/csrc/gpu/aten/operators/ROIAlign.cpp b/csrc/gpu/aten/operators/ROIAlign.cpp
@@ -8,6 +8,7 @@
 #include <runtime/Utils.h>
 #include <utils/DPCPP.h>
 
+#include <ATen/autocast_mode.h>
 #include "RandomEngine.h"
 #include "comm/ATDispatch.h"
 #include "comm/AccumulateType.h"
@@ -495,24 +496,57 @@ at::Tensor roi_align_backward_kernel(
   return grad_input;
 }
 
+at::Tensor roi_align_forward_autocast(
+    const at::Tensor& input,
+    const at::Tensor& rois,
+    double spatial_scale,
+    int64_t pooled_height,
+    int64_t pooled_width,
+    int64_t sampling_ratio,
+    bool aligned) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastXPU);
+  return roi_align_forward_kernel(
+             at::autocast::cached_cast(at::kFloat, input, c10::DeviceType::XPU),
+             at::autocast::cached_cast(at::kFloat, rois, c10::DeviceType::XPU),
+             spatial_scale,
+             pooled_height,
+             pooled_width,
+             sampling_ratio,
+             aligned)
+      .to(input.scalar_type());
+}
+
 } // namespace AtenIpexTypeXPU
 } // namespace at
 
 namespace {
 IPEX_LIBRARY_FRAGMENT() {
-  IPEX_OP_REGISTER(
-      "roi_align.xpu", at::AtenIpexTypeXPU::roi_align_forward_kernel);
-  IPEX_OP_REGISTER(
+  IPEX_OP_REGISTER_DISPATCH(
+      "roi_align.xpu",
+      at::AtenIpexTypeXPU::roi_align_forward_kernel,
+      c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
       "_roi_align_backward.xpu",
-      at::AtenIpexTypeXPU::roi_align_backward_kernel);
+      at::AtenIpexTypeXPU::roi_align_backward_kernel,
+      c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "roi_align.xpu",
+      at::AtenIpexTypeXPU::roi_align_forward_autocast,
+      c10::DispatchKey::AutocastXPU);
 }
 
-IPEX_TORCH_LIBRARY_IMPL(torchvision, XPU, m) {
+TORCH_LIBRARY_FRAGMENT(torchvision, m) {
   m.impl(
       TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      c10::DispatchKey::XPU,
       TORCH_FN((&at::AtenIpexTypeXPU::roi_align_forward_kernel)));
   m.impl(
       TORCH_SELECTIVE_NAME("torchvision::_roi_align_backward"),
+      c10::DispatchKey::XPU,
       TORCH_FN((&at::AtenIpexTypeXPU::roi_align_backward_kernel)));
+  m.impl(
+      TORCH_SELECTIVE_NAME("torchvision::roi_align"),
+      c10::DispatchKey::AutocastXPU,
+      TORCH_FN((&at::AtenIpexTypeXPU::roi_align_forward_autocast)));
 }
 } // namespace
diff --git a/csrc/gpu/aten/operators/TripleOps.cpp b/csrc/gpu/aten/operators/TripleOps.cpp
@@ -1,6 +1,7 @@
 #include <ATen/ATen.h>
 #include <ATen/Context.h>
 #include <ATen/SparseTensorUtils.h>
+#include <ATen/autocast_mode.h>
 #include <ATen/native/BinaryOps.h>
 #include <ATen/native/TensorIterator.h>
 #include <ATen/record_function.h>
@@ -23,6 +24,9 @@ using namespace at::sparse;
 
 namespace at {
 namespace AtenIpexTypeXPU {
+using autocast::cached_cast;
+using autocast::get_lower_precision_fp_from_device_type;
+using autocast::promote_type;
 
 std::tuple<Tensor, Tensor> sort(
     const Tensor& self,
@@ -38,7 +42,8 @@ static void mul_add_kernel_dpcpp(TensorIterator& iter, Scalar alpha_scalar) {
       iter.dtype(),
       "mul_add",
       [&]() {
-        auto alpha = alpha_scalar.to<scalar_t>();
+        using accscalar_t = acc_type<scalar_t>;
+        auto alpha = alpha_scalar.to<accscalar_t>();
         dpcpp_kernel_for_tensor_iter(
             iter, [=](scalar_t a, scalar_t b, scalar_t c) -> scalar_t {
               return a * b + alpha * c;
@@ -141,8 +146,9 @@ Tensor mul_scalar_add_scalar(
         iter.dtype(),
         "mul_scalar_add_scalar",
         [&]() {
-          auto add_scalar = alpha.to<scalar_t>() * accumu.to<scalar_t>();
-          auto other_scalar = other.to<scalar_t>();
+          using accscalar_t = acc_type<scalar_t>;
+          auto add_scalar = alpha.to<accscalar_t>() * accumu.to<accscalar_t>();
+          auto other_scalar = other.to<accscalar_t>();
           dpcpp_kernel_for_tensor_iter(iter, [=](scalar_t a) -> scalar_t {
             return a * other_scalar + add_scalar;
           });
@@ -151,6 +157,15 @@ Tensor mul_scalar_add_scalar(
   return result;
 }
 
+Tensor mul_scalar_add_scalar_autocast(
+    const Tensor& self,
+    Scalar other,
+    Scalar accumu,
+    Scalar alpha) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastXPU);
+  return mul_scalar_add_scalar(self, other, accumu, alpha);
+}
+
 Tensor mul_add_scalar(
     const Tensor& self,
     const Tensor& other,
@@ -174,7 +189,8 @@ Tensor mul_add_scalar(
       iter.dtype(),
       "mul_scalar_add_scalar",
       [&]() {
-        auto add_scalar = alpha.to<scalar_t>() * accumu.to<scalar_t>();
+        using accscalar_t = acc_type<scalar_t>;
+        auto add_scalar = alpha.to<accscalar_t>() * accumu.to<accscalar_t>();
         dpcpp_kernel_for_tensor_iter(
             iter, [=](scalar_t a, scalar_t b) -> scalar_t {
               return a * b + add_scalar;
@@ -184,6 +200,24 @@ Tensor mul_add_scalar(
   return result;
 }
 
+Tensor mul_add_scalar_autocast(
+    const Tensor& self,
+    const Tensor& other,
+    Scalar accumu,
+    Scalar alpha) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastXPU);
+  auto to_type = promote_type(
+      get_lower_precision_fp_from_device_type(c10::DeviceType::XPU),
+      c10::DeviceType::XPU,
+      self,
+      other);
+  return mul_add_scalar(
+      cached_cast(to_type, self, c10::DeviceType::XPU),
+      cached_cast(to_type, other, c10::DeviceType::XPU),
+      accumu,
+      alpha);
+}
+
 Tensor mul_scalar_add(
     const Tensor& self,
     Scalar other,
@@ -207,8 +241,9 @@ Tensor mul_scalar_add(
         iter.dtype(),
         "mul_scalar_add_scalar",
         [&]() {
-          auto alpha_scalar = alpha.to<scalar_t>();
-          auto other_scalar = other.to<scalar_t>();
+          using accscalar_t = acc_type<scalar_t>;
+          auto alpha_scalar = alpha.to<accscalar_t>();
+          auto other_scalar = other.to<accscalar_t>();
           dpcpp_kernel_for_tensor_iter(
               iter, [=](scalar_t a, scalar_t b) -> scalar_t {
                 return a * other_scalar + b * alpha_scalar;
@@ -218,6 +253,24 @@ Tensor mul_scalar_add(
   return result;
 }
 
+Tensor mul_scalar_add_autocast(
+    const Tensor& self,
+    Scalar other,
+    const Tensor& accumu,
+    Scalar alpha) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastXPU);
+  auto to_type = promote_type(
+      get_lower_precision_fp_from_device_type(c10::DeviceType::XPU),
+      c10::DeviceType::XPU,
+      self,
+      accumu);
+  return mul_scalar_add(
+      cached_cast(to_type, self, c10::DeviceType::XPU),
+      other,
+      cached_cast(to_type, accumu, c10::DeviceType::XPU),
+      alpha);
+}
+
 Tensor mul_add(
     const Tensor& self,
     const Tensor& other,
@@ -242,6 +295,25 @@ Tensor mul_add(
   return result;
 }
 
+Tensor mul_add_autocast(
+    const Tensor& self,
+    const Tensor& other,
+    const Tensor& accumu,
+    Scalar alpha) {
+  c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::AutocastXPU);
+  auto to_type = promote_type(
+      get_lower_precision_fp_from_device_type(c10::DeviceType::XPU),
+      c10::DeviceType::XPU,
+      self,
+      other,
+      accumu);
+  return mul_add(
+      cached_cast(to_type, self, c10::DeviceType::XPU),
+      cached_cast(to_type, other, c10::DeviceType::XPU),
+      cached_cast(to_type, accumu, c10::DeviceType::XPU),
+      alpha);
+}
+
 template <typename scalar_t>
 static inline void packed_add_kernel(
     unsigned short* __restrict__ w_MSB,
@@ -428,10 +500,27 @@ Tensor packed_add(
 
 namespace {
 IPEX_LIBRARY_FRAGMENT() {
-  IPEX_OP_REGISTER("mul_add", mul_add);
-  IPEX_OP_REGISTER("mul_add.Scalar_Tensor", mul_add_scalar);
-  IPEX_OP_REGISTER("mul_add.Tensor_Scalar", mul_scalar_add);
-  IPEX_OP_REGISTER("mul_add.Scalar_Scalar", mul_scalar_add_scalar);
+  IPEX_OP_REGISTER_DISPATCH("mul_add", mul_add, c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add", mul_add_autocast, c10::DispatchKey::AutocastXPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Scalar_Tensor", mul_scalar_add, c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Scalar_Tensor",
+      mul_scalar_add_autocast,
+      c10::DispatchKey::AutocastXPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Tensor_Scalar", mul_add_scalar, c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Tensor_Scalar",
+      mul_add_scalar_autocast,
+      c10::DispatchKey::AutocastXPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Scalar_Scalar", mul_scalar_add_scalar, c10::DispatchKey::XPU);
+  IPEX_OP_REGISTER_DISPATCH(
+      "mul_add.Scalar_Scalar",
+      mul_scalar_add_scalar_autocast,
+      c10::DispatchKey::AutocastXPU);
   IPEX_OP_REGISTER_DISPATCH(
       "packed_add", at::AtenIpexTypeXPU::packed_add, c10::DispatchKey::XPU);
   IPEX_OP_REGISTER_DISPATCH(
diff --git a/tests/gpu/examples/test_fusion.py b/tests/gpu/examples/test_fusion.py
@@ -1524,7 +1524,15 @@ def model_check(model):
                     modelJit(m1_dpcpp, m2_dpcpp, add1_dpcpp)
                 print(modelJit.graph_for(m1_dpcpp, m2_dpcpp, add1_dpcpp))
                 real = modelJit(m1_dpcpp, m2_dpcpp, add2_dpcpp)
-            self.assertEqual(raw, real.to(cpu_device))
+                self.assertEqual(raw, real.to(cpu_device))
+
+                with torch.xpu.amp.autocast(enabled=True, dtype=torch.float16):
+                    autocast_arg1 = modelJit(m1_dpcpp, m2_dpcpp, add1_dpcpp)
+                    self.assertEqual(raw, autocast_arg1.to(device=cpu_device, dtype=torch.float), atol=1e-5, rtol=1e-5)
+                with torch.xpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
+                    autocast_arg1 = modelJit(m1_dpcpp, m2_dpcpp, add1_dpcpp)
+                    self.assertEqual(raw, autocast_arg1.to(device=cpu_device, dtype=torch.float), atol=1e-5, rtol=1e-5)
+
             del modelJit
         model_check(MulAdd())
         model_check(MulAddScalar())
diff --git a/tests/gpu/examples/test_roi_align.py b/tests/gpu/examples/test_roi_align.py
@@ -97,7 +97,30 @@ def roi_align_forward_(self, dtype_):
         tol = 1e-2 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5
         torch.testing.assert_close(gt_y.cpu(), y.cpu(), rtol=tol, atol=tol)
 
+    def roi_align_autocast_forward_(self, dtype_):
+        device = torch.device('xpu')
+        pool_size = 5
+        n_channels = 2 * (pool_size**2)
+        x = torch.rand(2, n_channels, 10, 10, dtype=dtype_, device=device)
+        rois = torch.tensor(
+            [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9], [1, 0, 0, 9, 9]],  # format is (xyxy)
+            dtype=torch.float,
+            device=device,
+        )
+        pool_h, pool_w = pool_size, pool_size
+
+        with torch.xpu.amp.autocast(enabled=True, dtype=dtype_):
+            y = torch.xpu.roi_align(x, rois, [pool_h, pool_w], spatial_scale=1, sampling_ratio=-1)
+            gt_y = expected_fn(
+                x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=torch.float
+            )
+            tol = 1e-2 if dtype_ is torch.float16 else 1e-1
+            torch.testing.assert_close(gt_y.cpu(), y.to(torch.float).cpu(), rtol=tol, atol=tol)
+
     def test_roi_align_forward(self):
         for dtype in [torch.float, torch.half]:
             print('testing dtype:', dtype)
             self.roi_align_forward_(dtype)
+        for dtype in [torch.float16, torch.bfloat16]:
+            print('testing dtype in autocast: ', dtype)
+            self.roi_align_autocast_forward_(dtype)