Commit 7b2b561

fix bf16 runtime error when a CPU device doesn't meet the OneDNN ISA requirement (#867)
Co-authored-by: Chunyuan WU <chunyuan.wu@intel.com>
1 parent d9a2680 commit 7b2b561

11 files changed: +117 -64 lines changed

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv.cpp

Lines changed: 8 additions & 6 deletions

@@ -1,4 +1,5 @@
 #include "csrc/aten/cpu/WeightPack.h"
+#include "csrc/cpu/ideep/ideep.hpp"
 #include "csrc/jit/cpu/kernels/OpContext.h"
 #include "csrc/jit/cpu/passes/utils.h"
 #include "graph_rewrite.h"
@@ -106,19 +107,20 @@ void insertPrePackedConvOp(Block* b) {
     IValue input_size_value(input_size_option.value());
     if (n->kind() == aten::conv1d || n->kind() == aten::conv2d ||
         n->kind() == aten::conv3d) {
-      auto weight_size_option = n->inputs()
-                                    .at(1)
-                                    ->type()
-                                    ->cast<TensorType>()
-                                    ->sizes()
-                                    .concrete_sizes();
+      auto weight_tensor_type = n->inputs().at(1)->type()->cast<TensorType>();
+      auto weight_size_option = weight_tensor_type->sizes().concrete_sizes();
       // weight has no shape info, will not do weight prepack.
       if (!(weight_size_option.has_value() &&
             (weight_size_option.value().size() == 3 ||
              weight_size_option.value().size() == 4 ||
              weight_size_option.value().size() == 5))) {
         continue;
       }
+      const auto dtype = weight_tensor_type->scalarType();
+      if (dtype.has_value() && *dtype == at::ScalarType::BFloat16 &&
+          !ideep::has_bf16_type_support()) {
+        continue;
+      }
       bool w_is_channels_last = false;
       if (constant_as<at::Tensor>(n->namedInput("weight")).has_value()) {
         at::Tensor weight_tensor =

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_conv_transpose.cpp

Lines changed: 8 additions & 7 deletions

@@ -1,3 +1,4 @@
+#include "csrc/cpu/ideep/ideep.hpp"
 #include "graph_rewrite.h"
 #include "graph_rewrite_utils.h"
 #include "utils.h"
@@ -35,19 +36,19 @@ void insertPrePackedConvTransposeOpForATen(Block* b) {
     }
     IValue input_size_value(input_size_option.value());

-    auto weight_size_option = n->inputs()
-                                  .at(1)
-                                  ->type()
-                                  ->cast<TensorType>()
-                                  ->sizes()
-                                  .concrete_sizes();
+    auto weight_tensor_type = n->inputs().at(1)->type()->cast<TensorType>();
+    auto weight_size_option = weight_tensor_type->sizes().concrete_sizes();
     // weight has no shape info, will not do weight prepack.
     if (!(weight_size_option.has_value() &&
           (weight_size_option.value().size() == 4 ||
           weight_size_option.value().size() == 5))) {
       continue;
     }
-
+    const auto dtype = weight_tensor_type->scalarType();
+    if (dtype.has_value() && *dtype == at::ScalarType::BFloat16 &&
+        !ideep::has_bf16_type_support()) {
+      continue;
+    }
     // # padding - output_padding + stride <= 0 unsupported in mkldnn
     auto stride = toIValue(n->input(3))->toIntList();
     auto padding = toIValue(n->input(4))->toIntList();

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_linear.cpp

Lines changed: 4 additions & 1 deletion

@@ -1,5 +1,7 @@
 #include <ATen/code_template.h>
+#include "csrc/cpu/ideep/ideep.hpp"
 #include "csrc/jit/cpu/passes/utils.h"
+
 #include "graph_rewrite.h"
 #include "graph_rewrite_utils.h"

@@ -98,7 +100,8 @@ void insertPrePackedLinearOp(Block* b, std::unordered_set<Node*>& aten_linear) {
     }
     auto weight_dtype_option = tt->scalarType();
     if (!(weight_dtype_option.has_value() &&
-          (weight_dtype_option.value() == at::ScalarType::BFloat16) ||
+          (weight_dtype_option.value() == at::ScalarType::BFloat16) &&
+          ideep::has_bf16_type_support() ||
           aten_linear.find(n) == aten_linear.end())) {
       continue;
     }
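
All three rewrite passes above now skip weight prepack when the weights are BF16 but ideep::has_bf16_type_support() is false, so JIT-compiled models fall back to the stock ATen kernels instead of failing inside oneDNN at runtime. A minimal sketch of the effect from the Python side (illustrative only; the model and shapes are not from this commit):

import torch
import intel_extension_for_pytorch as ipex

# Before this fix, freezing and running a bf16 conv on a CPU without
# avx512bw/avx512vl/avx512dq could hit a oneDNN runtime error; with the
# guard, the prepack rewrite is skipped and the original aten::conv2d runs.
conv = torch.nn.Conv2d(3, 8, kernel_size=3).eval().to(torch.bfloat16)
x = torch.randn(1, 3, 32, 32).to(torch.bfloat16)
with torch.no_grad():
    traced = torch.jit.freeze(torch.jit.trace(conv, x))
    out = traced(x)
print(out.shape)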

intel_extension_for_pytorch/csrc/python/init_python_bindings.cpp

Lines changed: 6 additions & 2 deletions

@@ -24,8 +24,8 @@
 #include "intel_extension_for_pytorch/csrc/jit/auto_opt_config.h"
 #include "intel_extension_for_pytorch/csrc/utils/env_settings.h"
 #include "intel_extension_for_pytorch/csrc/utils/fpmath_mode.h"
+#include "intel_extension_for_pytorch/csrc/utils/onednn_utils.h"
 #include "intel_extension_for_pytorch/csrc/utils/rw_lock.h"
-#include "intel_extension_for_pytorch/csrc/utils/verbose.hpp"

 #include <c10/core/DeviceType.h>
 #include <torch/csrc/Exceptions.h>
@@ -73,7 +73,11 @@ void InitIpexModuleBindings(py::module m) {
     EnvSettings::get_instance().set_settings_profile_op(b_enable);
   });

-  m.def("mkldnn_set_verbose", &torch_ipex::verbose::_mkldnn_set_verbose);
+  m.def("mkldnn_set_verbose", &torch_ipex::utils::onednn_set_verbose);
+  m.def("onednn_has_bf16_support", []() {
+    return torch_ipex::utils::onednn_has_bf16_type_support();
+  });
+
   // ipex amp autocast
   m.def("get_autocast_dtype", []() {
     at::ScalarType current_dtype = torch_ipex::autocast::get_autocast_dtype();
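
The added onednn_has_bf16_support binding can be probed directly from Python; a minimal sketch (same private _C import the test file below uses):

import intel_extension_for_pytorch._C as core

# True only when the CPU meets oneDNN's bf16 ISA requirements
# (avx512bw, avx512vl and avx512dq, per the assertion added in frontend.py).
print(core.onednn_has_bf16_support())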
intel_extension_for_pytorch/csrc/utils/onednn_utils.cpp

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+#include "onednn_utils.h"
+
+#include "csrc/cpu/ideep/ideep.hpp"
+
+namespace torch_ipex {
+namespace utils {
+
+int onednn_set_verbose(int level) {
+  return ideep::utils::set_verbose(level);
+}
+
+bool onednn_has_bf16_type_support() {
+  return ideep::has_bf16_type_support();
+}
+
+} // namespace utils
+} // namespace torch_ipex
intel_extension_for_pytorch/csrc/utils/onednn_utils.h

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+namespace torch_ipex {
+namespace utils {
+
+int onednn_set_verbose(int level);
+bool onednn_has_bf16_type_support();
+
+} // namespace utils
+} // namespace torch_ipex

intel_extension_for_pytorch/csrc/utils/verbose.cpp

Lines changed: 0 additions & 13 deletions
This file was deleted.

intel_extension_for_pytorch/csrc/utils/verbose.hpp

Lines changed: 0 additions & 7 deletions
This file was deleted.

intel_extension_for_pytorch/frontend.py

Lines changed: 4 additions & 0 deletions

@@ -287,6 +287,10 @@ def optimize(
             optimized_model, optimized_optimizer, params_attr = utils._weight_cast.weight_dtype_convert_with_ipex(
                 optimized_model, optimized_optimizer, params_attr, opt_properties.split_master_weight_for_bf16)
         if opt_properties.weights_prepack:
+            if dtype == torch.bfloat16:
+                assert core.onednn_has_bf16_support(), \
+                    "BF16 weight prepack needs the cpu support avx512bw, avx512vl and avx512dq, " + \
+                    "please set dtype to torch.float or set weights_prepack to False."
             optimized_model, optimized_optimizer, params_attr = utils._weight_prepack.weight_prepack_with_ipex(
                 optimized_model, optimized_optimizer, params_attr, opt_properties.auto_kernel_selection)
         # TODO: model list, optimizer list.
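
With this guard, ipex.optimize now fails fast with a clear message instead of erroring later inside oneDNN. A hedged sketch of the two fallbacks the message suggests (model is illustrative, not from this commit):

import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Conv2d(3, 64, kernel_size=7).eval()
try:
    model = ipex.optimize(model, dtype=torch.bfloat16)
except AssertionError:
    # Either stay in fp32 ...
    # model = ipex.optimize(model, dtype=torch.float)
    # ... or keep bf16 and skip weight prepack, as the message suggests.
    model = ipex.optimize(model, dtype=torch.bfloat16, weights_prepack=False)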

tests/cpu/test_ipex_optimize.py

Lines changed: 16 additions & 0 deletions

@@ -1,5 +1,6 @@
 import torch
 import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch._C as core
 from intel_extension_for_pytorch.nn.utils._weight_prepack import _IPEXLinear as _IPEXLinear, _IPEXConv2d as _IPEXConv2d
 from torch.testing._internal.common_utils import TestCase
 from torch.optim import Adadelta, Adagrad, Adam, AdamW, Adamax, ASGD, RMSprop, Rprop, SGD
@@ -126,6 +127,21 @@ def forward(self, x):
                                     "WARNING: Can't convert model's parameters dtype"):
             optimized_model = ipex.optimize(model.eval(), dtype=torch.bfloat16)

+    def test_optimize_bf16_upsupported(self):
+        class Conv(torch.nn.Module):
+            def __init__(self,):
+                super(Conv, self).__init__()
+                self.conv = torch.nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
+
+            def forward(self, x):
+                return self.conv(x)
+
+        model = Conv()
+        if not core.onednn_has_bf16_support():
+            msg = r"BF16 weight prepack needs the cpu support avx512bw, avx512vl and avx512dq, please set dtype to torch.float or set weights_prepack to False."
+            with self.assertRaisesRegex(AssertionError, msg):
+                optimized_model = ipex.optimize(model.eval(), dtype=torch.bfloat16)
+
     def test_optimize_unsupport_freeze_optimization(self):
         model = ConvBatchNorm().eval()
         x = model.input1
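
Note that the new case only exercises the assertion on machines where core.onednn_has_bf16_support() returns False; on AVX512-capable hosts it is effectively a no-op. Assuming a pytest-style runner works against this TestCase (it usually does), the single test can be selected with:

python -m pytest tests/cpu/test_ipex_optimize.py -k test_optimize_bf16_upsupported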
