Commit 951444b

Optimize PyTorch-LLGA integration overhead (#132)
* [LLGA] register additional profiling node
* [LLGA] handle profiling node for fusion pattern
* [LLGA] add type guard
* [LLGA] turn on the profiling mode for UTs
* [LLGA] add custom guard node
* [LLGA] add rw_mutex and refactor kernel class private members
* [LLGA] only guard shape when profiling mode is on
* [LLGA] cache inputSpecs_
* [LLGA] TypeCheck: remove check on grad due to benchmark throughput issue
* [LLGA] add test for ThroughputBenchmark with llga
* [LLGA] use rwlock in IPEX
* [LLGA] only support profiling mode
* fix clang format
* [LLGA] add note on TypeCheck rule
* [LLGA] TypeCheck: push false if input is not a tensor
* [LLGA] fix rwlock
* [LLGA] add no_grad to throughput benchmark UT
* [LLGA] use matchTensor after fixing throughput benchmark GradMode
* [LLGA] use symbol instead of schema to register op
* [LLGA] add no_grad in UT to ensure pass GradMode check
* [LLGA] remove hard-coded name string
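The guard introduced by this commit sits in front of each LLGA fusion group and decides, per call, whether the cached fused kernel may run. As a hedged, simplified Python restatement of what such a tensor type check compares (the real check is the C++ TypeCheck / ipex::LlgaFusionGuard in the diffs below; the function here is illustrative only):

import torch

def types_match(t, expected):
    # Simplified stand-in for the guard: non-tensor inputs fail the check outright,
    # and tensors must match the profiled shape, dtype, device and grad flag.
    return (isinstance(t, torch.Tensor)
            and t.shape == expected.shape
            and t.dtype == expected.dtype
            and t.device == expected.device
            and t.requires_grad == expected.requires_grad)

profiled = torch.randn(2, 3)                        # spec recorded during warm-up
print(types_match(torch.randn(2, 3), profiled))     # True: fused kernel may run
print(types_match(torch.randn(4, 3), profiled))     # False: fallback path runs
print(types_match([1, 2, 3], profiled))             # False: not a tensor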
1 parent 0447135 commit 951444b

File tree

12 files changed (+596, -182 lines)

tests/cpu/test_jit_llga_quantization_fuser.py

Lines changed: 57 additions & 6 deletions
@@ -31,13 +31,15 @@ def get_eltwise_fn(name):
 def llga_test_env(func):
     @wraps(func)
     def wrapTheFunction(*args):
-        torch._C._jit_set_profiling_mode(False)
-        torch._C._jit_set_profiling_executor(False)
+        # make sure that the profiling mode is turned on
+        torch._C._jit_set_profiling_mode(True)
+        torch._C._jit_set_profiling_executor(True)
+
+        ipex.core._jit_set_llga_enabled(True)
         ipex.core.disable_jit_opt()
         func(*args)
         ipex.core.enable_jit_opt()
-        torch._C._jit_set_profiling_mode(True)
-        torch._C._jit_set_profiling_executor(True)
+        ipex.core._jit_set_llga_enabled(False)
     return wrapTheFunction

 class TestOp(JitLlgaTestCase):
@@ -79,7 +81,7 @@ def test_conv2d_int8_in_f32_out(self):
         ]
         #TODO: enable torch.per_tensor_symmetric case.
         for qscheme in [torch.per_tensor_affine]:
-            graph = self.checkQuantizeTrace(m, [x], atol=2e-1, config_name="conv2d", qscheme=qscheme)
+            graph = self.checkQuantizeTrace(m, [x], x_var=[torch.rand(5, in_channels * g, spatial, spatial, requires_grad=False)], atol=2e-1, config_name="conv2d", qscheme=qscheme)
             self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 2)
             self.assertFused(graph, ['aten::_convolution', 'aten::quantize_per_tensor', 'aten::quantize_per_channel'])
             self.checkPatterns(graph, patterns)
@@ -308,7 +310,7 @@ def forward(self, x):
             ["aten::dequantize"]
         ]
         for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
-            graph = self.checkQuantizeTrace(m, [x], atol=1e-1, config_name="linear_eltwise", qscheme=qscheme)
+            graph = self.checkQuantizeTrace(m, [x], x_var=[torch.rand(2, 28, requires_grad=False)], atol=1e-1, config_name="linear_eltwise", qscheme=qscheme)
             self.assertGraphContainsExactly(graph, LLGA_FUSION_GROUP, 3)
             self.assertFused(graph, ['aten::' + eltwise])
             self.checkPatterns(graph, patterns)
@@ -423,6 +425,55 @@ def forward(self, x):
             self.assertFused(graph, ['aten::_convolution', 'aten::relu', 'aten::quantize_per_tensor', 'aten::quantize_per_channel', 'aten::dequantize'])
             self.checkPatterns(graph, patterns)

+class TestShapeFallback(JitLlgaTestCase):
+    @unittest.skipIf(True, 'Size peephole optimization not enabled yet')
+    @llga_test_env
+    def test_view_permute(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+
+            def forward(self, x):
+                new_x_shape = x.size()[:-1] + (3, 5)
+                x = x.view(*new_x_shape)
+                return x.permute(0, 2, 1, 3)
+
+        x = torch.randn(5, 10, 15)
+        m = M()
+
+        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
+            graph = self.checkQuantizeTrace(m, [x], config_name="view_permute", qscheme=qscheme)
+            self.assertGraphContainsExactly(graph, "aten::size", 0)
+            self.assertGraphContainsExactly(graph, "prim::ListConstruct", 0)
+
+            # change the size of the input
+            x2 = torch.randn(6, 4, 15)
+            # Bailout get triggered here
+            y2 = m(x2)
+
+    @llga_test_env
+    def test_conv_reshape(self):
+        class M(nn.Module):
+            def __init__(self):
+                super(M, self).__init__()
+                self.conv1 = nn.Conv2d(4, 4, 3, padding=1, bias=True)
+                self.conv2 = nn.Conv2d(4, 32, 3, padding=1, bias=True)
+
+            def forward(self, x):
+                x = self.conv1(x)
+                x = self.conv2(x).reshape(x.size(0), 4, -1)
+                return x
+
+        x = torch.randn(15, 4, 28, 28)
+        # change the size of the input, check the fallback
+        x_var = torch.randn(7, 4, 16, 16)
+        m = M()
+        for qscheme in [torch.per_tensor_affine, torch.per_tensor_symmetric]:
+            graph = self.checkQuantizeTrace(m, [x], x_var = [x_var], atol=2e-1, config_name="conv_reshape", qscheme=qscheme)
+
+            # TODO: enable this check when size peephole optimization is enabled
+            # self.assertGraphContainsExactly(graph, "aten::size", 0)
+
 class TestModel(JitLlgaTestCase):
     @skipIfNoTorchVision
     @llga_test_env
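The TestShapeFallback cases above rely on the profiling executor's bailout mechanism: a graph specialized for one input shape must still compute correct results when a differently shaped input arrives. A self-contained sketch of that behavior in plain TorchScript, with no IPEX involved (the module and shapes are illustrative, not taken from the commit):

import torch

torch._C._jit_set_profiling_mode(True)
torch._C._jit_set_profiling_executor(True)

class M(torch.nn.Module):
    def forward(self, x):
        # size/reshape pattern similar to the tests above
        return x.reshape(x.size(0), -1).relu()

m = torch.jit.script(M().eval())
with torch.no_grad():
    x = torch.randn(15, 4, 28, 28)
    m(x)                                # warm-up: profile and specialize for this shape
    m(x)                                # optimized plan with shape guards is in place
    y2 = m(torch.randn(7, 4, 16, 16))   # different shape: the guard fails and the fallback path runs
    assert y2.shape == (7, 4 * 16 * 16)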
Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+from functools import wraps
+
+import torch
+from torch.utils import ThroughputBenchmark
+from torch.testing import assert_allclose
+from torch.testing._internal.common_utils import run_tests, TestCase
+
+import intel_pytorch_extension as ipex
+from test_jit_llga_utils import JitLlgaTestCase, run_tests, LLGA_FUSION_GROUP
+from test_jit_llga_quantization_fuser import llga_test_env
+
+class LinearEltwise(torch.nn.Module):
+    def __init__(self, D_in, H, D_out):
+        super(LinearEltwise, self).__init__()
+        self.linear1 = torch.nn.Linear(D_in, H)
+        self.eltwise = torch.nn.ReLU()
+        self.linear2 = torch.nn.Linear(H, D_out)
+
+    def forward(self, x):
+        x = self.linear1(x)
+        x = self.eltwise(x)
+        x = self.linear2(x)
+        return x
+
+def freeze(model):
+    return torch.jit._recursive.wrap_cpp_module(torch._C._freeze_module(model._c, preserveParameters=True))
+
+class TestThroughputBenchmark(JitLlgaTestCase):
+    @llga_test_env
+    def test_linear_eltwise(self):
+        with torch.no_grad():
+            D_in = 10
+            H = 5
+            D_out = 15
+            B = 8
+
+            m = LinearEltwise(D_in, H, D_out)
+            x = torch.randn(B, D_in)
+
+            graph, m_llga, m_cpu = self.prepareModel(m, [x])
+
+            ipex.core._jit_set_llga_enabled(False)
+            module_result = m_cpu(x)
+            ipex.core._jit_set_llga_enabled(True)
+
+            bench = ThroughputBenchmark(m_llga)
+            bench.add_input(x)
+            bench_result = bench.run_once(x)
+
+            assert_allclose(bench_result, module_result, atol=1e-1, rtol=1e-2)
+
+            stats = bench.benchmark(
+                num_calling_threads=4,
+                num_warmup_iters=100,
+                num_iters=1000
+            )
+
+            print(stats)
+
+if __name__ == '__main__':
+    run_tests()
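For context, torch.utils.ThroughputBenchmark works the same way on a plain scripted module without IPEX; a minimal sketch using the same API calls as the test above (the module, thread count, and iteration counts are illustrative). It stays under torch.no_grad(), matching the test's requirement that the benchmark runs with autograd off:

import torch
from torch.utils import ThroughputBenchmark

m = torch.jit.script(torch.nn.Linear(10, 5).eval())
x = torch.randn(8, 10)

with torch.no_grad():
    bench = ThroughputBenchmark(m)
    bench.add_input(x)
    print(bench.run_once(x).shape)          # single call, returns the module output
    stats = bench.benchmark(num_calling_threads=2,
                            num_warmup_iters=10,
                            num_iters=100)
    print(stats)                            # latency statistics across the calling threads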

tests/cpu/test_jit_llga_utils.py

Lines changed: 30 additions & 8 deletions
@@ -73,7 +73,30 @@ def assertFused(self, graph, fused_patterns):
         for pat in fused_patterns:
             self.assertGraphContainsExactly(graph, pat, 0)

-    def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, folding=False, remove_dropout=False, config_name="", qscheme=torch.per_tensor_affine):
+    def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, folding=False, remove_dropout=False, config_name="", x_var=None, qscheme=torch.per_tensor_affine):
+        graph, model, fp32_model_with_quant_dequant = self.prepareModel(model, x, folding, remove_dropout, config_name, qscheme)
+        with torch.no_grad():
+            # calculate after getting the graph
+            y_llga = model(*x)
+
+            # disable llga for fp32 path
+            ipex.core._jit_set_llga_enabled(False)
+            y = fp32_model_with_quant_dequant(*x)
+            # test Fallback when input shape changes:
+            if x_var:
+                y_var = fp32_model_with_quant_dequant(*x_var)
+            ipex.core._jit_set_llga_enabled(True)
+
+            self.assertEqual(y, y_llga, atol=atol, rtol=rtol)
+
+            # test Fallback when input shape changes:
+            if x_var:
+                y_var_llga = model(*x_var)
+                self.assertEqual(y_var, y_var_llga, atol=atol, rtol=rtol)
+
+        return graph
+
+    def prepareModel(self, model, x, folding=False, remove_dropout=False, config_name="", qscheme=torch.per_tensor_affine):
         model.eval()
         with torch.no_grad(), torch._jit_internal._disable_emit_hooks():
             # fold conv bn
@@ -105,14 +128,13 @@ def checkQuantizeTrace(self, model, x, atol=1e-3, rtol=1e-2, folding=False, remo
             # freeze the module
             model = freeze(model)

-            # apply llga optimization pass
-            ipex.core._jit_llga_fuser(model.graph)
-
-            y = fp32_model_with_quant_dequant(*x)
-            y_llga = model(*x)
+            # warm up run
+            y0 = model(*x)

-            self.assertEqual(y, y_llga, atol=atol, rtol=rtol)
-            return model.graph
+            # get the graph at the second run after freezing
+            graph = model.graph_for(*x)
+
+            return graph, model, fp32_model_with_quant_dequant

     def checkPatterns(self, graph, patterns):
         fusion_groups = findFusionGroups(graph)
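A hedged sketch of why prepareModel above now does a warm-up run and then calls model.graph_for(*x): under the profiling executor, .graph is the generic unoptimized graph, while graph_for(...) returns the plan actually executed for those inputs after warm-up, which is where fusion groups and guards appear. Plain-PyTorch illustration (the module and shapes are placeholders, not from the commit):

import torch

torch._C._jit_set_profiling_mode(True)
torch._C._jit_set_profiling_executor(True)

m = torch.jit.script(torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU()).eval())
m = torch.jit.freeze(m)           # analogous in spirit to the freeze() helper in these tests
x = torch.randn(2, 4)
with torch.no_grad():
    m(x)                          # warm-up run records input profiles
    m(x)                          # second run executes the optimized plan
print(m.graph)                    # generic graph of forward()
print(m.graph_for(x))             # graph specialized/optimized for this input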

torch_ipex/csrc/jit/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -13,6 +13,8 @@ LIST(APPEND DPCPP_JIT_SRCS
   ${DPCPP_ROOT}/jit/fusion_pass.cpp
   ${DPCPP_ROOT}/jit/register_dnnl_jit_ops.cpp
   ${DPCPP_ROOT}/jit/graph_rewrite.cpp
+  ${DPCPP_ROOT}/jit/codegen/onednn/register_interface.cpp
+  ${DPCPP_ROOT}/jit/codegen/onednn/guard_shape.cpp
 )

 # Pass to parent

torch_ipex/csrc/jit/codegen/onednn/fusion_group_name.cpp

Lines changed: 6 additions & 1 deletion
@@ -5,11 +5,16 @@ namespace jit {
 namespace fuser {
 namespace onednn {

-const std::string& LlgaFusionGroupName() {
+const std::string &LlgaFusionGroupName() {
   static const std::string _LlgaFusionGroupName = "ipex::LlgaFusionGroup";
   return _LlgaFusionGroupName;
 }

+const std::string &LlgaGuardName() {
+  static const std::string LlgaGuardName = "ipex::LlgaFusionGuard";
+  return LlgaGuardName;
+}
+
 } // namespace onednn
 } // namespace fuser
 } // namespace jit

torch_ipex/csrc/jit/codegen/onednn/fusion_group_name.h

Lines changed: 6 additions & 3 deletions
@@ -8,9 +8,12 @@ namespace fuser {
 namespace onednn {

 // Workaround here. Once the PR of PyTorch LLGA bridge code has been landed
-// into the stock PyTorch, we could directly use the Symbol: prim::LlgaFusionGroup
-// instead of Symbol::fromQualString(LlgaFusionGroupName())
-extern const std::string& LlgaFusionGroupName();
+// into the stock PyTorch, we could directly use the Symbol:
+// prim::LlgaFusionGroup and prim::LlgaFusionGuard instead of
+// Symbol::fromQualString(LlgaFusionGroupName()) and
+// Symbol::fromQualString(LlgaGuardName())
+extern const std::string &LlgaFusionGroupName();
+extern const std::string &LlgaGuardName();

 } // namespace onednn
 } // namespace fuser
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
+#include "jit/codegen/onednn/guard_shape.h"
+#include "jit/codegen/onednn/fusion_group_name.h"
+
+#include <torch/csrc/jit/jit_log.h>
+#include <torch/csrc/jit/passes/tensorexpr_fuser.h>
+#include <torch/csrc/jit/runtime/graph_executor.h>
+
+namespace torch {
+namespace jit {
+namespace fuser {
+namespace onednn {
+
+using tensor_type_converter_t =
+    c10::function_ref<TensorTypePtr(const TensorTypePtr &t)>;
+
+void insertTypeGuardForFusionGroup(Node *guarded_node,
+                                   tensor_type_converter_t type_converter,
+                                   Symbol kind) {
+  GRAPH_DEBUG("Inserting a typecheck guard for a node", *guarded_node);
+  auto subgraph = guarded_node->g(attr::Subgraph);
+
+  // Fixup types of the subgraph inputs
+  std::vector<Value *> inputs_to_check;
+  std::vector<TypePtr> guard_types;
+  for (Value *input : guarded_node->inputs()) {
+    // We only check inputs of the guarded nodes and expect user to infer
+    // intermediates and outputs shapes
+    if (!input->type()->cast<TensorType>()) {
+      continue;
+    }
+
+    // fusion outputs are already guarded
+    if (input->node()->kind() == prim::Constant ||
+        input->node()->kind() ==
+            Symbol::fromQualString(LlgaFusionGroupName())) {
+      continue;
+    }
+    inputs_to_check.push_back(input);
+    guard_types.push_back(type_converter(input->type()->expect<TensorType>()));
+  }
+  if (!inputs_to_check.size()) {
+    return;
+  }
+
+  // Add ipex::LlgaFusionGuard node
+  //
+  // ipex::LlgaFusionGuard nodes look like the following:
+  //   %out1 : Float(2, 3), %out2 : Int(10, 30), %types_match : bool =
+  //       ipex::LlgaFusionGuard(%inp1 : Tensor, %inp2 : Tensor)
+  //
+  // They have N inputs whose types we are going to check and N+1 outputs. The
+  // first N outputs specify expected types and N+1-th output holds the result
+  // of the check (bool).
+  Node *typecheck_node =
+      guarded_node->owningGraph()
+          ->create(kind, inputs_to_check, inputs_to_check.size() + 1)
+          ->insertBefore(guarded_node);
+  typecheck_node->tys_(attr::types, guard_types);
+  Value *typecheck_result = typecheck_node->output(inputs_to_check.size());
+
+  std::unordered_map<Value *, Value *> typechecked_inputs;
+  for (size_t i = 0; i < typecheck_node->inputs().size(); ++i) {
+    typechecked_inputs[typecheck_node->input(i)] = typecheck_node->output(i);
+  }
+
+  // Fixup types of the typecheck node outputs, which are used by the op in
+  // execution
+  typecheck_node->output(inputs_to_check.size())->setType(BoolType::get());
+  for (size_t i = 0; i < typecheck_node->inputs().size(); ++i) {
+    typecheck_node->output(i)->setType(typecheck_node->input(i)->type());
+  }
+
+  // Insert if
+  auto versioning_if =
+      guarded_node->owningGraph()
+          ->create(prim::If, {typecheck_result}, guarded_node->outputs().size())
+          ->insertAfter(typecheck_node);
+  for (size_t idx = 0; idx < guarded_node->outputs().size(); ++idx) {
+    versioning_if->output(idx)->setType(guarded_node->output(idx)->type());
+    guarded_node->output(idx)->replaceAllUsesWith(versioning_if->output(idx));
+  }
+  auto true_block = versioning_if->addBlock();
+  auto false_block = versioning_if->addBlock();
+
+  // Fill in the false block. It should contain the unoptimized
+  // copy of the fused subgraph.
+  WithInsertPoint guard(false_block->return_node());
+  const auto subgraph_outputs = insertGraph(*guarded_node->owningGraph(),
+                                            *subgraph, guarded_node->inputs());
+  for (Value *output : subgraph_outputs) {
+    false_block->registerOutput(output);
+  }
+
+  // types get copied to the fallback graph, so remove specializations before
+  // replacing
+  removeTensorTypeSpecializations(false_block);
+  replaceBlockWithFallbackGraph(false_block, guarded_node->inputs());
+
+  // Fill in the true block. It has all inputs type-checked and its
+  // body should be the fusion group node.
+  guarded_node->moveBefore(true_block->return_node());
+  for (size_t idx = 0; idx < guarded_node->inputs().size(); ++idx) {
+    if (typechecked_inputs.count(guarded_node->input(idx))) {
+      guarded_node->replaceInput(
+          idx, typechecked_inputs.at(guarded_node->input(idx)));
+    }
+  }
+  for (Value *output : guarded_node->outputs()) {
+    true_block->registerOutput(output);
+  }
+}
+
+//! [ Note -- prepareFusionGroupAndGuardOutputs implementation ]
+//! shamelessly copying code from NNC (tensorexpr_fuser) with very little
+//! modification, original code at:
+//! `torch/csrc/jit/passes/tensorexpr_fuser.cpp:prepareFusionGroupAndGuardOutputs`
+//!
+//! We have the assumption that LLGA does not have operators
+//! depending on the content of the tensor.
+void prepareFusionGroupAndGuardOutputs(Block *block) {
+  std::vector<Node *> fusion_groups;
+  for (Node *n : block->nodes()) {
+    for (Block *b : n->blocks()) {
+      prepareFusionGroupAndGuardOutputs(b);
+    }
+    if (n->kind() == Symbol::fromQualString(LlgaFusionGroupName())) {
+      fusion_groups.push_back(n);
+    }
+  }
+  for (Node *fusion_group : fusion_groups) {
+    // TODO: add further optimization pass to removeOutputsUsedOnlyInSize,
+    // refer to
+    // `torch/csrc/jit/passes/tensorexpr_fuser.cpp:removeOutputsUsedOnlyInSize`
+    // removeOutputsUsedOnlyInSize(fusion_group);
+    insertTypeGuardForFusionGroup(
+        fusion_group, [](const TensorTypePtr &t) { return t; },
+        Symbol::fromQualString(fuser::onednn::LlgaGuardName()));
+  }
+}
+
+} // namespace onednn
+} // namespace fuser
+} // namespace jit
+} // namespace torch
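For orientation, a schematic of the graph shape this pass produces around each fusion group, extending the IR notation used in the comment inside insertTypeGuardForFusionGroup (value names and types are illustrative): the guard's boolean output drives a prim::If whose true branch runs the fused group on the type-checked values and whose false branch is the unoptimized fallback installed via replaceBlockWithFallbackGraph.

%t1 : Float(2, 3), %t2 : Float(2, 3), %ok : bool = ipex::LlgaFusionGuard(%inp1, %inp2)
%y : Tensor = prim::If(%ok)
  block0():   # types match: run the fused subgraph on the checked values
    %y1 : Tensor = ipex::LlgaFusionGroup(%t1, %t2)
    -> (%y1)
  block1():   # mismatch: fall back to the unoptimized copy of the subgraph
    %y2 : Tensor = prim::FallbackGraph(%inp1, %inp2)
    -> (%y2)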
