Skip to content

Commit 2af4ca8

Browse files
jiayisunx and EikanWang authored
align FuseTensorExprs with PyTorch (#883) (#893)
* align FuseTensorExprs with PyTorch * add UT Co-authored-by: Wang Weihan <eikan.wang@intel.com>
1 parent 0d4a314 commit 2af4ca8

File tree

2 files changed

+52
-1
lines changed

2 files changed

+52
-1
lines changed

intel_extension_for_pytorch/csrc/jit/fusion_pass.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,30 @@ bool isQuantized(const std::shared_ptr<Graph>& graph) {
227227
return checkQuantization(graph->block());
228228
}
229229

230+
// Returns the fusion behavior (STATIC vs DYNAMIC) that applies at the given
// remaining bailout depth, per the globally configured fusion strategies.
//
// Strategies are consumed front-to-back as recompilations burn depth, so we
// walk the strategy list from the back, accumulating each entry's depth
// budget until the accumulated total covers `remaining_depth`.
//
// @param remaining_depth  how much bailout depth is still unspent.
// @return the FusionBehavior of the strategy entry that owns that depth;
//         STATIC (with a warning) if the strategy list no longer covers it.
FusionBehavior getCurrentBehavior(size_t remaining_depth) {
  size_t curr_depth = 0;
  FusionStrategy strategies = getFusionStrategy();
  for (int i = static_cast<int>(strategies.size()) - 1; i >= 0; i--) {
    curr_depth += strategies[i].second;
    if (remaining_depth <= curr_depth) {
      return strategies[i].first;
    }
  }
  // should never get here: the strategy list must have been changed while an
  // invocation was in flight. Fixed typo in the warning ("Stratgy").
  TORCH_WARN("Strategy changed mid-invocation, NYI");
  return FusionBehavior::STATIC;
}
243+
244+
size_t getInstantiatedBailoutDepth() {
245+
// Initialize bailout_depth from command-line flag.
246+
size_t depth = 0;
247+
FusionStrategy fusion_strategy_ = getFusionStrategy();
248+
for (const auto& pair : fusion_strategy_) {
249+
depth += pair.second;
250+
}
251+
return depth;
252+
}
253+
230254
void FusionPass(std::shared_ptr<Graph>& graph) {
231255
GRAPH_DUMP(
232256
"Before RemoveProfileNodesAndSpecializeTypes. Beginning of "
@@ -260,7 +284,15 @@ void FusionPass(std::shared_ptr<Graph>& graph) {
260284
BatchMM(graph);
261285

262286
if (tensorExprFuserEnabled()) {
263-
FuseTensorExprs(graph, getFusionGroupInlining() ? 2 : 1);
287+
auto min_size = getFusionGroupInlining() ? 2 : 1;
288+
// Here we always get the first valid behavior per the global fusion
289+
// strategies configured by PyTorch (`getInstantiatedBailoutDepth` always
290+
// returns the maximum configured depth). This is because IPEX TE fusion is
291+
// only called the first time of the compilation while the later
292+
// re-compilations are triggered from inside PyTorch.
293+
bool dyn_shapes = getCurrentBehavior(getInstantiatedBailoutDepth()) ==
294+
FusionBehavior::DYNAMIC;
295+
FuseTensorExprs(graph, min_size, /* composed op*/ false, dyn_shapes);
264296
}
265297

266298
// Apply IPEX inplace optimization/replacement

tests/cpu/test_jit.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -917,6 +917,13 @@ def __init__(self, equation):
917917
def forward(self, input1, input2, bias):
918918
return bias.add_(torch.einsum(self.equation, input1, input2))
919919

920+
class AddMulDiv(nn.Module):
    """Elementwise (x * (x + 3)) / 6.

    Written with explicit torch.add / torch.mul / torch.div calls so the
    traced JIT graph contains a chain of aten ops for the TE fuser to fuse.
    """

    def __init__(self):
        super(AddMulDiv, self).__init__()

    def forward(self, input):
        shifted = torch.add(input, 3)
        product = torch.mul(input, shifted)
        return torch.div(product, 6)
926+
920927
class Tester(TestCase):
921928
@contextlib.contextmanager
922929
def _texpr_enable(self, strategy):
@@ -3137,6 +3144,18 @@ def forward(self, x):
31373144
kind_not_in_graph="aten::mul",
31383145
prec=0.1)
31393146

3147+
def test_TEfusion_with_dynamic_input(self):
    # Trace/freeze once at a fixed shape, then check that the TE-fused
    # graph still matches eager results across varying (incl. empty)
    # batch sizes, exercising the dynamic-shape fusion path.
    model = AddMulDiv().eval()
    with torch.no_grad():
        traced_model = torch.jit.trace(model, torch.randn(11, 3, 20, 20)).eval()
        traced_model = torch.jit.freeze(traced_model)

        # NOTE(review): indentation of the loop relative to no_grad was lost
        # in extraction; grad mode does not affect this pure-arithmetic model.
        for batch_size in range(5):
            sample = torch.randn(batch_size, 3, 20, 20)
            traced_out = traced_model(sample)
            eager_out = model(sample)
            self.assertEqual(traced_out, eager_out)
3158+
31403159
def test_hardsigmoid_mul(self):
31413160
class HardsigmoidMul(nn.Module):
31423161
def __init__(self) -> None:

0 commit comments

Comments
 (0)