Commit 3223ca7
Complete the pattern matching for matmul + div and modify the out parameter's semantics for this pattern (#862) (#903)
* Complete the pattern matching for matmul + div and modify the out parameter's semantics for this pattern
* clang-format
* clang-format
* Remove OpFuser
* Explicitly mark the out parameter and the return value with alias annotations in the fused kernel's signature; add a UT to clarify that when the div is the out-of-place version and writing to the out parameter would be an observable side effect, the pattern is not replaced by our fusion
* Remove the ill-illustrated UT
* Complete UT coverage
* Add two UTs to demonstrate that we are free of side effects when we replace div with div_ in this pattern
1 parent b7b9359 commit 3223ca7
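The out-parameter semantics change can be seen in the Matmul.cpp diff below: when an out tensor is supplied, the fused kernel now divides it in place, so out ends up holding the final matmul-divided result and the return value aliases it (previously out held only the raw matmul product). The following standalone sketch is written against the public libtorch C++ API, not the IPEX kernel itself; it only mirrors the call sequence of dil_matmul_div's out path, and the shapes and divisor are arbitrary.

#include <torch/torch.h>
#include <iostream>

int main() {
  // Mirror of the dil_matmul_div "out" path as changed in this commit:
  // matmul into the user-provided buffer, then divide it in place.
  at::Tensor a = torch::randn({2, 3});
  at::Tensor b = torch::randn({3, 4});
  at::Tensor out = torch::empty({2, 4});

  at::matmul_out(out, a, b);          // out <- a @ b
  at::Tensor result = out.div_(2.0);  // in-place: out now holds (a @ b) / 2

  // The return value aliases the out buffer, which is what the Tensor(a!)
  // annotations in the fused op's schema declare to the JIT.
  std::cout << std::boolalpha
            << (result.data_ptr() == out.data_ptr()) << std::endl;  // true
  return 0;
}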

File tree

6 files changed (+174, -278 lines)

intel_extension_for_pytorch/csrc/jit/cpu/kernels/Matmul.cpp

Lines changed: 2 additions & 2 deletions

@@ -84,10 +84,10 @@ at::Tensor dil_matmul_div(
 
   if (out.defined()) {
     at::matmul_out(out, tensor1, tensor2);
-    return out.div(div_input);
+    return out.div_(div_input);
   }
   auto output = at::matmul(tensor1, tensor2);
-  return output.div(div_input);
+  return output.div_(div_input);
 }
 
 /**
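On the no-out path, the intermediate produced by at::matmul is a fresh tensor whose only consumer is the division, so dividing it in place is observationally equivalent to the out-of-place div. This is the property the new unit tests in this PR are meant to demonstrate; the sketch below illustrates the same argument with plain libtorch calls, independent of the IPEX kernel.

#include <torch/torch.h>
#include <iostream>

int main() {
  at::Tensor t1 = torch::randn({4, 8});
  at::Tensor t2 = torch::randn({8, 4});
  at::Tensor z = torch::full({}, 3.0);

  // Out-of-place reference result.
  at::Tensor ref = at::matmul(t1, t2).div(z);

  // Shape of the fused pattern: the matmul output is a temporary that nothing
  // else references, so mutating it with div_ cannot be observed elsewhere.
  at::Tensor tmp = at::matmul(t1, t2);
  at::Tensor fused_like = tmp.div_(z);

  std::cout << std::boolalpha
            << torch::allclose(ref, fused_like) << std::endl;  // true
  return 0;
}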

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.cpp

Lines changed: 37 additions & 0 deletions

@@ -218,6 +218,43 @@ void FuseAddLayerNorm(std::shared_ptr<Graph>& graph) {
   rewriter_aten.runOnGraph(graph);
 }
 
+void FuseMatmulDiv(std::shared_ptr<Graph>& graph) {
+  const std::string div_str = R"(div)";
+  const std::string div_inplace_str = R"(div_)";
+  std::vector<std::string> div_ops = {div_str, div_inplace_str};
+
+  auto aten_pattern = at::jit::CodeTemplate(R"(
+     graph(%x, %y, %z):
+        %mm_res = aten::matmul(%x, %y)
+        %div_res = aten::${div_op}(%mm_res, %z)
+        return (%div_res) )");
+
+  auto aten_pattern_with_out = at::jit::CodeTemplate(R"(
+     graph(%x, %y, %z, %out):
+        %mm_res = aten::matmul(%x, %y, %out)
+        %div_res = aten::${div_op}(%mm_res, %z)
+        return (%div_res) )");
+
+  std::string fused_matmul_div = R"(
+     graph(%x, %y, %z):
+        %r = ipex::matmul_div(%x, %y, %z)
+        return (%r) )";
+  std::string fused_matmul_div_with_out = R"(
+     graph(%x, %y, %z, %out):
+        %r = ipex::matmul_div(%x, %y, %out, %z)
+        return (%r) )";
+  for (auto const& it : div_ops) {
+    at::jit::TemplateEnv env;
+    env.s("div_op", it);
+
+    SubgraphRewriter rewriter;
+    rewriter.RegisterRewritePattern(aten_pattern.format(env), fused_matmul_div);
+    rewriter.RegisterRewritePattern(
+        aten_pattern_with_out.format(env), fused_matmul_div_with_out);
+    rewriter.runOnGraph(graph);
+  }
+}
+
 // MHA fusion covers aten::softmax, ipex::softmax and ipex::softmax_:
 // (1) MHA obviously shows better performance than aten div/matmul/add/softmax.
 // (2) MHA also shows better performance than aten add + matmul_div fusion
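FuseMatmulDiv is a declarative pattern rewrite: each matmul followed by div (or div_) that matches one of the template graphs is replaced with a single ipex::matmul_div node. For readers unfamiliar with the mechanism, here is a minimal, self-contained sketch using the stock torch::jit::SubgraphRewriter and parseIR; it is not the IPEX pass, it drops the ${div_op} templating, and my::matmul_div is a placeholder symbol for illustration only, not the registered IPEX op.

#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/ir/irparser.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#include <iostream>

int main() {
  using namespace torch::jit;

  // A toy graph containing the aten::matmul + aten::div pattern.
  auto graph = std::make_shared<Graph>();
  parseIR(R"IR(
    graph(%x : Tensor, %y : Tensor, %z : Tensor):
      %mm : Tensor = aten::matmul(%x, %y)
      %res : Tensor = aten::div(%mm, %z)
      return (%res))IR",
          graph.get());

  // Pattern and replacement, in the same textual form FuseMatmulDiv builds
  // from its CodeTemplate strings.
  const std::string pattern = R"(
    graph(%x, %y, %z):
      %mm = aten::matmul(%x, %y)
      %res = aten::div(%mm, %z)
      return (%res))";
  const std::string fused = R"(
    graph(%x, %y, %z):
      %res = my::matmul_div(%x, %y, %z)
      return (%res))";

  SubgraphRewriter rewriter;
  rewriter.RegisterRewritePattern(pattern, fused);
  rewriter.runOnGraph(graph);

  // The matmul/div pair should now appear as a single my::matmul_div node.
  std::cout << *graph;
  return 0;
}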

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.h

Lines changed: 1 addition & 0 deletions

@@ -56,6 +56,7 @@ void fuseLinearWithEltwise(std::shared_ptr<Graph>& graph);
 void fuseLinearAddRelu(std::shared_ptr<Graph>& graph);
 
 void FuseAddLayerNorm(std::shared_ptr<Graph>& graph);
+void FuseMatmulDiv(std::shared_ptr<Graph>& graph);
 void FuseConcatBnRelu(std::shared_ptr<Graph>& graph);
 
 void insertPrePackedConvTransposeOp(std::shared_ptr<Graph>& graph);

intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp

Lines changed: 4 additions & 4 deletions

@@ -729,8 +729,8 @@ RegisterOperators op({
         },
         aliasAnalysisFromSchema()),
     Operator(
-        "ipex::matmul_div(Tensor left, Tensor right, Tensor? out_opt, Tensor "
-        "div_input) -> Tensor",
+        "ipex::matmul_div(Tensor left, Tensor right, Tensor(a!) out_opt, Tensor "
+        "div_input) -> Tensor(a!)",
         [](const Node* node) -> Operation {
           return [](Stack* stack) {
             auto result = dil_matmul_div(
@@ -746,8 +746,8 @@ RegisterOperators op({
         aliasAnalysisFromSchema()),
 
     Operator(
-        "ipex::matmul_div(Tensor left, Tensor right, Tensor? out_opt, Scalar "
-        "div_input) -> Tensor",
+        "ipex::matmul_div(Tensor left, Tensor right, Tensor(a!) out_opt, Scalar "
+        "div_input) -> Tensor(a!)",
         [](const Node* node) -> Operation {
           return [](Stack* stack) {
             auto result = dil_matmul_div(
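The only change here is in the schema strings: out_opt goes from Tensor? to Tensor(a!) and the return type from Tensor to Tensor(a!), telling the JIT's alias analysis that the op writes into out_opt and that the returned tensor aliases it. As a hedged illustration of what that annotation means (not IPEX code), the snippet below parses the same schema text with torch::jit::parseSchema and checks that the op is now flagged as mutating its input.

#include <torch/csrc/jit/frontend/function_schema_parser.h>
#include <iostream>

int main() {
  // The fused op's schema as registered in this commit. "(a!)" names an
  // alias set `a` and "!" marks it as written to; reusing the same set on
  // the return value says the output aliases out_opt.
  auto schema = torch::jit::parseSchema(
      "ipex::matmul_div(Tensor left, Tensor right, Tensor(a!) out_opt, "
      "Tensor div_input) -> Tensor(a!)");

  std::cout << schema << std::endl;
  // True: at least one argument (out_opt) may be mutated by the op, so the
  // JIT must not reorder or eliminate later reads of that buffer.
  std::cout << std::boolalpha << schema.is_mutable() << std::endl;
  return 0;
}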

intel_extension_for_pytorch/csrc/jit/fusion_pass.cpp

Lines changed: 1 addition & 262 deletions

@@ -40,263 +40,6 @@ struct hash<std::pair<Symbol, Symbol>> {
 
 namespace torch {
 namespace jit {
-//
-// The main goal of MKL-DNN fusion is to limit bandwidth wasting.
-// MKL-DNN provided post ops to fuse ops in its output stage
-// What we could do is listed inside RuleTab.
-//
-class OpFuser {
-  Block* block_;
-  std::unique_ptr<AliasDb> aliasDb_;
-  std::shared_ptr<Graph> graph_;
-  using Symbols = std::vector<Symbol>;
-  using RuleTab = std::unordered_map<::std::pair<Symbol, Symbol>, Symbol>;
-  using Rule = RuleTab::iterator;
-  static RuleTab dnnlRules;
-
- public:
-  OpFuser(Block* block, std::shared_ptr<Graph> graph)
-      : block_(block), graph_(std::move(graph)) {}
-
-  void run() {
-    bool any_changed = true;
-    while (any_changed) {
-      any_changed = false;
-      refreshAliasDb();
-      for (auto it = block_->nodes().begin(); it != block_->nodes().end();) {
-        bool changed;
-        std::tie(it, changed) = processNode(*it);
-        any_changed |= changed;
-      }
-    }
-
-    refreshAliasDb();
-
-    for (Node* node : block_->nodes()) {
-      for (Block* sub : node->blocks()) {
-        OpFuser(sub, graph_).run();
-      }
-    }
-  }
-
-  c10::optional<Rule> isFusable(Node* curr, Node* prev) const {
-    // Is it happening in our case ???
-    if (curr->owningBlock() != block_)
-      return c10::nullopt;
-
-    auto choice = dnnlRules.find({prev->kind(), curr->kind()});
-    if (choice != dnnlRules.end())
-      return choice;
-
-    return c10::nullopt;
-  }
-
-  void refreshAliasDb() {
-    aliasDb_ = std::make_unique<AliasDb>(graph_);
-  }
-
-  Node* fuseOpsWithNewKind(Node* curr, Value* v, Graph* g, NodeKind kind) {
-    auto newNode = g->create(kind);
-    auto prev = v->node();
-    newNode->insertBefore(prev);
-    newNode->setScope(prev->scope());
-    newNode->copyAttributes(*prev);
-
-    for (auto input : prev->inputs()) {
-      newNode->addInput(input);
-    }
-
-    for (auto input : curr->inputs()) {
-      if (input != v) {
-        newNode->addInput(input);
-      }
-    }
-
-    // Copy curr or prev?
-    newNode->output()->copyMetadata(prev->output());
-    newNode->output()->setType(prev->output()->type());
-
-    v->replaceAllUsesWith(newNode->output());
-    curr->replaceAllUsesWith(newNode);
-
-    prev->destroy();
-    curr->destroy();
-
-    return newNode;
-  }
-
-  Node* fuseNodes(Node* curr, Value* path, Rule rule) {
-    return fuseOpsWithNewKind(curr, path, curr->owningGraph(), rule->second);
-  }
-
-  bool aliasIsSafeForSquashingValue(Node* node, Value* v) {
-    bool safe = false;
-    auto prev = v->node();
-    if (aliasDb_->moveAfterTopologicallyValid(node, prev)) {
-      if (v->uses().size() == 1 ||
-          aliasDb_->mayAlias /* mustAlias */ (v, node->output())) {
-        safe = true;
-      }
-    }
-    return safe;
-  }
-
-  //
-  // Check whether we could change specific input to be inplace with output
-  // Any use topologically after node will fail it.
-  // XXX: haven't considered loop
-  //
-  bool aliasIsSafeForInplaceValue(Node* node, Value* v) {
-    for (auto use : v->uses())
-      if (use.user->isAfter(node))
-        return false;
-
-    return true;
-  }
-
-  const FunctionSchema& matchSchemaForFusion(
-      c10::Symbol symbol,
-      Node* prev,
-      Node* node) {
-    auto ops = getAllOperatorsFor(symbol);
-
-    for (auto& op : ops) {
-      auto& schema = op->schema();
-      if (schema.arguments().size() ==
-              prev->inputs().size() + node->inputs().size() - 1 &&
-          schema.returns().size() == node->outputs().size())
-        return schema;
-    }
-
-    // throw
-    auto er = ErrorReport(node->sourceRange());
-    er << "Schema not found for fusion process. \n";
-    er << "Prev: " << *prev << "\n";
-    er << "Node: " << *node << "\n";
-
-    if (ops.size() > 0) {
-      er << "\ncandidates were:\n";
-      for (auto& op : ops)
-        er << "  " << op->schema() << "\n";
-    } else {
-      er << "\nno candidates found\n";
-    }
-    er << "within the graph:\n";
-    er << *node->owningGraph() << "\n";
-    throw er;
-  }
-
-  bool aliasIsSafeForFusion(Node* node, Value* v, c10::optional<Rule> r) {
-    bool safe = false;
-    // Returns false if the two nodes to be fused do not have the same owning
-    // block
-    if (node->owningBlock() != v->node()->owningBlock()) {
-      return safe;
-    }
-    // TODO: it might be flawed because we don't have 'alias must' information
-    //
-    // Simple fusion, unary ops:
-    // Example: conv2d -> relu to conv2d_relu
-    //
-    // To maintain equivalence before and after fusion, we have some rules:
-    // 1. Op could be moved safely right after the op it fuse to.
-    // 2. If one of node's input and output are alias must (relu_?), we could
-    // replace all uses of input to use output, which remove the use that might
-    // clogging the fuse path which is to be squashed.
-    // 3. If there is no alias between input and output, we can only fuse the
-    // case when there is only use.
-    //
-    // Y-merge (conv-sum-relu?)
-    // 4. We aquire alias info from resulted op schema, check whether the fusion
-    // is not breaking any computational semantics.
-    //
-    // A Y-merge fusion, like:
-    //    conv2d_inputs      | or |   conv2d_inputs
-    //      /    |           |    |      |     \
-    //     x   conv2d        |    |   conv2d    x
-    //      \   /            |    |      \     /
-    //       add             |    |       add
-    //        |              |    |        |
-    //        y              |    |        y
-    //
-    // both to:
-    //
-    //    conv2d_inputs   x(a!)
-    //          \         /
-    //          conv2d_sum
-    //               |
-    //             y(a!)
-    //
-    // Which y is alias to x, we check whether later is equivalent to formal.
-    // The params convention when we do Y-merge: arguments from both ops comes
-    // to new op in topological order. So in the exmaple conv2d's inputs comes
-    // first then sum's inputs (without the input which is squashed).
-    //
-    safe = aliasIsSafeForSquashingValue(node, v);
-
-    //
-    // Y-merge like case
-    //
-    if (safe && node->inputs().size() > 1) {
-      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(r);
-      auto rule = *r.value();
-      auto& schema = matchSchemaForFusion(rule.second, v->node(), node);
-      auto o_schema = node->schema();
-
-      auto pos = v->node()->inputs().size();
-
-      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-          schema.arguments().size() == pos + node->inputs().size() - 1);
-
-      for (int i = 0; i < node->inputs().size(); ++i) {
-        if (node->input(i) != v) { /* avoid squashing path */
-          auto aliasInfo = schema.arguments()[pos++].alias_info();
-          if (!aliasInfo)
-            continue;
-
-          // Introdued new alias write to
-          if (aliasInfo->isWrite()) {
-            auto old_info = o_schema.arguments()[i].alias_info();
-            if (!old_info || !old_info->isWrite()) {
-              // Introduced new written to alias
-              safe = safe && aliasIsSafeForInplaceValue(node, node->input(i));
-            }
-          }
-        }
-      }
-
-      // XXX: Do we have to handle output alias change case?
-    }
-    return safe;
-  }
-
-  std::pair<graph_node_list::iterator, bool> processNode(Node* node) {
-    Node* pos = node;
-    bool changed = false;
-
-    //
-    // Check whether we could fuse to one certain value path
-    //
-    for (auto* v : node->inputs()) {
-      auto prev = v->node();
-      auto fuseRule = isFusable(node, prev);
-
-      // We can fuse only one path
-      if (fuseRule && aliasIsSafeForFusion(node, v, fuseRule)) {
-        pos = fuseNodes(node, v, fuseRule.value());
-        changed = true;
-        break;
-      }
-    }
-    return std::make_pair(++pos->iterator(), changed);
-  }
-};
-
-// TODO: These rules should be more scalable
-OpFuser::RuleTab OpFuser::dnnlRules = {
-    {{aten::matmul, aten::div}, ipex::matmul_div},
-};
-
 // Including in-place optimizations that try to (conditionally)
 // replace the origin op with in-place opted one for better performance.
 // This in-place optimized ops may come from either oneDNN or aten
@@ -440,11 +183,7 @@ void IPEXFusionPass(std::shared_ptr<Graph>& graph) {
 
   // Fuse operators as shuffle
   graph_rewrite::FuseShuffle(graph);
-
-  // Pattern based fusion was lack of alias analysis
-  // ??? It may either be too conservative or too aggressive ???
-  // getSubgraphRewriter().runOnGraph(graph);
-  OpFuser(graph->block(), graph).run();
+  graph_rewrite::FuseMatmulDiv(graph);
 
   // replace aten max_pool2d with ipex max_pool2d
   graph_rewrite::replaceAtenMaxPool2dWithIpexMaxPool2d(graph);
