From 8e333904f82b3ad9b54d9cf81c6a7bc4b960bd4f Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 16 Sep 2025 15:17:48 +0800 Subject: [PATCH 1/3] fuse all the rhs constants into ops --- include/NeuraDialect/NeuraOps.td | 15 +- include/NeuraDialect/NeuraPasses.h | 14 +- include/NeuraDialect/NeuraPasses.td | 16 +- .../ArithToNeura/ArithToNeuraPass.cpp | 20 +- .../LlvmToNeura/LlvmToNeuraPass.cpp | 9 +- lib/NeuraDialect/Transforms/CMakeLists.txt | 7 +- .../Transforms/CanonicalizeCastPass.cpp | 3 + .../Transforms/CanonicalizeLiveInPass.cpp | 2 +- .../Transforms/FoldConstantPass.cpp | 91 ---- .../Transforms/InsertCtrlMovPass.cpp | 2 +- .../Transforms/InsertDataMovPass.cpp | 2 +- .../LeveragePredicatedValuePass.cpp | 2 +- .../Transforms/MapToAcceleratorPass.cpp | 2 +- .../Transforms/Optimizations/CMakeLists.txt | 19 + .../HWAgnosticOpt/CMakeLists.txt | 18 + .../HWAgnosticOpt/FoldConstantPass.cpp | 252 +++++++++++ .../HWSpecificOpt/CMakeLists.txt | 18 + .../HWSpecificOpt/FuseLoopControlPass.cpp} | 16 +- .../Transforms/PromoteFuncArgToConstPass.cpp | 97 +++++ .../TransformCtrlToDataFlowPass.cpp | 2 +- .../simple_loop/simple_loop.mlir | 4 +- .../simple_loop_reduction.mlir | 4 +- test/mapping_quality/tiny_loop.mlir | 4 +- test/neura/ctrl/branch_for.mlir | 390 ++++++++---------- .../interpreter/basic_operation/add.mlir | 20 +- .../interpreter/basic_operation/fadd.mlir | 39 +- .../interpreter/basic_operation/sub.mlir | 35 +- tools/neura-interpreter/neura-interpreter.cpp | 42 +- 28 files changed, 706 insertions(+), 439 deletions(-) delete mode 100644 lib/NeuraDialect/Transforms/FoldConstantPass.cpp create mode 100644 lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt create mode 100644 lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt create mode 100644 lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp create mode 100644 lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt rename lib/NeuraDialect/Transforms/{FuseControlFlowPass.cpp => Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp} (97%) create mode 100644 lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 227d4382..88513411 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -18,7 +18,7 @@ def Neura_ConstantOp : Op { def Neura_AddOp : Op { let summary = "Integer addition operation"; let opName = "add"; - let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let arguments = (ins AnyType:$lhs, Optional:$rhs); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; @@ -26,7 +26,7 @@ def Neura_AddOp : Op { def Neura_SubOp : Op { let summary = "Integer substraction operation"; - let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let arguments = (ins AnyType:$lhs, Optional:$rhs); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; @@ -35,7 +35,7 @@ def Neura_SubOp : Op { def Neura_MulOp : Op { let summary = "Integer multiplication operation"; let opName = "mul"; - let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let arguments = (ins AnyType:$lhs, Optional:$rhs); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $predicate 
attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; @@ -53,10 +53,10 @@ def Neura_DivOp : Op { def Neura_FAddOp : Op { let summary = "Floating addition operation"; let opName = "fadd"; - let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let arguments = (ins AnyType:$lhs, Optional:$rhs); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; - // let traits = [SameOperandsAndResultElementType]; + let traits = [SameOperandsAndResultElementType]; } // Defines a floating-point substraction operation. @@ -99,7 +99,7 @@ def Neura_OrOp : Op { def Neura_ICmpOp : Op { let summary = "Integer compare operation"; let opName = "icmp"; - let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate, + let arguments = (ins AnyType:$lhs, Optional:$rhs, StrAttr:$cmpType); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $cmpTypeAttr `,` $cmp_type attr-dict `:` type($result)"; @@ -111,8 +111,7 @@ def Neura_FCmpOp : Op { let summary = "Floating-point compare operation"; let opName = "fcmp"; let arguments = (ins AnyType:$lhs, - AnyType:$rhs, - Optional:$predicate, + Optional:$rhs, StrAttr:$cmpType); let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $cmpType attr-dict `:` type($result)"; diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index c8a8001c..5a220d12 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -20,16 +20,24 @@ void registerNeuraConversionPassPipeline(); #include "NeuraDialect/NeuraPasses.h.inc" std::unique_ptr createInsertDataMovPass(); std::unique_ptr createInsertCtrlMovPass(); -std::unique_ptr createFusePatternPass(); std::unique_ptr createAssignAcceleratorPass(); std::unique_ptr createTransformCtrlToDataFlowPass(); std::unique_ptr createLeveragePredicatedValuePass(); std::unique_ptr createMapToAcceleratorPass(); std::unique_ptr createGenerateCodePass(); -std::unique_ptr createFuseControlFlowPass(); std::unique_ptr createCanonicalizeLiveInPass(); -std::unique_ptr createCanonicalizeCastPass(); +std::unique_ptr createPromoteFuncArgToConstPass(); + +// ==================================== +// Optimization Passes +// ==================================== +// Hardware specific optimization passes +std::unique_ptr createFuseLoopControlPass(); +std::unique_ptr createFusePatternPass(); + +// Hardware agnostic optimization passes std::unique_ptr createFoldConstantPass(); +std::unique_ptr createCanonicalizeCastPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index 31e0356e..c96e0db1 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -67,12 +67,12 @@ def GenerateCode : Pass<"generate-code", "ModuleOp"> { let constructor = "neura::createGenerateCodePass()"; } -def FuseControlFlow: Pass<"fuse-control-flow", "ModuleOp">{ - let summary = "Fuses control flow operations in the Neura dialect"; +def FuseLoopControl: Pass<"fuse-loop-control", "ModuleOp">{ + let summary = "Fuses loop control operations in the Neura dialect"; let description = [{ - This pass fuses control flow operations. + This pass fuses loop control operations. 
}]; - let constructor = "neura::createFuseControlFlowPass()"; + let constructor = "neura::createFuseLoopControlPass()"; } def CanonicalizeLiveIn : Pass<"canonicalize-live-in", "ModuleOp"> { @@ -86,6 +86,14 @@ def CanonicalizeLiveIn : Pass<"canonicalize-live-in", "ModuleOp"> { let constructor = "neura::createCanonicalizeLiveInPass()"; } +def PromoteFuncArgToConst : Pass<"promote-func-arg-to-const", "ModuleOp"> { + let summary = "Promotes function arguments to neura constant operations"; + let description = [{ + This pass promotes function arguments to neura constant operations. + }]; + let constructor = "neura::createPromoteFuncArgToConstPass()"; +} + def CanonicalizeCast : Pass<"canonicalize-cast", "ModuleOp"> { let summary = "Canonicalizes cast operations in the Neura dialect"; let description = [{ diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index 098c3e02..03ab2517 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -53,8 +53,7 @@ struct ArithAddIToNeuraAdd : public OpRewritePattern { Type result_type = op.getType(); // Optional predicate: default to null. - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - nullptr); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); return success(); } }; @@ -69,8 +68,7 @@ struct ArithFAddToNeuraFAdd : public OpRewritePattern { Type result_type = op.getType(); // Optional predicate: default to null. - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - nullptr); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); return success(); } }; @@ -85,8 +83,7 @@ struct ArithSubIToNeuraSub : public OpRewritePattern { Type result_type = op.getType(); // Optional predicate: default to null. - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - nullptr); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); return success(); } }; @@ -117,8 +114,7 @@ struct ArithMulIToNeuraMul : public OpRewritePattern { Type result_type = op.getType(); // Optional predicate: default to null. - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - nullptr); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); return success(); } }; @@ -183,10 +179,8 @@ struct ArithRemSIToNeuraOp : public OpRewritePattern { // Optional predicate: default to null. Value div = rewriter.create(loc, result_type, lhs, rhs, nullptr); - Value mul = - rewriter.create(loc, result_type, rhs, div, nullptr); - Value rem = - rewriter.create(loc, result_type, lhs, mul, nullptr); + Value mul = rewriter.create(loc, result_type, rhs, div); + Value rem = rewriter.create(loc, result_type, lhs, mul); rewriter.replaceOp(op, rem); return success(); @@ -241,7 +235,7 @@ struct ArithCmpiToNeuraICmp : public OpRewritePattern { // Converts arith CmpIOp to Neura ICmpOp. // Optional predicate: default to null. 
rewriter.replaceOpWithNewOp( - op, result_type, lhs, rhs, nullptr, rewriter.getStringAttr(cmp_type)); + op, result_type, lhs, rhs, rewriter.getStringAttr(cmp_type)); return success(); } }; diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index 7255c05c..ec3c3eb2 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -34,7 +34,7 @@ struct LlvmAddToNeuraAdd : public OpRewritePattern { LogicalResult matchAndRewrite(mlir::LLVM::AddOp op, PatternRewriter &rewriter) const override { rewriter.replaceOpWithNewOp(op, op.getType(), op.getLhs(), - op.getRhs(), Value()); + op.getRhs()); return success(); } }; @@ -53,8 +53,7 @@ struct LlvmFAddToNeuraFAdd : public OpRewritePattern { return failure(); // Optional predicate: default to 'none' - rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs, - Value()); + rewriter.replaceOpWithNewOp(op, result_type, lhs, rhs); return success(); } }; @@ -141,7 +140,7 @@ struct LlvmICmpToNeuraICmp : public OpRewritePattern { auto resultType = op.getType(); rewriter.replaceOpWithNewOp( - op, resultType, lhs, rhs, Value(), + op, resultType, lhs, rhs, rewriter.getStringAttr(LLVM::stringifyICmpPredicate(pred))); return success(); } @@ -158,7 +157,7 @@ struct LlvmFCmpToNeuraFCmp : public OpRewritePattern { auto resultType = op.getType(); rewriter.replaceOpWithNewOp( - op, resultType, lhs, rhs, Value(), + op, resultType, lhs, rhs, rewriter.getStringAttr(LLVM::stringifyFCmpPredicate(pred))); return success(); } diff --git a/lib/NeuraDialect/Transforms/CMakeLists.txt b/lib/NeuraDialect/Transforms/CMakeLists.txt index d5073678..4913cc8a 100644 --- a/lib/NeuraDialect/Transforms/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/CMakeLists.txt @@ -10,10 +10,9 @@ add_mlir_library( LeveragePredicatedValuePass.cpp MapToAcceleratorPass.cpp GenerateCodePass.cpp - FuseControlFlowPass.cpp CanonicalizeLiveInPass.cpp CanonicalizeCastPass.cpp - FoldConstantPass.cpp + PromoteFuncArgToConstPass.cpp DEPENDS MLIRNeuraTransformsIncGen @@ -26,4 +25,6 @@ add_mlir_library( MLIRNeura ${dialect_libs} LLVMSupport -) \ No newline at end of file +) + +add_subdirectory(Optimizations) \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp index 3451acf2..d16b542c 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeCastPass.cpp @@ -11,6 +11,9 @@ using namespace mlir; +#define GEN_PASS_DEF_CANONICALIZECAST +#include "NeuraDialect/NeuraPasses.h.inc" + namespace { LogicalResult canonicalizeCast(Region ®ion) { diff --git a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp index 00ba5599..5b2bda27 100644 --- a/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp +++ b/lib/NeuraDialect/Transforms/CanonicalizeLiveInPass.cpp @@ -13,7 +13,7 @@ using namespace mlir; -#define GEN_PASS_DEF_NEURACANONICALIZE +#define GEN_PASS_DEF_CANONICALIZELIVEIN #include "NeuraDialect/NeuraPasses.h.inc" namespace { diff --git a/lib/NeuraDialect/Transforms/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/FoldConstantPass.cpp deleted file mode 100644 index 9e410ed4..00000000 --- a/lib/NeuraDialect/Transforms/FoldConstantPass.cpp +++ /dev/null @@ -1,91 +0,0 @@ -#include "NeuraDialect/NeuraOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Pass/Pass.h" 
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h" - -using namespace mlir; - -#define GEN_PASS_DEF_FOLDCONSTANT -#include "NeuraDialect/NeuraPasses.h.inc" - -namespace { -struct FuseConstantAndGrantPattern - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(neura::ConstantOp constant_op, - PatternRewriter &rewriter) const override { - bool made_change = false; - - // Checks if the constant operation is used by a grant_once or grant_always - // operation. - for (auto user : constant_op->getUsers()) { - llvm::errs() << "Checking use: " << *user << "\n"; - if (isa(user) || isa(user)) { - if (neura::GrantOnceOp grant_once_op = - dyn_cast(user)) { - auto new_grant_once_op = rewriter.create( - grant_once_op.getLoc(), grant_once_op.getResult().getType(), - /*value=*/nullptr, constant_op->getAttr("value")); - // Replaces the original constant operation with the new one. - rewriter.replaceOp(grant_once_op, new_grant_once_op); - made_change = true; - } else if (neura::GrantAlwaysOp grant_always_op = - dyn_cast(user)) { - auto new_grant_always_op = rewriter.create( - grant_always_op.getLoc(), grant_always_op.getResult().getType(), - /*value=*/nullptr, constant_op->getAttr("value")); - // Replaces the original constant operation with the new one. - rewriter.replaceOp(grant_always_op, new_grant_always_op); - made_change = true; - } - } - } - - if (constant_op->use_empty()) { - // If the constant operation has no users, it can be removed. - rewriter.eraseOp(constant_op); - made_change = true; - } - - return success(made_change); - } -}; - -struct FoldConstantPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FoldConstantPass) - - StringRef getArgument() const override { return "fold-constant"; } - StringRef getDescription() const override { - return "Fold constant operations."; - } - - void runOnOperation() override { - ModuleOp module_op = getOperation(); - RewritePatternSet patterns(&getContext()); - patterns.add(&getContext()); - FrozenRewritePatternSet frozen(std::move(patterns)); - - // Applies to every region inside the module (regardless of func type, - // e.g., mlir func or llvm func). 
- module_op.walk([&](Operation *op) { - if (!op->getRegions().empty()) { - for (Region ®ion : op->getRegions()) { - if (failed(applyPatternsGreedily(region, frozen))) { - signalPassFailure(); - } - } - } - }); - } -}; - -} // namespace - -namespace mlir::neura { -std::unique_ptr createFoldConstantPass() { - return std::make_unique(); -} -} // namespace mlir::neura diff --git a/lib/NeuraDialect/Transforms/InsertCtrlMovPass.cpp b/lib/NeuraDialect/Transforms/InsertCtrlMovPass.cpp index 8b9e6601..1b741092 100644 --- a/lib/NeuraDialect/Transforms/InsertCtrlMovPass.cpp +++ b/lib/NeuraDialect/Transforms/InsertCtrlMovPass.cpp @@ -9,7 +9,7 @@ using namespace mlir; -#define GEN_PASS_DEF_InsertCtrlMov +#define GEN_PASS_DEF_INSERTCTRLMOV #include "NeuraDialect/NeuraPasses.h.inc" namespace { diff --git a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp index 70533680..e9fe627f 100644 --- a/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp +++ b/lib/NeuraDialect/Transforms/InsertDataMovPass.cpp @@ -9,7 +9,7 @@ using namespace mlir; -#define GEN_PASS_DEF_InsertDataMov +#define GEN_PASS_DEF_INSERTDATAMOV #include "NeuraDialect/NeuraPasses.h.inc" namespace { diff --git a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp index 58e63ffc..e78dfd6c 100644 --- a/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp +++ b/lib/NeuraDialect/Transforms/LeveragePredicatedValuePass.cpp @@ -10,7 +10,7 @@ using namespace mlir; -#define GEN_PASS_DEF_LeveragePredicatedValue +#define GEN_PASS_DEF_LEVERAGEPREDICATEDVALUE #include "NeuraDialect/NeuraPasses.h.inc" namespace { diff --git a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp index d6b848d6..eda7f7ff 100644 --- a/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp +++ b/lib/NeuraDialect/Transforms/MapToAcceleratorPass.cpp @@ -20,7 +20,7 @@ using namespace mlir; using namespace mlir::neura; -#define GEN_PASS_DEF_MapToAccelerator +#define GEN_PASS_DEF_MAPTOACCELERATOR #include "NeuraDialect/NeuraPasses.h.inc" namespace { diff --git a/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt new file mode 100644 index 00000000..770b2b40 --- /dev/null +++ b/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt @@ -0,0 +1,19 @@ +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) + +add_subdirectory(HWAgnosticOpt) +add_subdirectory(HWSpecificOpt) + +add_library(MLIRNUERAOptimization INTERFACE) + +add_dependencies(MLIRNUERAOptimization MLIRNeuraTransformsIncGen) + +target_link_libraries(MLIRNUERAOptimization INTERFACE + MLIRIR + MLIRPass + MLIRSupport + MLIRTransforms + MLIRNeura + MLIRNeuraHWSpecificOpt + MLIRNeuraHWAgnosticOpt + ${dialect_libs} +) \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt new file mode 100644 index 00000000..e41dce26 --- /dev/null +++ b/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRNeuraHWAgnosticOpt + FoldConstantPass.cpp + + DEPENDS + MLIRNeuraTransformsIncGen + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRFuncDialect + MLIRLLVMDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRNeura + MLIRSupport +) \ No newline at end of file 
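Note on the rewrite implemented in the new FoldConstantPass below: fold-constant now fuses a right-hand-side neura.constant directly into the consuming op as an attribute, so the constant operand (and, once unused, the constant op itself) disappears. A minimal before/after sketch in Neura IR, mirroring the updated FileCheck expectations later in this patch (value names are illustrative):

    %c1  = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64
    %sum = "neura.add"(%i, %c1) : (i64, i64) -> i64
    // becomes, after --fold-constant:
    %sum = "neura.add"(%i) {rhs_const_value = 1 : i64} : (i64) -> i64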
diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp
new file mode 100644
index 00000000..1abe7718
--- /dev/null
+++ b/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp
@@ -0,0 +1,252 @@
+#include "NeuraDialect/NeuraOps.h"
+#include "NeuraDialect/NeuraTypes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/TypeID.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/LogicalResult.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
+using namespace mlir;
+
+#define GEN_PASS_DEF_FOLDCONSTANT
+#include "NeuraDialect/NeuraPasses.h.inc"
+
+namespace {
+// =========================================
+// FuseConstantAndGrantPattern
+// Valid only after transform-ctrl-to-data-flow pass.
+// =========================================
+struct FuseConstantAndGrantPattern
+    : public OpRewritePattern<neura::ConstantOp> {
+  using OpRewritePattern<neura::ConstantOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(neura::ConstantOp constant_op,
+                                PatternRewriter &rewriter) const override {
+    bool made_change = false;
+
+    // Checks if the constant operation is used by a grant_once or grant_always
+    // operation.
+    for (auto user : constant_op->getUsers()) {
+      llvm::errs() << "Checking use: " << *user << "\n";
+      if (isa<neura::GrantOnceOp>(user) || isa<neura::GrantAlwaysOp>(user)) {
+        if (neura::GrantOnceOp grant_once_op =
+                dyn_cast<neura::GrantOnceOp>(user)) {
+          auto new_grant_once_op = rewriter.create<neura::GrantOnceOp>(
+              grant_once_op.getLoc(), grant_once_op.getResult().getType(),
+              /*value=*/nullptr, constant_op->getAttr("value"));
+          // Replaces the original constant operation with the new one.
+          rewriter.replaceOp(grant_once_op, new_grant_once_op);
+          made_change = true;
+        } else if (neura::GrantAlwaysOp grant_always_op =
+                       dyn_cast<neura::GrantAlwaysOp>(user)) {
+          auto new_grant_always_op = rewriter.create<neura::GrantAlwaysOp>(
+              grant_always_op.getLoc(), grant_always_op.getResult().getType(),
+              /*value=*/nullptr, constant_op->getAttr("value"));
+          // Replaces the original constant operation with the new one.
+          rewriter.replaceOp(grant_always_op, new_grant_always_op);
+          made_change = true;
+        }
+      }
+    }
+
+    if (constant_op->use_empty()) {
+      // If the constant operation has no users, it can be removed.
+      rewriter.eraseOp(constant_op);
+      made_change = true;
+    }
+
+    return success(made_change);
+  }
+};
+
+// =========================================
+// FoldConstantPass
+// Valid before transform-ctrl-to-data-flow pass.
+// =========================================
+bool isOriginConstantOp(Value value) {
+  Operation *def_op = value.getDefiningOp();
+  if (!def_op || !isa<neura::ConstantOp>(def_op)) {
+    return false;
+  }
+
+  // Checks if the result type is the original type or the predicated type.
+  Type result_type = value.getType();
+  if (isa(result_type)) {
+    return false;
+  }
+
+  return true;
+}
+
+Attribute getOriginConstantValue(Value value) {
+  neura::ConstantOp constant_op =
+      dyn_cast<neura::ConstantOp>(value.getDefiningOp());
+  return constant_op->getAttr("value");
+}
+
+void addConstantAttribute(Operation *op, StringRef attr_name,
+                          Attribute const_value) {
+  op->setAttr(attr_name, const_value);
+}
+
+// A template pattern to fuse binary operations with a constant on the
+// right-hand side.
+template <typename OpType>
+struct FuseRhsConstantPattern : public OpRewritePattern<OpType> {
+  using OpRewritePattern<OpType>::OpRewritePattern;
+
+  virtual Operation *
+  createOpWithFusedRhsConstant(OpType op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const = 0;
+
+  LogicalResult matchAndRewrite(OpType op,
+                                PatternRewriter &rewriter) const override {
+    Value lhs = op.getLhs();
+    Value rhs = op.getRhs();
+
+    if (isOriginConstantOp(lhs)) {
+      assert(false && "LHS constant folding not implemented yet.");
+      return failure();
+    }
+
+    if (!rhs || !isOriginConstantOp(rhs)) {
+      return failure();
+    }
+
+    auto constant_op = dyn_cast<neura::ConstantOp>(rhs.getDefiningOp());
+
+    Attribute rhs_const_value = getOriginConstantValue(rhs);
+    Operation *fused_op =
+        createOpWithFusedRhsConstant(op, rhs_const_value, rewriter);
+
+    rewriter.replaceOp(op, fused_op->getResults());
+    if (constant_op->use_empty()) {
+      rewriter.eraseOp(constant_op);
+    }
+    return success();
+  }
+};
+
+struct FuseAddRhsConstantPattern
+    : public FuseRhsConstantPattern<neura::AddOp> {
+  using FuseRhsConstantPattern<neura::AddOp>::FuseRhsConstantPattern;
+
+  Operation *
+  createOpWithFusedRhsConstant(neura::AddOp op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const override {
+    auto fused_op = rewriter.create<neura::AddOp>(
+        op.getLoc(), op.getResult().getType(), op.getLhs(),
+        /*rhs=*/nullptr);
+    addConstantAttribute(fused_op, "rhs_const_value", rhs_const_value);
+    return fused_op;
+  }
+};
+
+struct FuseSubRhsConstantPattern
+    : public FuseRhsConstantPattern<neura::SubOp> {
+  using FuseRhsConstantPattern<neura::SubOp>::FuseRhsConstantPattern;
+
+  Operation *
+  createOpWithFusedRhsConstant(neura::SubOp op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const override {
+    auto fused_op = rewriter.create<neura::SubOp>(
+        op.getLoc(), op.getResult().getType(), op.getLhs(),
+        /*rhs=*/nullptr);
+    addConstantAttribute(fused_op, "rhs_const_value", rhs_const_value);
+    return fused_op;
+  }
+};
+
+struct FuseMulRhsConstantPattern
+    : public FuseRhsConstantPattern<neura::MulOp> {
+  using FuseRhsConstantPattern<neura::MulOp>::FuseRhsConstantPattern;
+
+  Operation *
+  createOpWithFusedRhsConstant(neura::MulOp op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const override {
+    auto fused_op = rewriter.create<neura::MulOp>(
+        op.getLoc(), op.getResult().getType(), op.getLhs(),
+        /*rhs=*/nullptr);
+    addConstantAttribute(fused_op, "rhs_const_value", rhs_const_value);
+    return fused_op;
+  }
+};
+
+struct FuseICmpRhsConstantPattern
+    : public FuseRhsConstantPattern<neura::ICmpOp> {
+  using FuseRhsConstantPattern<neura::ICmpOp>::FuseRhsConstantPattern;
+
+  Operation *
+  createOpWithFusedRhsConstant(neura::ICmpOp op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const override {
+    auto fused_op = rewriter.create<neura::ICmpOp>(
+        op.getLoc(), op.getResult().getType(), op.getLhs(),
+        /*rhs=*/nullptr, op.getCmpType());
+    addConstantAttribute(fused_op, "rhs_const_value", rhs_const_value);
+    return fused_op;
+  }
+};
+
+struct FuseFAddRhsConstantPattern
+    : public FuseRhsConstantPattern<neura::FAddOp> {
+  using FuseRhsConstantPattern<neura::FAddOp>::FuseRhsConstantPattern;
+
+  Operation *
+  createOpWithFusedRhsConstant(neura::FAddOp op, Attribute rhs_const_value,
+                               PatternRewriter &rewriter) const override {
+    auto fused_op = rewriter.create<neura::FAddOp>(
+        op.getLoc(), op.getResult().getType(), op.getLhs(),
+        /*rhs=*/nullptr);
+    addConstantAttribute(fused_op, "rhs_const_value", rhs_const_value);
+    return fused_op;
+  }
+};
+
+// =========================================
+// FoldConstantPass Implementation
+// =========================================
+struct FoldConstantPass
+    : public PassWrapper<FoldConstantPass, OperationPass<ModuleOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FoldConstantPass)
+
+ 
StringRef getArgument() const override { return "fold-constant"; } + StringRef getDescription() const override { + return "Fold constant operations."; + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + RewritePatternSet patterns(&getContext()); + + patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); + patterns.add(&getContext()); + + patterns.add(&getContext()); + FrozenRewritePatternSet frozen(std::move(patterns)); + + // Applies to every region inside the module (regardless of func type, + // e.g., mlir func or llvm func). + module_op.walk([&](Operation *op) { + if (!op->getRegions().empty()) { + for (Region ®ion : op->getRegions()) { + if (failed(applyPatternsGreedily(region, frozen))) { + signalPassFailure(); + } + } + } + }); + } +}; + +} // namespace + +namespace mlir::neura { +std::unique_ptr createFoldConstantPass() { + return std::make_unique(); +} +} // namespace mlir::neura diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt new file mode 100644 index 00000000..2a2e52aa --- /dev/null +++ b/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt @@ -0,0 +1,18 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRNeuraHWSpecificOpt + FuseLoopControlPass.cpp + + DEPENDS + MLIRNeuraTransformsIncGen + + LINK_LIBS PUBLIC + MLIRArithDialect + MLIRFuncDialect + MLIRLLVMDialect + MLIRIR + MLIRPass + MLIRTransforms + MLIRNeura + MLIRSupport +) \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/FuseControlFlowPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp similarity index 97% rename from lib/NeuraDialect/Transforms/FuseControlFlowPass.cpp rename to lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp index d6a2a4b9..43a73bf0 100644 --- a/lib/NeuraDialect/Transforms/FuseControlFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp @@ -18,7 +18,7 @@ using namespace mlir; -#define GEN_PASS_DEF_FUSECONTROLFLOW +#define GEN_PASS_DEF_FUSELOOPCONTROL #include "NeuraDialect/NeuraPasses.h.inc" namespace { @@ -565,13 +565,13 @@ struct FuseLoopControlFlowPattern : public OpRewritePattern { } }; -struct FuseControlFlowPass - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseControlFlowPass) +struct FuseLoopControlPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseLoopControlPass) - StringRef getArgument() const override { return "fuse-control-flow"; } + StringRef getArgument() const override { return "fuse-loop-control"; } StringRef getDescription() const override { - return "Fuses control flow operations into optimized neura dialect " + return "Fuses loop control operations into optimized neura dialect " "operations"; } @@ -589,7 +589,7 @@ struct FuseControlFlowPass } // namespace namespace mlir::neura { -std::unique_ptr createFuseControlFlowPass() { - return std::make_unique(); +std::unique_ptr createFuseLoopControlPass() { + return std::make_unique(); } } // namespace mlir::neura \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp b/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp new file mode 100644 index 00000000..aced6fc4 --- /dev/null +++ b/lib/NeuraDialect/Transforms/PromoteFuncArgToConstPass.cpp @@ -0,0 +1,97 @@ +#include 
"NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "llvm/ADT/SetVector.h" +#include +#include + +using namespace mlir; + +#define GEN_PASS_DEF_PROMOTEFUNCARGTOCONST +#include "NeuraDialect/NeuraPasses.h.inc" + +namespace { +LogicalResult promoteFunctionArgsToConstants(Region ®ion) { + if (region.empty()) { + return success(); + } + + Block &entry_block = region.front(); + OpBuilder builder(&entry_block, entry_block.begin()); + + // Collects all function arguments. + SmallVector args(entry_block.getArguments().begin(), + entry_block.getArguments().end()); + + // Creates a constant operation for each function argument. + for (auto [idx, arg] : llvm::enumerate(args)) { + // For constant operation, the default predicate is true. + auto const_op = builder.create( + arg.getLoc(), arg.getType(), + builder.getStringAttr("\%arg" + std::to_string(idx)), + builder.getBoolAttr(true)); + arg.replaceAllUsesWith(const_op.getResult()); + } + + return success(); +} + +struct PromoteFuncArgToConstPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PromoteFuncArgToConstPass) + + StringRef getArgument() const override { return "promote-func-arg-to-const"; } + StringRef getDescription() const override { + return "Promotes function arguments to constants."; + } + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + } + + void runOnOperation() override { + ModuleOp module_op = getOperation(); + module_op.walk([&](Operation *op) { + Region *region = nullptr; + if (auto func_op = dyn_cast(op)) { + auto accel_attr = func_op->getAttrOfType("accelerator"); + if (!accel_attr || accel_attr.getValue() != "neura") { + return; + } + region = &func_op.getBody(); + } else if (auto llvm_func = dyn_cast(op)) { + auto accel_attr = llvm_func->getAttrOfType("accelerator"); + if (!accel_attr || accel_attr.getValue() != "neura") { + return; + } + region = &llvm_func.getBody(); + } else { + return; + } + + if (!region || region->empty()) { + return; + } + + if (failed(promoteFunctionArgsToConstants(*region))) { + signalPassFailure(); + return; + } + }); + } +}; +} // namespace + +namespace mlir::neura { +std::unique_ptr createPromoteFuncArgToConstPass() { + return std::make_unique(); +} +} // namespace mlir::neura \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp index 22d9e1d7..9d65601b 100644 --- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp +++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp @@ -19,7 +19,7 @@ using namespace mlir; -#define GEN_PASS_DEF_TransformCtrlToDataFlow +#define GEN_PASS_DEF_TRANSFORMCTRLTODATAFLOW #include "NeuraDialect/NeuraPasses.h.inc" // Inserts `grant_once` for every predicated value defined in the entry block diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir index 427d0c53..7a686758 100644 --- a/test/controflow_fuse/simple_loop/simple_loop.mlir +++ b/test/controflow_fuse/simple_loop/simple_loop.mlir @@ -41,7 +41,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: 
--transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: | FileCheck %s -check-prefix=FUSE @@ -55,7 +55,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" | FileCheck %s -check-prefix=FUSE-MAPPING diff --git a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir index eff1a5a8..c7a11ee2 100644 --- a/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir +++ b/test/controflow_fuse/simple_loop_reduction/simple_loop_reduction.mlir @@ -42,7 +42,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: | FileCheck %s -check-prefix=FUSE @@ -56,7 +56,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=4,5" | FileCheck %s -check-prefix=FUSE-MAPPING diff --git a/test/mapping_quality/tiny_loop.mlir b/test/mapping_quality/tiny_loop.mlir index 7c5ec62d..92c23f48 100644 --- a/test/mapping_quality/tiny_loop.mlir +++ b/test/mapping_quality/tiny_loop.mlir @@ -19,7 +19,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-only backtrack-config=customized=4,3" \ @@ -35,7 +35,7 @@ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ -// RUN: --fuse-control-flow \ +// RUN: --fuse-loop-control \ // RUN: --fold-constant \ // RUN: --insert-data-mov \ // RUN: --map-to-accelerator="mapping-strategy=heuristic mapping-mode=spatial-temporal backtrack-config=customized=3,4" \ diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index 8740fe10..f478793f 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -7,12 +7,14 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: | FileCheck %s -check-prefix=CANONICALIZE // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ @@ -21,6 +23,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ @@ -30,6 +33,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: 
--transform-ctrl-to-data-flow \ @@ -40,6 +44,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ @@ -51,6 +56,7 @@ // RUN: mlir-neura-opt %s \ // RUN: --assign-accelerator \ // RUN: --lower-llvm-to-neura \ +// RUN: --fold-constant \ // RUN: --canonicalize-live-in \ // RUN: --leverage-predicated-value \ // RUN: --transform-ctrl-to-data-flow \ @@ -96,251 +102,181 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () // CHECK-NEXT: } -// CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// CANONICALIZE-NEXT: %0 = "neura.constant"() <{predicate = true, value = 10 : i64}> : () -> i64 -// CANONICALIZE-NEXT: %1 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64 -// CANONICALIZE-NEXT: %2 = "neura.constant"() <{predicate = true, value = 1 : i64}> : () -> i64 -// CANONICALIZE-NEXT: %3 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> f32 -// CANONICALIZE-NEXT: %4 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> f32 -// CANONICALIZE-NEXT: neura.br %1, %4, %3, %2, %0 : i64, f32, f32, i64, i64 to ^bb1 -// CANONICALIZE-NEXT: ^bb1(%5: i64, %6: f32, %7: f32, %8: i64, %9: i64): // 2 preds: ^bb0, ^bb1 -// CANONICALIZE-NEXT: %10 = "neura.fadd"(%6, %7) : (f32, f32) -> f32 -// CANONICALIZE-NEXT: %11 = "neura.add"(%5, %8) : (i64, i64) -> i64 -// CANONICALIZE-NEXT: %12 = "neura.icmp"(%11, %9) <{cmpType = "slt"}> : (i64, i64) -> i1 -// CANONICALIZE-NEXT: neura.cond_br %12 : i1 then %11, %10, %7, %8, %9 : i64, f32, f32, i64, i64 to ^bb1 else %10 : f32 to ^bb2 -// CANONICALIZE-NEXT: ^bb2(%13: f32): // pred: ^bb1 -// CANONICALIZE-NEXT: "neura.return"(%13) : (f32) -> () -// CANONICALIZE-NEXT: } +// CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { +// CANONICALIZE-NEXT: %0 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64 +// CANONICALIZE-NEXT: %1 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> f32 +// CANONICALIZE-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> f32 +// CANONICALIZE-NEXT: neura.br %0, %2, %1 : i64, f32, f32 to ^bb1 +// CANONICALIZE-NEXT: ^bb1(%3: i64, %4: f32, %5: f32): // 2 preds: ^bb0, ^bb1 +// CANONICALIZE-NEXT: %6 = "neura.fadd"(%4, %5) : (f32, f32) -> f32 +// CANONICALIZE-NEXT: %7 = "neura.add"(%3) {rhs_const_value = 1 : i64} : (i64) -> i64 +// CANONICALIZE-NEXT: %8 = "neura.icmp"(%7) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (i64) -> i1 +// CANONICALIZE-NEXT: neura.cond_br %8 : i1 then %7, %6, %5 : i64, f32, f32 to ^bb1 else %6 : f32 to ^bb2 +// CANONICALIZE-NEXT: ^bb2(%9: f32): // pred: ^bb1 +// CANONICALIZE-NEXT: "neura.return"(%9) : (f32) -> () +// CANONICALIZE-NEXT: } // CTRL2DATA: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 10 : i64}> : () -> !neura.data +// CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data // CTRL2DATA-NEXT: %1 = "neura.grant_once"(%0) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data -// CTRL2DATA-NEXT: %3 = "neura.grant_once"(%2) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %4 = 
"neura.constant"() <{predicate = true, value = 1 : i64}> : () -> !neura.data -// CTRL2DATA-NEXT: %5 = "neura.grant_once"(%4) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %6 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> !neura.data -// CTRL2DATA-NEXT: %7 = "neura.grant_once"(%6) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %8 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> !neura.data -// CTRL2DATA-NEXT: %9 = "neura.grant_once"(%8) : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %2 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> !neura.data +// CTRL2DATA-NEXT: %3 = "neura.grant_once"(%2) : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %4 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> !neura.data +// CTRL2DATA-NEXT: %5 = "neura.grant_once"(%4) : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %6 = neura.reserve : !neura.data +// CTRL2DATA-NEXT: %7 = "neura.phi"(%6, %3) : (!neura.data, !neura.data) -> !neura.data +// CTRL2DATA-NEXT: %8 = neura.reserve : !neura.data +// CTRL2DATA-NEXT: %9 = "neura.phi"(%8, %5) : (!neura.data, !neura.data) -> !neura.data // CTRL2DATA-NEXT: %10 = neura.reserve : !neura.data // CTRL2DATA-NEXT: %11 = "neura.phi"(%10, %1) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %12 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %13 = "neura.phi"(%12, %5) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %14 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %15 = "neura.phi"(%14, %7) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %16 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %17 = "neura.phi"(%16, %9) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %18 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %19 = "neura.phi"(%18, %3) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %20 = "neura.fadd"(%17, %15) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %21 = "neura.add"(%19, %13) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %22 = "neura.icmp"(%21, %11) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %23 = neura.grant_predicate %21, %22 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %23 -> %18 : !neura.data !neura.data -// CTRL2DATA-NEXT: %24 = neura.grant_predicate %20, %22 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %24 -> %16 : !neura.data !neura.data -// CTRL2DATA-NEXT: %25 = neura.grant_predicate %15, %22 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %25 -> %14 : !neura.data !neura.data -// CTRL2DATA-NEXT: %26 = neura.grant_predicate %13, %22 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %26 -> %12 : !neura.data !neura.data -// CTRL2DATA-NEXT: %27 = neura.grant_predicate %11, %22 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %27 -> %10 : !neura.data !neura.data -// CTRL2DATA-NEXT: %28 = "neura.not"(%22) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %29 = neura.grant_predicate %20, %28 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: "neura.return"(%29) : (!neura.data) -> () +// CTRL2DATA-NEXT: %12 = "neura.fadd"(%9, %7) : (!neura.data, !neura.data) -> !neura.data +// CTRL2DATA-NEXT: %13 = "neura.add"(%11) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %14 = 
"neura.icmp"(%13) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %15 = neura.grant_predicate %13, %14 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: neura.ctrl_mov %15 -> %10 : !neura.data !neura.data +// CTRL2DATA-NEXT: %16 = neura.grant_predicate %12, %14 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: neura.ctrl_mov %16 -> %8 : !neura.data !neura.data +// CTRL2DATA-NEXT: %17 = neura.grant_predicate %7, %14 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: neura.ctrl_mov %17 -> %6 : !neura.data !neura.data +// CTRL2DATA-NEXT: %18 = "neura.not"(%14) : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %19 = neura.grant_predicate %12, %18 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: "neura.return"(%19) : (!neura.data) -> () // CTRL2DATA-NEXT: } // FUSE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// FUSE-NEXT: %0 = "neura.grant_once"() <{constant_value = 10 : i64}> : () -> !neura.data -// FUSE-NEXT: %1 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data -// FUSE-NEXT: %2 = "neura.grant_once"() <{constant_value = 1 : i64}> : () -> !neura.data -// FUSE-NEXT: %3 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data -// FUSE-NEXT: %4 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data -// FUSE-NEXT: %5 = neura.reserve : !neura.data -// FUSE-NEXT: %6 = "neura.phi"(%5, %0) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data +// FUSE-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data +// FUSE-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data +// FUSE-NEXT: %3 = neura.reserve : !neura.data +// FUSE-NEXT: %4 = "neura.phi"(%3, %1) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %5 = neura.reserve : !neura.data +// FUSE-NEXT: %6 = "neura.phi"(%5, %2) : (!neura.data, !neura.data) -> !neura.data // FUSE-NEXT: %7 = neura.reserve : !neura.data -// FUSE-NEXT: %8 = "neura.phi"(%7, %2) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %9 = neura.reserve : !neura.data -// FUSE-NEXT: %10 = "neura.phi"(%9, %3) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %11 = neura.reserve : !neura.data -// FUSE-NEXT: %12 = "neura.phi"(%11, %4) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %13 = neura.reserve : !neura.data -// FUSE-NEXT: %14 = "neura.phi"(%13, %1) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %15 = "neura.fadd"(%12, %10) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %16 = "neura.add"(%14, %8) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %17 = "neura.icmp"(%16, %6) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %18 = neura.grant_predicate %16, %17 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %18 -> %13 : !neura.data !neura.data -// FUSE-NEXT: %19 = neura.grant_predicate %15, %17 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %19 -> %11 : !neura.data !neura.data -// FUSE-NEXT: %20 = neura.grant_predicate %10, %17 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %20 -> %9 : !neura.data !neura.data -// FUSE-NEXT: %21 = neura.grant_predicate %8, %17 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %21 -> %7 : !neura.data 
!neura.data -// FUSE-NEXT: %22 = neura.grant_predicate %6, %17 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %22 -> %5 : !neura.data !neura.data -// FUSE-NEXT: %23 = "neura.not"(%17) : (!neura.data) -> !neura.data -// FUSE-NEXT: %24 = neura.grant_predicate %15, %23 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: "neura.return"(%24) : (!neura.data) -> () +// FUSE-NEXT: %8 = "neura.phi"(%7, %0) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %9 = "neura.fadd"(%6, %4) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %10 = "neura.add"(%8) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// FUSE-NEXT: %11 = "neura.icmp"(%10) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// FUSE-NEXT: %12 = neura.grant_predicate %10, %11 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: neura.ctrl_mov %12 -> %7 : !neura.data !neura.data +// FUSE-NEXT: %13 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: neura.ctrl_mov %13 -> %5 : !neura.data !neura.data +// FUSE-NEXT: %14 = neura.grant_predicate %4, %11 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: neura.ctrl_mov %14 -> %3 : !neura.data !neura.data +// FUSE-NEXT: %15 = "neura.not"(%11) : (!neura.data) -> !neura.data +// FUSE-NEXT: %16 = neura.grant_predicate %9, %15 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: "neura.return"(%16) : (!neura.data) -> () // FUSE-NEXT: } // MOV: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// MOV-NEXT: %0 = "neura.grant_once"() <{constant_value = 10 : i64}> : () -> !neura.data -// MOV-NEXT: %1 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data -// MOV-NEXT: %2 = "neura.grant_once"() <{constant_value = 1 : i64}> : () -> !neura.data -// MOV-NEXT: %3 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data -// MOV-NEXT: %4 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data -// MOV-NEXT: %5 = neura.reserve : !neura.data -// MOV-NEXT: %6 = "neura.data_mov"(%0) : (!neura.data) -> !neura.data -// MOV-NEXT: %7 = "neura.phi"(%5, %6) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %8 = neura.reserve : !neura.data -// MOV-NEXT: %9 = "neura.data_mov"(%2) : (!neura.data) -> !neura.data -// MOV-NEXT: %10 = "neura.phi"(%8, %9) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %11 = neura.reserve : !neura.data -// MOV-NEXT: %12 = "neura.data_mov"(%3) : (!neura.data) -> !neura.data -// MOV-NEXT: %13 = "neura.phi"(%11, %12) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %14 = neura.reserve : !neura.data -// MOV-NEXT: %15 = "neura.data_mov"(%4) : (!neura.data) -> !neura.data -// MOV-NEXT: %16 = "neura.phi"(%14, %15) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %17 = neura.reserve : !neura.data -// MOV-NEXT: %18 = "neura.data_mov"(%1) : (!neura.data) -> !neura.data -// MOV-NEXT: %19 = "neura.phi"(%17, %18) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %20 = "neura.data_mov"(%16) : (!neura.data) -> !neura.data -// MOV-NEXT: %21 = "neura.data_mov"(%13) : (!neura.data) -> !neura.data -// MOV-NEXT: %22 = "neura.fadd"(%20, %21) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %23 = "neura.data_mov"(%19) : (!neura.data) -> !neura.data -// MOV-NEXT: %24 = "neura.data_mov"(%10) : (!neura.data) -> !neura.data -// MOV-NEXT: %25 = "neura.add"(%23, %24) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %26 = 
"neura.data_mov"(%25) : (!neura.data) -> !neura.data -// MOV-NEXT: %27 = "neura.data_mov"(%7) : (!neura.data) -> !neura.data -// MOV-NEXT: %28 = "neura.icmp"(%26, %27) <{cmpType = "slt"}> : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %29 = "neura.data_mov"(%25) : (!neura.data) -> !neura.data -// MOV-NEXT: %30 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %31 = neura.grant_predicate %29, %30 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %31 -> %17 : !neura.data !neura.data -// MOV-NEXT: %32 = "neura.data_mov"(%22) : (!neura.data) -> !neura.data -// MOV-NEXT: %33 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %34 = neura.grant_predicate %32, %33 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %34 -> %14 : !neura.data !neura.data -// MOV-NEXT: %35 = "neura.data_mov"(%13) : (!neura.data) -> !neura.data -// MOV-NEXT: %36 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %37 = neura.grant_predicate %35, %36 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %37 -> %11 : !neura.data !neura.data -// MOV-NEXT: %38 = "neura.data_mov"(%10) : (!neura.data) -> !neura.data -// MOV-NEXT: %39 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %40 = neura.grant_predicate %38, %39 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %40 -> %8 : !neura.data !neura.data -// MOV-NEXT: %41 = "neura.data_mov"(%7) : (!neura.data) -> !neura.data -// MOV-NEXT: %42 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %43 = neura.grant_predicate %41, %42 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %43 -> %5 : !neura.data !neura.data -// MOV-NEXT: %44 = "neura.data_mov"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %45 = "neura.not"(%44) : (!neura.data) -> !neura.data -// MOV-NEXT: %46 = "neura.data_mov"(%22) : (!neura.data) -> !neura.data -// MOV-NEXT: %47 = "neura.data_mov"(%45) : (!neura.data) -> !neura.data -// MOV-NEXT: %48 = neura.grant_predicate %46, %47 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: %49 = "neura.data_mov"(%48) : (!neura.data) -> !neura.data -// MOV-NEXT: "neura.return"(%49) : (!neura.data) -> () +// MOV-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data +// MOV-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data +// MOV-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data +// MOV-NEXT: %3 = neura.reserve : !neura.data +// MOV-NEXT: %4 = "neura.data_mov"(%1) : (!neura.data) -> !neura.data +// MOV-NEXT: %5 = "neura.phi"(%3, %4) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %6 = neura.reserve : !neura.data +// MOV-NEXT: %7 = "neura.data_mov"(%2) : (!neura.data) -> !neura.data +// MOV-NEXT: %8 = "neura.phi"(%6, %7) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %9 = neura.reserve : !neura.data +// MOV-NEXT: %10 = "neura.data_mov"(%0) : (!neura.data) -> !neura.data +// MOV-NEXT: %11 = "neura.phi"(%9, %10) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %12 = "neura.data_mov"(%8) : (!neura.data) -> !neura.data +// MOV-NEXT: %13 = "neura.data_mov"(%5) : (!neura.data) -> !neura.data +// MOV-NEXT: %14 = "neura.fadd"(%12, %13) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %15 = "neura.data_mov"(%11) : (!neura.data) -> !neura.data +// MOV-NEXT: %16 = "neura.add"(%15) {rhs_const_value = 1 : i64} : (!neura.data) -> 
!neura.data +// MOV-NEXT: %17 = "neura.data_mov"(%16) : (!neura.data) -> !neura.data +// MOV-NEXT: %18 = "neura.icmp"(%17) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// MOV-NEXT: %19 = "neura.data_mov"(%16) : (!neura.data) -> !neura.data +// MOV-NEXT: %20 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data +// MOV-NEXT: %21 = neura.grant_predicate %19, %20 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: neura.ctrl_mov %21 -> %9 : !neura.data !neura.data +// MOV-NEXT: %22 = "neura.data_mov"(%14) : (!neura.data) -> !neura.data +// MOV-NEXT: %23 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data +// MOV-NEXT: %24 = neura.grant_predicate %22, %23 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: neura.ctrl_mov %24 -> %6 : !neura.data !neura.data +// MOV-NEXT: %25 = "neura.data_mov"(%5) : (!neura.data) -> !neura.data +// MOV-NEXT: %26 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data +// MOV-NEXT: %27 = neura.grant_predicate %25, %26 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: neura.ctrl_mov %27 -> %3 : !neura.data !neura.data +// MOV-NEXT: %28 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data +// MOV-NEXT: %29 = "neura.not"(%28) : (!neura.data) -> !neura.data +// MOV-NEXT: %30 = "neura.data_mov"(%14) : (!neura.data) -> !neura.data +// MOV-NEXT: %31 = "neura.data_mov"(%29) : (!neura.data) -> !neura.data +// MOV-NEXT: %32 = neura.grant_predicate %30, %31 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: %33 = "neura.data_mov"(%32) : (!neura.data) -> !neura.data +// MOV-NEXT: "neura.return"(%33) : (!neura.data) -> () // MOV-NEXT: } -// MAPPING: func.func @loop_test() -> f32 attributes {accelerator = "neura", mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { -// MAPPING-NEXT: %0 = "neura.grant_once"() <{constant_value = 10 : i64}> {mapping_locs = [{id = 8 : i32, resource = "tile", time_step = 1 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %1 = "neura.grant_once"() <{constant_value = 0 : i64}> {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %2 = "neura.grant_once"() <{constant_value = 1 : i64}> {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 0 : i32, x = 1 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %3 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> {mapping_locs = [{id = 4 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 1 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %4 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> {mapping_locs = [{id = 8 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %5 = neura.reserve : !neura.data -// MAPPING-NEXT: %6 = "neura.data_mov"(%0) {mapping_locs = [{id = 24 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %7 = "neura.phi"(%5, %6) {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %8 = neura.reserve : !neura.data -// MAPPING-NEXT: %9 = "neura.data_mov"(%2) {mapping_locs = [{id = 36 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %10 = "neura.phi"(%8, %9) {mapping_locs = [{id = 9 : 
i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %11 = neura.reserve : !neura.data -// MAPPING-NEXT: %12 = "neura.data_mov"(%3) {mapping_locs = [{id = 10 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %13 = "neura.phi"(%11, %12) {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %14 = neura.reserve : !neura.data -// MAPPING-NEXT: %15 = "neura.data_mov"(%4) {mapping_locs = [{id = 32 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %16 = "neura.phi"(%14, %15) {mapping_locs = [{id = 8 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %17 = neura.reserve : !neura.data -// MAPPING-NEXT: %18 = "neura.data_mov"(%1) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %19 = "neura.phi"(%17, %18) {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %20 = "neura.data_mov"(%16) {mapping_locs = [{id = 24 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %21 = "neura.data_mov"(%13) {mapping_locs = [{id = 16 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %22 = "neura.fadd"(%20, %21) {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %23 = "neura.data_mov"(%19) {mapping_locs = [{id = 35 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %24 = "neura.data_mov"(%10) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %25 = "neura.add"(%23, %24) {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %26 = "neura.data_mov"(%25) {mapping_locs = [{id = 40 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %27 = "neura.data_mov"(%7) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %28 = "neura.icmp"(%26, %27) <{cmpType = "slt"}> {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %29 = "neura.data_mov"(%25) {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 2 : i32}, {id = 44 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %30 = "neura.data_mov"(%28) {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %31 = neura.grant_predicate %29, %30 {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %31 -> %17 {mapping_locs = [{id = 45 : i32, resource = "register", time_step = 4 : i32}, {id = 45 : i32, resource = "register", time_step = 5 : 
i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %32 = "neura.data_mov"(%22) {mapping_locs = [{id = 30 : i32, resource = "link", time_step = 4 : i32}, {id = 52 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %33 = "neura.data_mov"(%28) {mapping_locs = [{id = 34 : i32, resource = "link", time_step = 3 : i32}, {id = 43 : i32, resource = "link", time_step = 4 : i32}, {id = 53 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %34 = neura.grant_predicate %32, %33 {mapping_locs = [{id = 13 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 3 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %34 -> %14 {mapping_locs = [{id = 40 : i32, resource = "link", time_step = 6 : i32}, {id = 39 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %35 = "neura.data_mov"(%13) {mapping_locs = [{id = 20 : i32, resource = "register", time_step = 3 : i32}, {id = 20 : i32, resource = "register", time_step = 4 : i32}, {id = 20 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %36 = "neura.data_mov"(%28) {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 3 : i32}, {id = 29 : i32, resource = "link", time_step = 4 : i32}, {id = 21 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %37 = neura.grant_predicate %35, %36 {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %37 -> %11 {mapping_locs = [{id = 20 : i32, resource = "register", time_step = 6 : i32}, {id = 20 : i32, resource = "register", time_step = 7 : i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %38 = "neura.data_mov"(%10) {mapping_locs = [{id = 29 : i32, resource = "link", time_step = 1 : i32}, {id = 14 : i32, resource = "link", time_step = 2 : i32}, {id = 24 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %39 = "neura.data_mov"(%28) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %40 = neura.grant_predicate %38, %39 {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %40 -> %8 {mapping_locs = [{id = 17 : i32, resource = "link", time_step = 4 : i32}, {id = 16 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %41 = "neura.data_mov"(%7) {mapping_locs = [{id = 29 : i32, resource = "link", time_step = 2 : i32}, {id = 14 : i32, resource = "link", time_step = 3 : i32}, {id = 20 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %42 = "neura.data_mov"(%28) {mapping_locs = [{id = 41 : i32, resource = "register", time_step = 3 : i32}, {id = 41 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %43 = neura.grant_predicate %41, %42 {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %43 -> %5 {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 5 : i32}, {id = 36 : i32, resource = "register", time_step = 6 : i32}]} : 
!neura.data !neura.data -// MAPPING-NEXT: %44 = "neura.data_mov"(%28) {mapping_locs = [{id = 40 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %45 = "neura.not"(%44) {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %46 = "neura.data_mov"(%22) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 4 : i32}, {id = 34 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %47 = "neura.data_mov"(%45) {mapping_locs = [{id = 34 : i32, resource = "link", time_step = 4 : i32}, {id = 56 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %48 = neura.grant_predicate %46, %47 {mapping_locs = [{id = 14 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 3 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: %49 = "neura.data_mov"(%48) {mapping_locs = [{id = 43 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: "neura.return"(%49) {mapping_locs = [{id = 13 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 3 : i32}]} : (!neura.data) -> () +// MAPPING: func.func @loop_test() -> f32 attributes {accelerator = "neura", mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPING-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> {mapping_locs = [{id = 8 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> {mapping_locs = [{id = 15 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 3 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %3 = neura.reserve : !neura.data +// MAPPING-NEXT: %4 = "neura.data_mov"(%1) {mapping_locs = [{id = 24 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %5 = "neura.phi"(%3, %4) {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %6 = neura.reserve : !neura.data +// MAPPING-NEXT: %7 = "neura.data_mov"(%2) {mapping_locs = [{id = 60 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %8 = "neura.phi"(%6, %7) {mapping_locs = [{id = 15 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %9 = neura.reserve : !neura.data +// MAPPING-NEXT: %10 = "neura.data_mov"(%0) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %11 = "neura.phi"(%9, %10) {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %12 = "neura.data_mov"(%8) {mapping_locs = [{id = 46 : i32, resource = "link", time_step = 3 : i32}, {id = 45 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: 
%13 = "neura.data_mov"(%5) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 3 : i32}, {id = 40 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %14 = "neura.fadd"(%12, %13) {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %15 = "neura.data_mov"(%11) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %16 = "neura.add"(%15) {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %17 = "neura.data_mov"(%16) {mapping_locs = [{id = 35 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %18 = "neura.icmp"(%17) <{cmpType = "slt"}> {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %19 = "neura.data_mov"(%16) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 2 : i32}, {id = 44 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %20 = "neura.data_mov"(%18) {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %21 = neura.grant_predicate %19, %20 {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %21 -> %9 {mapping_locs = [{id = 45 : i32, resource = "register", time_step = 4 : i32}, {id = 45 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %22 = "neura.data_mov"(%14) {mapping_locs = [{id = 40 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %23 = "neura.data_mov"(%18) {mapping_locs = [{id = 41 : i32, resource = "register", time_step = 3 : i32}, {id = 41 : i32, resource = "register", time_step = 4 : i32}, {id = 41 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %24 = neura.grant_predicate %22, %23 {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %24 -> %6 {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 6 : i32}, {id = 37 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %25 = "neura.data_mov"(%5) {mapping_locs = [{id = 36 : i32, resource = "register", time_step = 3 : i32}, {id = 36 : i32, resource = "register", time_step = 4 : i32}, {id = 36 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %26 = "neura.data_mov"(%18) {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 3 : i32}, {id = 37 : i32, resource = "register", time_step = 4 : i32}, {id = 37 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %27 = neura.grant_predicate %25, %26 {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %27 -> %3 {mapping_locs = [{id = 36 : i32, 
resource = "register", time_step = 6 : i32}, {id = 36 : i32, resource = "register", time_step = 7 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %28 = "neura.data_mov"(%18) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %29 = "neura.not"(%28) {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %30 = "neura.data_mov"(%14) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %31 = "neura.data_mov"(%29) {mapping_locs = [{id = 24 : i32, resource = "register", time_step = 4 : i32}, {id = 24 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %32 = neura.grant_predicate %30, %31 {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %33 = "neura.data_mov"(%32) {mapping_locs = [{id = 17 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.return"(%33) {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data) -> () // MAPPING-NEXT: } -// YAML: array_config: +// YAML: array_config: // YAML-NEXT: columns: 4 // YAML-NEXT: rows: 4 // YAML-NEXT: cores: -// YAML-NEXT: - column: 0 -// YAML-NEXT: row: 1 -// YAML-NEXT: core_id: "4" -// YAML-NEXT: entries: -// YAML-NEXT: - entry_id: "entry0" -// YAML-NEXT: instructions: -// YAML-NEXT: - timestep: 2 -// YAML-NEXT: operations: -// YAML-NEXT: - opcode: "GRANT_ONCE" -// YAML-NEXT: src_operands: -// YAML-NEXT: - operand: "#3.000000" -// YAML-NEXT: color: "RED" -// YAML-NEXT: dst_operands: -// YAML-NEXT: - operand: "EAST" -// YAML-NEXT: color: "RED" - +// YAML-NEXT: - column: 1 +// YAML-NEXT: row: 1 +// YAML-NEXT: core_id: "5" +// YAML-NEXT: entries: +// YAML-NEXT: - entry_id: "entry0" +// YAML-NEXT: instructions: +// YAML-NEXT: - timestep: 7 +// YAML-NEXT: operations: +// YAML-NEXT: - opcode: "RETURN" +// YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "EAST" +// YAML-NEXT: color: "RED" -// ASM-LABEL: PE(0,1): -// ASM: { -// ASM: GRANT_ONCE, [#3.000000] -> [EAST, RED] -// ASM: } (t=2) -// ASM-LABEL: PE(1,1): -// ASM: { -// ASM: DATA_MOV, [NORTH, RED] -> [EAST, RED] -// ASM: } (t=2) -// ASM: { -// ASM: PHI, [$20], [WEST, RED] -> [NORTH, RED], [$20] -// ASM: DATA_MOV, [NORTH, RED] -> [EAST, RED] -// ASM: } (t=3) -// ASM: { -// ASM: DATA_MOV, [SOUTH, RED] -> [$21] -// ASM: CTRL_MOV, [EAST, RED] -> [NORTH, RED] -// ASM: } (t=5) -// ASM: { -// ASM: GRANT_PREDICATE, [$20], [$21] -> [$20] -// ASM: } (t=6) \ No newline at end of file +// ASM: PE(2,2): +// ASM-NEXT: { +// ASM-NEXT: ICMP, [EAST, RED] -> [EAST, RED], [$41], [WEST, RED], [SOUTH, RED] +// ASM-NEXT: } (t=3) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [$40] +// ASM-NEXT: } (t=4) +// ASM-NEXT: { +// ASM-NEXT: FADD, [NORTH, RED], [$40] -> [$40], [SOUTH, RED] +// ASM-NEXT: } (t=5) +// ASM-NEXT: { +// ASM-NEXT: GRANT_PREDICATE, [$40], [$41] -> [EAST, RED] +// ASM-NEXT: } (t=6) \ No newline at end of file diff --git a/test/neura/interpreter/basic_operation/add.mlir b/test/neura/interpreter/basic_operation/add.mlir index 5496aa1c..8d82a87f 100644 --- a/test/neura/interpreter/basic_operation/add.mlir +++ b/test/neura/interpreter/basic_operation/add.mlir @@ -46,15 +46,17 @@ func.func 
@test_add_zero() -> f32 { // RUN: neura-interpreter %s | FileCheck %s +// Remove Test 5 because we plan to remove the predicate attribute in +// https://github.com/coredac/dataflow/issues/116 + // ===----------------------------------------------------------------------===// // Test 5: Add with operation predicate 0 // ===----------------------------------------------------------------------===// -func.func @test_add_predicate_zero() -> f32 { - %a = arith.constant 10.0 : f32 - %b = arith.constant 32.0 : f32 - %pred = arith.constant 0 : i1 - %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 - %res = "neura.add"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 - // CHECK: [neura-interpreter] → Output: 0.000000 - return %res : f32 -} \ No newline at end of file +// func.func @test_add_predicate_zero() -> f32 { +// %a = arith.constant 10.0 : f32 +// %b = arith.constant 32.0 : f32 +// %pred = arith.constant 0 : i1 +// %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 +// %res = "neura.add"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 +// return %res : f32 +// } \ No newline at end of file diff --git a/test/neura/interpreter/basic_operation/fadd.mlir b/test/neura/interpreter/basic_operation/fadd.mlir index d1b1d9ac..dbbb30e1 100644 --- a/test/neura/interpreter/basic_operation/fadd.mlir +++ b/test/neura/interpreter/basic_operation/fadd.mlir @@ -44,29 +44,30 @@ func.func @test_fadd_zero() -> f32 { return %res : f32 } +// Remove tests with predicate handling because we plan to remove the predicate attribute in +// https://github.com/coredac/dataflow/issues/116 + // ===----------------------------------------------------------------------===// // Test 4: Predicate handling in neura.fadd // ===----------------------------------------------------------------------===// -func.func @test_fadd_invalid_predicate() -> f32 { - %a = arith.constant 0.0 : f32 - %b = arith.constant 25.5 : f32 - %pred = arith.constant 0 : i1 - %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 - %res = "neura.fadd"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 - // CHECK: [neura-interpreter] → Output: 0.000000 - return %res : f32 -} +// func.func @test_fadd_invalid_predicate() -> f32 { +// %a = arith.constant 0.0 : f32 +// %b = arith.constant 25.5 : f32 +// %pred = arith.constant 0 : i1 +// %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 +// %res = "neura.fadd"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 +// return %res : f32 +// } // ===----------------------------------------------------------------------===// // Test 5: Nested predicate handling in neura.fadd // ===----------------------------------------------------------------------===// -func.func @test_nested_fadd_invalid_predicate() -> f32 { - %a = arith.constant 0.0 : f32 - %b = arith.constant 25.5 : f32 - %pred = arith.constant 0 : i1 - %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 - %tmp = "neura.fadd"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 - %res = "neura.fadd"(%tmp, %b, %pred_f32) : (f32, f32, f32) -> f32 - // CHECK: [neura-interpreter] → Output: 0.000000 - return %res : f32 -} \ No newline at end of file +// func.func @test_nested_fadd_invalid_predicate() -> f32 { +// %a = arith.constant 0.0 : f32 +// %b = arith.constant 25.5 : f32 +// %pred = arith.constant 0 : i1 +// %pred_f32 = "neura.cast"(%pred) {cast_type = "bool2f"} : (i1) -> f32 +// %tmp = "neura.fadd"(%a, %b, %pred_f32) : (f32, f32, f32) -> f32 +// %res = "neura.fadd"(%tmp, %b, %pred_f32) : (f32, f32, f32) -> 
f32 +// return %res : f32 +// } \ No newline at end of file diff --git a/test/neura/interpreter/basic_operation/sub.mlir b/test/neura/interpreter/basic_operation/sub.mlir index 8a6ce158..0ff4224b 100644 --- a/test/neura/interpreter/basic_operation/sub.mlir +++ b/test/neura/interpreter/basic_operation/sub.mlir @@ -20,24 +20,25 @@ func.func @test_sub_negative() -> i32 { return %res : i32 } +// Remove tests with predicate input because we plan to remove the predicate attribute in +// https://github.com/coredac/dataflow/issues/116 + // Test subtraction with predicate=true -func.func @test_sub_with_predicate_true() -> i32 { - %a = arith.constant 300 : i32 - %b = arith.constant 100 : i32 - %pred = arith.constant 1 : i32 - %res = "neura.sub"(%a, %b, %pred) : (i32, i32, i32) -> i32 +// func.func @test_sub_with_predicate_true() -> i32 { +// %a = arith.constant 300 : i32 +// %b = arith.constant 100 : i32 +// %pred = arith.constant 1 : i32 +// %res = "neura.sub"(%a, %b, %pred) : (i32, i32, i32) -> i32 - // CHECK: [neura-interpreter] → Output: 200.000000 - return %res : i32 -} +// return %res : i32 +// } -// Test subtraction with predicate=false -func.func @test_sub_with_predicate_false() -> i32 { - %a = arith.constant 500 : i32 - %b = arith.constant 200 : i32 - %pred = arith.constant 0 : i32 - %res = "neura.sub"(%a, %b, %pred) : (i32, i32, i32) -> i32 - // CHECK: [neura-interpreter] → Output: 0.000000 - return %res : i32 -} \ No newline at end of file +// Test subtraction with predicate=false +// func.func @test_sub_with_predicate_false() -> i32 { +// %a = arith.constant 500 : i32 +// %b = arith.constant 200 : i32 +// %pred = arith.constant 0 : i32 +// %res = "neura.sub"(%a, %b, %pred) : (i32, i32, i32) -> i32 +// return %res : i32 +// } \ No newline at end of file diff --git a/tools/neura-interpreter/neura-interpreter.cpp b/tools/neura-interpreter/neura-interpreter.cpp index 52350c46..4d2803c5 100644 --- a/tools/neura-interpreter/neura-interpreter.cpp +++ b/tools/neura-interpreter/neura-interpreter.cpp @@ -1295,17 +1295,18 @@ bool handleFCmpOp( << rhs.value << ", [pred = " << rhs.predicate << "]\n"; } + // TODO: Support comparison with a constant value. bool pred = true; - if (op.getNumOperands() > 2) { - auto pred_data = value_to_predicated_data_map[op.getPredicate()]; - pred = pred_data.predicate && (pred_data.value != 0.0f); - if (isVerboseMode()) { - llvm::outs() << "[neura-interpreter] ├─ Execution Context\n"; - llvm::outs() << "[neura-interpreter] │ └─ Pred : value = " - << pred_data.value << ", [pred = " << pred_data.predicate - << "]\n"; - } - } + // if (op.getNumOperands() > 2) { + // auto pred_data = value_to_predicated_data_map[op.getPredicate()]; + // pred = pred_data.predicate && (pred_data.value != 0.0f); + // if (isVerboseMode()) { + // llvm::outs() << "[neura-interpreter] ├─ Execution Context\n"; + // llvm::outs() << "[neura-interpreter] │ └─ Pred : value = " + // << pred_data.value << ", [pred = " << pred_data.predicate + // << "]\n"; + // } + // } bool fcmp_result = false; StringRef cmp_type = op.getCmpType(); @@ -1400,17 +1401,18 @@ bool handleICmpOp( << rhs.value << ", [pred = " << rhs.predicate << "]\n"; } + // TODO: Support comparison with a constant value. 
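  // A minimal sketch of one way this TODO could be addressed, assuming the rhs has
  // already been folded by FoldConstantPass into an `rhs_const_value` attribute
  // (the accessor and assignment below are illustrative only, not code from this patch):
  //   if (auto rhs_attr = op->getAttrOfType<IntegerAttr>("rhs_const_value"))
  //     rhs.value = rhs_attr.getInt();   // FloatAttr would be the analogue in the fcmp handler.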
bool pred = true; - if (op.getNumOperands() > 2) { - auto pred_data = value_to_predicated_data_map[op.getPredicate()]; - pred = pred_data.predicate && (pred_data.value != 0.0f); - if (isVerboseMode()) { - llvm::outs() << "[neura-interpreter] ├─ Execution Context\n"; - llvm::outs() << "[neura-interpreter] │ └─ Pred : value = " - << pred_data.value << ", [pred = " << pred_data.predicate - << "]\n"; - } - } + // if (op.getNumOperands() > 2) { + // auto pred_data = value_to_predicated_data_map[op.getPredicate()]; + // pred = pred_data.predicate && (pred_data.value != 0.0f); + // if (isVerboseMode()) { + // llvm::outs() << "[neura-interpreter] ├─ Execution Context\n"; + // llvm::outs() << "[neura-interpreter] │ └─ Pred : value = " + // << pred_data.value << ", [pred = " << pred_data.predicate + // << "]\n"; + // } + // } // Converts stored floating-point values to signed integers (rounded to // nearest integer). int64_t s_lhs = static_cast(std::round(lhs.value)); From 2dac54e8355e10433ea846c994a719e9be9af14f Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 16 Sep 2025 16:27:20 +0800 Subject: [PATCH 2/3] change the file name & comments --- lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt | 8 ++++---- .../{HWAgnosticOpt => HwAgnosticOpt}/CMakeLists.txt | 2 +- .../{HWAgnosticOpt => HwAgnosticOpt}/FoldConstantPass.cpp | 2 +- .../{HWSpecificOpt => HwSpecificOpt}/CMakeLists.txt | 2 +- .../FuseLoopControlPass.cpp | 0 test/neura/interpreter/basic_operation/add.mlir | 2 +- test/neura/interpreter/basic_operation/fadd.mlir | 2 +- test/neura/interpreter/basic_operation/sub.mlir | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) rename lib/NeuraDialect/Transforms/Optimizations/{HWAgnosticOpt => HwAgnosticOpt}/CMakeLists.txt (83%) rename lib/NeuraDialect/Transforms/Optimizations/{HWAgnosticOpt => HwAgnosticOpt}/FoldConstantPass.cpp (99%) rename lib/NeuraDialect/Transforms/Optimizations/{HWSpecificOpt => HwSpecificOpt}/CMakeLists.txt (83%) rename lib/NeuraDialect/Transforms/Optimizations/{HWSpecificOpt => HwSpecificOpt}/FuseLoopControlPass.cpp (100%) diff --git a/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt index 770b2b40..1c75d3c8 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/Optimizations/CMakeLists.txt @@ -1,7 +1,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) -add_subdirectory(HWAgnosticOpt) -add_subdirectory(HWSpecificOpt) +add_subdirectory(HwAgnosticOpt) +add_subdirectory(HwSpecificOpt) add_library(MLIRNUERAOptimization INTERFACE) @@ -13,7 +13,7 @@ target_link_libraries(MLIRNUERAOptimization INTERFACE MLIRSupport MLIRTransforms MLIRNeura - MLIRNeuraHWSpecificOpt - MLIRNeuraHWAgnosticOpt + MLIRNeuraHwSpecificOpt + MLIRNeuraHwAgnosticOpt ${dialect_libs} ) \ No newline at end of file diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/CMakeLists.txt similarity index 83% rename from lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt rename to lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/CMakeLists.txt index e41dce26..bf1c71af 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/CMakeLists.txt @@ -1,6 +1,6 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_mlir_conversion_library(MLIRNeuraHWAgnosticOpt 
+add_mlir_conversion_library(MLIRNeuraHwAgnosticOpt FoldConstantPass.cpp DEPENDS diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp similarity index 99% rename from lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp rename to lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp index 1abe7718..8218c4f6 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HWAgnosticOpt/FoldConstantPass.cpp +++ b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp @@ -96,7 +96,7 @@ void addConstantAttribute(Operation *op, StringRef attr_name, } // A template pattern to fuse binary operations with a constant on the -// right-hand side. +// right-hand side operand. template struct FuseRhsConstantPattern : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt b/lib/NeuraDialect/Transforms/Optimizations/HwSpecificOpt/CMakeLists.txt similarity index 83% rename from lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt rename to lib/NeuraDialect/Transforms/Optimizations/HwSpecificOpt/CMakeLists.txt index 2a2e52aa..41a5b7f2 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/CMakeLists.txt +++ b/lib/NeuraDialect/Transforms/Optimizations/HwSpecificOpt/CMakeLists.txt @@ -1,6 +1,6 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) -add_mlir_conversion_library(MLIRNeuraHWSpecificOpt +add_mlir_conversion_library(MLIRNeuraHwSpecificOpt FuseLoopControlPass.cpp DEPENDS diff --git a/lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HwSpecificOpt/FuseLoopControlPass.cpp similarity index 100% rename from lib/NeuraDialect/Transforms/Optimizations/HWSpecificOpt/FuseLoopControlPass.cpp rename to lib/NeuraDialect/Transforms/Optimizations/HwSpecificOpt/FuseLoopControlPass.cpp diff --git a/test/neura/interpreter/basic_operation/add.mlir b/test/neura/interpreter/basic_operation/add.mlir index 8d82a87f..9ecdc8b3 100644 --- a/test/neura/interpreter/basic_operation/add.mlir +++ b/test/neura/interpreter/basic_operation/add.mlir @@ -46,7 +46,7 @@ func.func @test_add_zero() -> f32 { // RUN: neura-interpreter %s | FileCheck %s -// Remove Test 5 because we plan to remove the predicate attribute in +// TODO: Remove Test 5 because we plan to remove the predicate attribute in // https://github.com/coredac/dataflow/issues/116 // ===----------------------------------------------------------------------===// diff --git a/test/neura/interpreter/basic_operation/fadd.mlir b/test/neura/interpreter/basic_operation/fadd.mlir index dbbb30e1..9c9631ca 100644 --- a/test/neura/interpreter/basic_operation/fadd.mlir +++ b/test/neura/interpreter/basic_operation/fadd.mlir @@ -44,7 +44,7 @@ func.func @test_fadd_zero() -> f32 { return %res : f32 } -// Remove tests with predicate handling because we plan to remove the predicate attribute in +// TODO: Remove tests with predicate handling because we plan to remove the predicate attribute in // https://github.com/coredac/dataflow/issues/116 // ===----------------------------------------------------------------------===// diff --git a/test/neura/interpreter/basic_operation/sub.mlir b/test/neura/interpreter/basic_operation/sub.mlir index 0ff4224b..a5047488 100644 --- a/test/neura/interpreter/basic_operation/sub.mlir +++ 
b/test/neura/interpreter/basic_operation/sub.mlir @@ -20,7 +20,7 @@ func.func @test_sub_negative() -> i32 { return %res : i32 } -// Remove tests with predicate input because we plan to remove the predicate attribute in +// TODO: Remove tests with predicate input because we plan to remove the predicate attribute in // https://github.com/coredac/dataflow/issues/116 // Test subtraction with predicate=true From 38eced9737c7ac607ed28414029891edb1d0d339 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 16 Sep 2025 19:51:48 +0800 Subject: [PATCH 3/3] add fadd fuse pattern --- .../HwAgnosticOpt/FoldConstantPass.cpp | 1 + test/neura/ctrl/branch_for.mlir | 276 ++++++++---------- 2 files changed, 127 insertions(+), 150 deletions(-) diff --git a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp index 8218c4f6..7283ef31 100644 --- a/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp +++ b/lib/NeuraDialect/Transforms/Optimizations/HwAgnosticOpt/FoldConstantPass.cpp @@ -225,6 +225,7 @@ struct FoldConstantPass patterns.add(&getContext()); patterns.add(&getContext()); patterns.add(&getContext()); + patterns.add(&getContext()); patterns.add(&getContext()); FrozenRewritePatternSet frozen(std::move(patterns)); diff --git a/test/neura/ctrl/branch_for.mlir b/test/neura/ctrl/branch_for.mlir index f478793f..f0e26134 100644 --- a/test/neura/ctrl/branch_for.mlir +++ b/test/neura/ctrl/branch_for.mlir @@ -102,181 +102,157 @@ func.func @loop_test() -> f32 { // CHECK-NEXT: "neura.return"(%10) : (!neura.data) -> () // CHECK-NEXT: } -// CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// CANONICALIZE-NEXT: %0 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64 -// CANONICALIZE-NEXT: %1 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> f32 -// CANONICALIZE-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> f32 -// CANONICALIZE-NEXT: neura.br %0, %2, %1 : i64, f32, f32 to ^bb1 -// CANONICALIZE-NEXT: ^bb1(%3: i64, %4: f32, %5: f32): // 2 preds: ^bb0, ^bb1 -// CANONICALIZE-NEXT: %6 = "neura.fadd"(%4, %5) : (f32, f32) -> f32 -// CANONICALIZE-NEXT: %7 = "neura.add"(%3) {rhs_const_value = 1 : i64} : (i64) -> i64 -// CANONICALIZE-NEXT: %8 = "neura.icmp"(%7) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (i64) -> i1 -// CANONICALIZE-NEXT: neura.cond_br %8 : i1 then %7, %6, %5 : i64, f32, f32 to ^bb1 else %6 : f32 to ^bb2 -// CANONICALIZE-NEXT: ^bb2(%9: f32): // pred: ^bb1 -// CANONICALIZE-NEXT: "neura.return"(%9) : (f32) -> () -// CANONICALIZE-NEXT: } +// CANONICALIZE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { +// CANONICALIZE-NEXT: %0 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> i64 +// CANONICALIZE-NEXT: %1 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> f32 +// CANONICALIZE-NEXT: neura.br %0, %1 : i64, f32 to ^bb1 +// CANONICALIZE-NEXT: ^bb1(%2: i64, %3: f32): // 2 preds: ^bb0, ^bb1 +// CANONICALIZE-NEXT: %4 = "neura.fadd"(%3) {rhs_const_value = 3.000000e+00 : f32} : (f32) -> f32 +// CANONICALIZE-NEXT: %5 = "neura.add"(%2) {rhs_const_value = 1 : i64} : (i64) -> i64 +// CANONICALIZE-NEXT: %6 = "neura.icmp"(%5) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (i64) -> i1 +// CANONICALIZE-NEXT: neura.cond_br %6 : i1 then %5, %4 : i64, f32 to ^bb1 else %4 : f32 to ^bb2 +// CANONICALIZE-NEXT: ^bb2(%7: 
f32): // pred: ^bb1 +// CANONICALIZE-NEXT: "neura.return"(%7) : (f32) -> () +// CANONICALIZE-NEXT: } // CTRL2DATA: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { // CTRL2DATA-NEXT: %0 = "neura.constant"() <{predicate = true, value = 0 : i64}> : () -> !neura.data // CTRL2DATA-NEXT: %1 = "neura.grant_once"(%0) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %2 = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> !neura.data +// CTRL2DATA-NEXT: %2 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> !neura.data // CTRL2DATA-NEXT: %3 = "neura.grant_once"(%2) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %4 = "neura.constant"() <{predicate = true, value = 0.000000e+00 : f32}> : () -> !neura.data -// CTRL2DATA-NEXT: %5 = "neura.grant_once"(%4) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %6 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %7 = "neura.phi"(%6, %3) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %8 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %9 = "neura.phi"(%8, %5) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %10 = neura.reserve : !neura.data -// CTRL2DATA-NEXT: %11 = "neura.phi"(%10, %1) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %12 = "neura.fadd"(%9, %7) : (!neura.data, !neura.data) -> !neura.data -// CTRL2DATA-NEXT: %13 = "neura.add"(%11) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %14 = "neura.icmp"(%13) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %15 = neura.grant_predicate %13, %14 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %15 -> %10 : !neura.data !neura.data -// CTRL2DATA-NEXT: %16 = neura.grant_predicate %12, %14 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %16 -> %8 : !neura.data !neura.data -// CTRL2DATA-NEXT: %17 = neura.grant_predicate %7, %14 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: neura.ctrl_mov %17 -> %6 : !neura.data !neura.data -// CTRL2DATA-NEXT: %18 = "neura.not"(%14) : (!neura.data) -> !neura.data -// CTRL2DATA-NEXT: %19 = neura.grant_predicate %12, %18 : !neura.data, !neura.data -> !neura.data -// CTRL2DATA-NEXT: "neura.return"(%19) : (!neura.data) -> () +// CTRL2DATA-NEXT: %4 = neura.reserve : !neura.data +// CTRL2DATA-NEXT: %5 = "neura.phi"(%4, %3) : (!neura.data, !neura.data) -> !neura.data +// CTRL2DATA-NEXT: %6 = neura.reserve : !neura.data +// CTRL2DATA-NEXT: %7 = "neura.phi"(%6, %1) : (!neura.data, !neura.data) -> !neura.data +// CTRL2DATA-NEXT: %8 = "neura.fadd"(%5) {rhs_const_value = 3.000000e+00 : f32} : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %9 = "neura.add"(%7) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %10 = "neura.icmp"(%9) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %11 = neura.grant_predicate %9, %10 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: neura.ctrl_mov %11 -> %6 : !neura.data !neura.data +// CTRL2DATA-NEXT: %12 = neura.grant_predicate %8, %10 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: neura.ctrl_mov %12 -> %4 : !neura.data !neura.data +// CTRL2DATA-NEXT: %13 = "neura.not"(%10) : (!neura.data) -> !neura.data +// CTRL2DATA-NEXT: %14 = neura.grant_predicate %8, %13 : !neura.data, !neura.data -> !neura.data +// CTRL2DATA-NEXT: "neura.return"(%14) : (!neura.data) -> () // 
CTRL2DATA-NEXT: } -// FUSE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { -// FUSE-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data -// FUSE-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data -// FUSE-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data -// FUSE-NEXT: %3 = neura.reserve : !neura.data -// FUSE-NEXT: %4 = "neura.phi"(%3, %1) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %5 = neura.reserve : !neura.data -// FUSE-NEXT: %6 = "neura.phi"(%5, %2) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %7 = neura.reserve : !neura.data -// FUSE-NEXT: %8 = "neura.phi"(%7, %0) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %9 = "neura.fadd"(%6, %4) : (!neura.data, !neura.data) -> !neura.data -// FUSE-NEXT: %10 = "neura.add"(%8) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data -// FUSE-NEXT: %11 = "neura.icmp"(%10) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data -// FUSE-NEXT: %12 = neura.grant_predicate %10, %11 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %12 -> %7 : !neura.data !neura.data -// FUSE-NEXT: %13 = neura.grant_predicate %9, %11 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %13 -> %5 : !neura.data !neura.data -// FUSE-NEXT: %14 = neura.grant_predicate %4, %11 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: neura.ctrl_mov %14 -> %3 : !neura.data !neura.data -// FUSE-NEXT: %15 = "neura.not"(%11) : (!neura.data) -> !neura.data -// FUSE-NEXT: %16 = neura.grant_predicate %9, %15 : !neura.data, !neura.data -> !neura.data -// FUSE-NEXT: "neura.return"(%16) : (!neura.data) -> () -// FUSE-NEXT: } +// FUSE: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { +// FUSE-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data +// FUSE-NEXT: %1 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data +// FUSE-NEXT: %2 = neura.reserve : !neura.data +// FUSE-NEXT: %3 = "neura.phi"(%2, %1) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %4 = neura.reserve : !neura.data +// FUSE-NEXT: %5 = "neura.phi"(%4, %0) : (!neura.data, !neura.data) -> !neura.data +// FUSE-NEXT: %6 = "neura.fadd"(%3) {rhs_const_value = 3.000000e+00 : f32} : (!neura.data) -> !neura.data +// FUSE-NEXT: %7 = "neura.add"(%5) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// FUSE-NEXT: %8 = "neura.icmp"(%7) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// FUSE-NEXT: %9 = neura.grant_predicate %7, %8 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: neura.ctrl_mov %9 -> %4 : !neura.data !neura.data +// FUSE-NEXT: %10 = neura.grant_predicate %6, %8 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: neura.ctrl_mov %10 -> %2 : !neura.data !neura.data +// FUSE-NEXT: %11 = "neura.not"(%8) : (!neura.data) -> !neura.data +// FUSE-NEXT: %12 = neura.grant_predicate %6, %11 : !neura.data, !neura.data -> !neura.data +// FUSE-NEXT: "neura.return"(%12) : (!neura.data) -> () +// FUSE-NEXT: } // MOV: func.func @loop_test() -> f32 attributes {accelerator = "neura"} { // MOV-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data -// MOV-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> : () -> !neura.data -// MOV-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> 
!neura.data -// MOV-NEXT: %3 = neura.reserve : !neura.data -// MOV-NEXT: %4 = "neura.data_mov"(%1) : (!neura.data) -> !neura.data -// MOV-NEXT: %5 = "neura.phi"(%3, %4) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %6 = neura.reserve : !neura.data -// MOV-NEXT: %7 = "neura.data_mov"(%2) : (!neura.data) -> !neura.data -// MOV-NEXT: %8 = "neura.phi"(%6, %7) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %9 = neura.reserve : !neura.data -// MOV-NEXT: %10 = "neura.data_mov"(%0) : (!neura.data) -> !neura.data -// MOV-NEXT: %11 = "neura.phi"(%9, %10) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %12 = "neura.data_mov"(%8) : (!neura.data) -> !neura.data -// MOV-NEXT: %13 = "neura.data_mov"(%5) : (!neura.data) -> !neura.data -// MOV-NEXT: %14 = "neura.fadd"(%12, %13) : (!neura.data, !neura.data) -> !neura.data -// MOV-NEXT: %15 = "neura.data_mov"(%11) : (!neura.data) -> !neura.data -// MOV-NEXT: %16 = "neura.add"(%15) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data -// MOV-NEXT: %17 = "neura.data_mov"(%16) : (!neura.data) -> !neura.data -// MOV-NEXT: %18 = "neura.icmp"(%17) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data -// MOV-NEXT: %19 = "neura.data_mov"(%16) : (!neura.data) -> !neura.data -// MOV-NEXT: %20 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data -// MOV-NEXT: %21 = neura.grant_predicate %19, %20 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %21 -> %9 : !neura.data !neura.data -// MOV-NEXT: %22 = "neura.data_mov"(%14) : (!neura.data) -> !neura.data -// MOV-NEXT: %23 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data +// MOV-NEXT: %1 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> : () -> !neura.data +// MOV-NEXT: %2 = neura.reserve : !neura.data +// MOV-NEXT: %3 = "neura.data_mov"(%1) : (!neura.data) -> !neura.data +// MOV-NEXT: %4 = "neura.phi"(%2, %3) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %5 = neura.reserve : !neura.data +// MOV-NEXT: %6 = "neura.data_mov"(%0) : (!neura.data) -> !neura.data +// MOV-NEXT: %7 = "neura.phi"(%5, %6) : (!neura.data, !neura.data) -> !neura.data +// MOV-NEXT: %8 = "neura.data_mov"(%4) : (!neura.data) -> !neura.data +// MOV-NEXT: %9 = "neura.fadd"(%8) {rhs_const_value = 3.000000e+00 : f32} : (!neura.data) -> !neura.data +// MOV-NEXT: %10 = "neura.data_mov"(%7) : (!neura.data) -> !neura.data +// MOV-NEXT: %11 = "neura.add"(%10) {rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// MOV-NEXT: %12 = "neura.data_mov"(%11) : (!neura.data) -> !neura.data +// MOV-NEXT: %13 = "neura.icmp"(%12) <{cmpType = "slt"}> {rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// MOV-NEXT: %14 = "neura.data_mov"(%11) : (!neura.data) -> !neura.data +// MOV-NEXT: %15 = "neura.data_mov"(%13) : (!neura.data) -> !neura.data +// MOV-NEXT: %16 = neura.grant_predicate %14, %15 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: neura.ctrl_mov %16 -> %5 : !neura.data !neura.data +// MOV-NEXT: %17 = "neura.data_mov"(%9) : (!neura.data) -> !neura.data +// MOV-NEXT: %18 = "neura.data_mov"(%13) : (!neura.data) -> !neura.data +// MOV-NEXT: %19 = neura.grant_predicate %17, %18 : !neura.data, !neura.data -> !neura.data +// MOV-NEXT: neura.ctrl_mov %19 -> %2 : !neura.data !neura.data +// MOV-NEXT: %20 = "neura.data_mov"(%13) : (!neura.data) -> !neura.data +// MOV-NEXT: %21 = "neura.not"(%20) : (!neura.data) -> !neura.data +// MOV-NEXT: %22 = "neura.data_mov"(%9) : (!neura.data) -> !neura.data +// MOV-NEXT: %23 = 
"neura.data_mov"(%21) : (!neura.data) -> !neura.data // MOV-NEXT: %24 = neura.grant_predicate %22, %23 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %24 -> %6 : !neura.data !neura.data -// MOV-NEXT: %25 = "neura.data_mov"(%5) : (!neura.data) -> !neura.data -// MOV-NEXT: %26 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data -// MOV-NEXT: %27 = neura.grant_predicate %25, %26 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: neura.ctrl_mov %27 -> %3 : !neura.data !neura.data -// MOV-NEXT: %28 = "neura.data_mov"(%18) : (!neura.data) -> !neura.data -// MOV-NEXT: %29 = "neura.not"(%28) : (!neura.data) -> !neura.data -// MOV-NEXT: %30 = "neura.data_mov"(%14) : (!neura.data) -> !neura.data -// MOV-NEXT: %31 = "neura.data_mov"(%29) : (!neura.data) -> !neura.data -// MOV-NEXT: %32 = neura.grant_predicate %30, %31 : !neura.data, !neura.data -> !neura.data -// MOV-NEXT: %33 = "neura.data_mov"(%32) : (!neura.data) -> !neura.data -// MOV-NEXT: "neura.return"(%33) : (!neura.data) -> () +// MOV-NEXT: %25 = "neura.data_mov"(%24) : (!neura.data) -> !neura.data +// MOV-NEXT: "neura.return"(%25) : (!neura.data) -> () // MOV-NEXT: } -// MAPPING: func.func @loop_test() -> f32 attributes {accelerator = "neura", mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { -// MAPPING-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %1 = "neura.grant_once"() <{constant_value = 3.000000e+00 : f32}> {mapping_locs = [{id = 8 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 2 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %2 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> {mapping_locs = [{id = 15 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 3 : i32}]} : () -> !neura.data -// MAPPING-NEXT: %3 = neura.reserve : !neura.data -// MAPPING-NEXT: %4 = "neura.data_mov"(%1) {mapping_locs = [{id = 24 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %5 = "neura.phi"(%3, %4) {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %6 = neura.reserve : !neura.data -// MAPPING-NEXT: %7 = "neura.data_mov"(%2) {mapping_locs = [{id = 60 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %8 = "neura.phi"(%6, %7) {mapping_locs = [{id = 15 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %9 = neura.reserve : !neura.data -// MAPPING-NEXT: %10 = "neura.data_mov"(%0) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %11 = "neura.phi"(%9, %10) {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %12 = "neura.data_mov"(%8) {mapping_locs = [{id = 46 : i32, resource = "link", time_step = 3 : i32}, {id = 45 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %13 = "neura.data_mov"(%5) {mapping_locs = [{id = 28 : i32, resource = "link", time_step = 3 : i32}, {id = 40 : i32, 
resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %14 = "neura.fadd"(%12, %13) {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data, !neura.data) -> !neura.data -// MAPPING-NEXT: %15 = "neura.data_mov"(%11) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %16 = "neura.add"(%15) {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %17 = "neura.data_mov"(%16) {mapping_locs = [{id = 35 : i32, resource = "link", time_step = 2 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %18 = "neura.icmp"(%17) <{cmpType = "slt"}> {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %19 = "neura.data_mov"(%16) {mapping_locs = [{id = 44 : i32, resource = "register", time_step = 2 : i32}, {id = 44 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %20 = "neura.data_mov"(%18) {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %21 = neura.grant_predicate %19, %20 {mapping_locs = [{id = 11 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %21 -> %9 {mapping_locs = [{id = 45 : i32, resource = "register", time_step = 4 : i32}, {id = 45 : i32, resource = "register", time_step = 5 : i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %22 = "neura.data_mov"(%14) {mapping_locs = [{id = 40 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %23 = "neura.data_mov"(%18) {mapping_locs = [{id = 41 : i32, resource = "register", time_step = 3 : i32}, {id = 41 : i32, resource = "register", time_step = 4 : i32}, {id = 41 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %24 = neura.grant_predicate %22, %23 {mapping_locs = [{id = 10 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %24 -> %6 {mapping_locs = [{id = 32 : i32, resource = "link", time_step = 6 : i32}, {id = 37 : i32, resource = "link", time_step = 7 : i32}]} : !neura.data !neura.data -// MAPPING-NEXT: %25 = "neura.data_mov"(%5) {mapping_locs = [{id = 36 : i32, resource = "register", time_step = 3 : i32}, {id = 36 : i32, resource = "register", time_step = 4 : i32}, {id = 36 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %26 = "neura.data_mov"(%18) {mapping_locs = [{id = 31 : i32, resource = "link", time_step = 3 : i32}, {id = 37 : i32, resource = "register", time_step = 4 : i32}, {id = 37 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %27 = neura.grant_predicate %25, %26 {mapping_locs = [{id = 9 : i32, resource = "tile", time_step = 6 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: neura.ctrl_mov %27 -> %3 {mapping_locs = [{id = 36 : i32, resource = "register", time_step = 6 : i32}, {id = 36 : i32, resource = "register", time_step = 7 : i32}]} : !neura.data 
!neura.data -// MAPPING-NEXT: %28 = "neura.data_mov"(%18) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %29 = "neura.not"(%28) {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %30 = "neura.data_mov"(%14) {mapping_locs = [{id = 33 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %31 = "neura.data_mov"(%29) {mapping_locs = [{id = 24 : i32, resource = "register", time_step = 4 : i32}, {id = 24 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: %32 = neura.grant_predicate %30, %31 {mapping_locs = [{id = 6 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data -// MAPPING-NEXT: %33 = "neura.data_mov"(%32) {mapping_locs = [{id = 17 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data) -> !neura.data -// MAPPING-NEXT: "neura.return"(%33) {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 1 : i32}]} : (!neura.data) -> () +// MAPPING: func.func @loop_test() -> f32 attributes {accelerator = "neura", mapping_info = {compiled_ii = 4 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 4 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} { +// MAPPING-NEXT: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {mapping_locs = [{id = 0 : i32, resource = "tile", time_step = 0 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %1 = "neura.grant_once"() <{constant_value = 0.000000e+00 : f32}> {mapping_locs = [{id = 0 : i32, resource = "tile", time_step = 2 : i32, x = 0 : i32, y = 0 : i32}]} : () -> !neura.data +// MAPPING-NEXT: %2 = neura.reserve : !neura.data +// MAPPING-NEXT: %3 = "neura.data_mov"(%1) {mapping_locs = [{id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %4 = "neura.phi"(%2, %3) {mapping_locs = [{id = 0 : i32, resource = "tile", time_step = 3 : i32, x = 0 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %5 = neura.reserve : !neura.data +// MAPPING-NEXT: %6 = "neura.data_mov"(%0) {mapping_locs = [{id = 0 : i32, resource = "link", time_step = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %7 = "neura.phi"(%5, %6) {mapping_locs = [{id = 1 : i32, resource = "tile", time_step = 1 : i32, x = 1 : i32, y = 0 : i32}]} : (!neura.data, !neura.data) -> !neura.data +// MAPPING-NEXT: %8 = "neura.data_mov"(%4) {mapping_locs = [{id = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %9 = "neura.fadd"(%8) {mapping_locs = [{id = 1 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 0 : i32}], rhs_const_value = 3.000000e+00 : f32} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %10 = "neura.data_mov"(%7) {mapping_locs = [{id = 4 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %11 = "neura.add"(%10) {mapping_locs = [{id = 1 : i32, resource = "tile", time_step = 2 : i32, x = 1 : i32, y = 0 : i32}], rhs_const_value = 1 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %12 = "neura.data_mov"(%11) {mapping_locs = [{id = 4 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %13 = 
"neura.icmp"(%12) <{cmpType = "slt"}> {mapping_locs = [{id = 1 : i32, resource = "tile", time_step = 3 : i32, x = 1 : i32, y = 0 : i32}], rhs_const_value = 10 : i64} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %14 = "neura.data_mov"(%11) {mapping_locs = [{id = 4 : i32, resource = "link", time_step = 2 : i32}, {id = 20 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %15 = "neura.data_mov"(%13) {mapping_locs = [{id = 4 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %16 = neura.grant_predicate %14, %15 {mapping_locs = [{id = 5 : i32, resource = "tile", time_step = 4 : i32, x = 1 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %16 -> %5 {mapping_locs = [{id = 15 : i32, resource = "link", time_step = 4 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %17 = "neura.data_mov"(%9) {mapping_locs = [{id = 2 : i32, resource = "link", time_step = 4 : i32}, {id = 1 : i32, resource = "link", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %18 = "neura.data_mov"(%13) {mapping_locs = [{id = 2 : i32, resource = "link", time_step = 3 : i32}, {id = 1 : i32, resource = "link", time_step = 4 : i32}, {id = 16 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %19 = neura.grant_predicate %17, %18 {mapping_locs = [{id = 4 : i32, resource = "tile", time_step = 6 : i32, x = 0 : i32, y = 1 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: neura.ctrl_mov %19 -> %2 {mapping_locs = [{id = 11 : i32, resource = "link", time_step = 6 : i32}]} : !neura.data !neura.data +// MAPPING-NEXT: %20 = "neura.data_mov"(%13) {mapping_locs = [{id = 3 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %21 = "neura.not"(%20) {mapping_locs = [{id = 2 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %22 = "neura.data_mov"(%9) {mapping_locs = [{id = 3 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %23 = "neura.data_mov"(%21) {mapping_locs = [{id = 8 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: %24 = neura.grant_predicate %22, %23 {mapping_locs = [{id = 2 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 0 : i32}]} : !neura.data, !neura.data -> !neura.data +// MAPPING-NEXT: %25 = "neura.data_mov"(%24) {mapping_locs = [{id = 8 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data) -> !neura.data +// MAPPING-NEXT: "neura.return"(%25) {mapping_locs = [{id = 2 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data) -> () // MAPPING-NEXT: } // YAML: array_config: // YAML-NEXT: columns: 4 // YAML-NEXT: rows: 4 // YAML-NEXT: cores: -// YAML-NEXT: - column: 1 -// YAML-NEXT: row: 1 -// YAML-NEXT: core_id: "5" +// YAML-NEXT: - column: 0 +// YAML-NEXT: row: 0 +// YAML-NEXT: core_id: "0" // YAML-NEXT: entries: // YAML-NEXT: - entry_id: "entry0" // YAML-NEXT: instructions: -// YAML-NEXT: - timestep: 7 +// YAML-NEXT: - timestep: 0 // YAML-NEXT: operations: -// YAML-NEXT: - opcode: "RETURN" +// YAML-NEXT: - opcode: "GRANT_ONCE" // YAML-NEXT: src_operands: +// YAML-NEXT: - operand: "#0" +// YAML-NEXT: color: "RED" +// YAML-NEXT: dst_operands: // YAML-NEXT: - operand: "EAST" // YAML-NEXT: color: "RED" -// ASM: PE(2,2): 
-// ASM-NEXT: { -// ASM-NEXT: ICMP, [EAST, RED] -> [EAST, RED], [$41], [WEST, RED], [SOUTH, RED] -// ASM-NEXT: } (t=3) -// ASM-NEXT: { -// ASM-NEXT: DATA_MOV, [EAST, RED] -> [$40] -// ASM-NEXT: } (t=4) -// ASM-NEXT: { -// ASM-NEXT: FADD, [NORTH, RED], [$40] -> [$40], [SOUTH, RED] -// ASM-NEXT: } (t=5) -// ASM-NEXT: { -// ASM-NEXT: GRANT_PREDICATE, [$40], [$41] -> [EAST, RED] -// ASM-NEXT: } (t=6) \ No newline at end of file +// ASM: PE(0,0): +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [#0] -> [EAST, RED] +// ASM-NEXT: } (t=0) +// ASM-NEXT: { +// ASM-NEXT: GRANT_ONCE, [#0.000000] -> [$0] +// ASM-NEXT: } (t=2) +// ASM-NEXT: { +// ASM-NEXT: PHI, [NORTH, RED], [$0] -> [EAST, RED] +// ASM-NEXT: } (t=3) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [NORTH, RED] +// ASM-NEXT: } (t=4) +// ASM-NEXT: { +// ASM-NEXT: DATA_MOV, [EAST, RED] -> [NORTH, RED] +// ASM-NEXT: } (t=5) \ No newline at end of file
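For reference, the rhs-constant fusion exercised by this series (and extended to fadd in PATCH 3/3) can be read off the CANONICALIZE expectations above. A minimal before/after sketch, using illustrative SSA names (%acc, %c, %sum) rather than the exact names in the test, looks like:

  // Before folding: the constant is materialized and passed as the rhs operand.
  %c   = "neura.constant"() <{predicate = true, value = 3.000000e+00 : f32}> : () -> f32
  %sum = "neura.fadd"(%acc, %c) : (f32, f32) -> f32

  // After FoldConstantPass: the rhs operand is dropped and its value is carried on
  // the op as the rhs_const_value attribute; the now-dead neura.constant no longer
  // appears in the output. The integer ops (add, icmp) in the test follow the same
  // shape with rhs_const_value = 1 : i64 and 10 : i64.
  %sum = "neura.fadd"(%acc) {rhs_const_value = 3.000000e+00 : f32} : (f32) -> f32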