From 4ee67386cb4dbc2335f617e7d1dd059d6aae04e2 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Sun, 28 Dec 2025 11:46:25 +0800
Subject: [PATCH 1/2] Fixes ReturnOp exit predicate logic and addresses issue
 #209

---
 .../TransformCtrlToDataFlowPass.cpp           |  59 ++
 .../bert/bert_node1/bert_node1.mlir           |  70 +--
 .../bert/bert_node28/bert_node28.mlir         | 101 ++--
 .../complex_nested/complex_nested.mlir        |  12 +-
 .../non_perfect_nested.mlir                   |  14 +-
 .../perfect_nested/perfect_nested.mlir        | 137 ++++-
 .../simple_loop/simple_loop.mlir              |   7 +-
 test/e2e/bicg/bicg_kernel.mlir                |  66 ++-
 test/e2e/histogram/histogram_kernel.mlir      | 200 +++----
 test/e2e/relu/relu_kernel.mlir                | 201 +++----
 test/neura/for_loop/kernel_test.mlir          | 128 ++++-
 test/neura/for_loop/relu_test.mlir            | 525 ++++++++++++++++--
 test/neura/fusion/test.mlir                   |  56 +-
 .../steer_ctrl/loop_without_return_value.mlir |  56 +-
 14 files changed, 1179 insertions(+), 453 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 02d0e8df..55f2db67 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -602,6 +602,62 @@ void transformControlFlowToDataFlow(Region &region, ControlFlowInfo &ctrl_info,
   }
 }
 
+// Injects an exit predicate into the first void neura::ReturnOp of the entry
+// block: the return is rebuilt with a grant_once'd constant-true operand.
+// Value-returning ReturnOps are not modified. `ctrl_info` is currently unused.
+// After injection, `builder` is positioned right before the rebuilt ReturnOp.
+void injectExitPredicateForReturn(Region &region, ControlFlowInfo &ctrl_info,
+                                  OpBuilder &builder) {
+  Block *entry_block = &region.front();
+
+  // Finds the first ReturnOp in the entry block.
+  neura::ReturnOp return_op = nullptr;
+  for (Operation &op : *entry_block) {
+    if (auto rt = dyn_cast<neura::ReturnOp>(op)) {
+      return_op = rt;
+      llvm::errs() << "[ctrl2data] ReturnOp found: " << *rt << "\n";
+      break;
+    }
+  }
+
+  if (!return_op) {
+    return; // No ReturnOp to process.
+  }
+
+  // Checks if ReturnOp has operands (value return).
+  // Only injects exit predicate for void returns.
+  if (return_op.getNumOperands() > 0) {
+    llvm::errs() << "[ctrl2data] ReturnOp has return value, skipping exit "
+                    "predicate injection.\n";
+    return; // Skips modification for value-returning functions.
+  }
+
+  // Computes exit predicate: uses a constant true predicate for now.
+  llvm::errs() << "[ctrl2data] Injecting exit predicate for void ReturnOp.\n";
+
+  auto loc = return_op.getLoc();
+  builder.setInsertionPoint(return_op);
+
+  // Creates a constant true predicate and grants it once.
+  auto i1_type = builder.getI1Type();
+  auto pred_type =
+      neura::PredicatedValue::get(builder.getContext(), i1_type, i1_type);
+  Value true_constant = builder.create<neura::ConstantOp>(
+      loc, pred_type, builder.getIntegerAttr(i1_type, 1));
+  Value granted_true =
+      builder.create<neura::GrantOnceOp>(loc, pred_type, true_constant);
+
+  // Replaces the old ReturnOp with a new one that includes the exit predicate.
+  auto new_return =
+      builder.create<neura::ReturnOp>(loc, ValueRange{granted_true});
+  // Re-anchors the builder on the live op: its insertion point referenced
+  // `return_op`, which would dangle once that op is erased.
+  builder.setInsertionPoint(new_return);
+  return_op.erase();
+
+  llvm::errs() << "[ctrl2data] Injected exit predicate for ReturnOp.\n";
+}
+
 // Converts phi operations with reserve operands to phi_start operations.
 void convertPhiToPhiStart(Region &region, OpBuilder &builder) {
   llvm::errs() << "[ctrl2data] Converting phi operations to phi_start...\n";
@@ -697,6 +753,9 @@ struct TransformCtrlToDataFlowPass
       buildControlFlowInfo(*region, ctrlInfo, domInfo);
       transformControlFlowToDataFlow(*region, ctrlInfo, domInfo, builder);
 
+      // Injects exit predicate for void returns.
+      injectExitPredicateForReturn(*region, ctrlInfo, builder);
+
       // Converts phi operations to phi_start operations.
       convertPhiToPhiStart(*region, builder);
     });
diff --git a/test/affine2neura/bert/bert_node1/bert_node1.mlir b/test/affine2neura/bert/bert_node1/bert_node1.mlir
index dedb0c14..c3ebacee 100644
--- a/test/affine2neura/bert/bert_node1/bert_node1.mlir
+++ b/test/affine2neura/bert/bert_node1/bert_node1.mlir
@@ -31,38 +31,41 @@ module attributes {} {
   }
 }
 
-// CHECK: func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura"} {
-// CHECK-NEXT: %0 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT: %1 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT: %2 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %3 : i64 to ^bb1
-// CHECK-NEXT: ^bb1(%4: i64):  // 2 preds: ^bb0, ^bb5
-// CHECK-NEXT: %5 = "neura.cast"(%4) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT: %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: neura.cond_br %6 : i1 then to ^bb2 else to ^bb6
-// CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %7 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %7 : i64 to ^bb3
-// CHECK-NEXT: ^bb3(%8: i64):  // 2 preds: ^bb2, ^bb4
-// CHECK-NEXT: %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT: %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: neura.cond_br %10 : i1 then to ^bb4 else to ^bb5
-// CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %11 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x1x1x1x128xi8> : i8
-// CHECK-NEXT: neura.store_indexed %11 to %arg1[%2, %2, %5, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x128x1x1x128xi8> : i8
-// CHECK-NEXT: %12 = "neura.add"(%9, %0) : (index, index) -> index
-// CHECK-NEXT: %13 = "neura.cast"(%12) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %13 : i64 to ^bb3
-// CHECK-NEXT: ^bb5:  // pred: ^bb3
-// CHECK-NEXT: %14 = "neura.add"(%5, %0) : (index, index) -> index
-// CHECK-NEXT: %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %15 : i64 to ^bb1
-// CHECK-NEXT: ^bb6:  // pred: ^bb1
-// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura"} {
+// CHECK-NEXT:     %0 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT:     %1 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT:     %2 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT:     %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %3 : i64 to ^bb1
+// CHECK-NEXT:   ^bb1(%4: i64):  // 2 preds: ^bb0, ^bb5
+// CHECK-NEXT:     %5 = "neura.cast"(%4) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:     %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     neura.cond_br %6 : i1 then to ^bb2 else to ^bb6
+// CHECK-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-NEXT:     %7 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %7 : i64 to ^bb3
+// CHECK-NEXT:   ^bb3(%8: i64):  // 2 preds: ^bb2, ^bb4
+// CHECK-NEXT:     %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:     %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     neura.cond_br %10 : i1 then to ^bb4 else to ^bb5
+// CHECK-NEXT:   ^bb4:  // pred: ^bb3
+// CHECK-NEXT:     %11 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x1x1x1x128xi8> : i8
+// CHECK-NEXT:     neura.store_indexed %11 to %arg1[%2, %2, %5, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x128x1x1x128xi8> : i8
+// CHECK-NEXT:     %12 = "neura.add"(%9, %0) : (index, index) -> index
+// CHECK-NEXT:     %13 = "neura.cast"(%12) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %13 : i64 to ^bb3
+// CHECK-NEXT:   ^bb5:  // pred: ^bb3
+// CHECK-NEXT:     %14 = "neura.add"(%5, %0) : (index, index) -> index
+// CHECK-NEXT:     %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %15 : i64 to ^bb1
+// CHECK-NEXT:   ^bb6:  // pred: ^bb1
+// CHECK-NEXT:     "neura.return"() : () -> ()
+// CHECK-NEXT:   }
 // CHECK-NEXT: }
 
-// CTRL2DATA:        func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", dataflow_mode = "predicate"} {
+// CTRL2DATA:      module {
+// CTRL2DATA-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", dataflow_mode = "predicate"} {
 // CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<memref<?x1x1x1x1x128xi8>, i1>) -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
@@ -138,5 +141,8 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %47 -> %33 : !neura.data<memref<?x1x128x1x1x128xi8>, i1> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %48 -> %31 : !neura.data<i64, i1> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %49 -> %29 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
-// CTRL2DATA-NEXT:   }
\ No newline at end of file
+// CTRL2DATA-NEXT:     %61 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %62 = "neura.grant_once"(%61) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%62) : (!neura.data<i1, i1>) -> ()
+// CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT: }
\ No newline at end of file
diff --git a/test/affine2neura/bert/bert_node28/bert_node28.mlir b/test/affine2neura/bert/bert_node28/bert_node28.mlir
index bff2def5..c30cfb0a 100644
--- a/test/affine2neura/bert/bert_node28/bert_node28.mlir
+++ b/test/affine2neura/bert/bert_node28/bert_node28.mlir
@@ -36,53 +36,57 @@ module attributes {} {
     return
   }
 }
-// CHECK: func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
-// CHECK-NEXT: %0 = "neura.constant"() <{value = 768 : index}> : () -> index
-// CHECK-NEXT: %1 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT: %2 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT: %3 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT: %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %4 : i64 to ^bb1
-// CHECK-NEXT: ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb8
-// CHECK-NEXT: %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT: %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: neura.cond_br %7 : i1 then to ^bb2 else to ^bb9
-// CHECK-NEXT: ^bb2:  // pred: ^bb1
-// CHECK-NEXT: %8 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %8 : i64 to ^bb3
-// CHECK-NEXT: ^bb3(%9: i64):  // 2 preds: ^bb2, ^bb7
-// CHECK-NEXT: %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT: %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: neura.cond_br %11 : i1 then to ^bb4 else to ^bb8
-// CHECK-NEXT: ^bb4:  // pred: ^bb3
-// CHECK-NEXT: %12 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %12 : i64 to ^bb5
-// CHECK-NEXT: ^bb5(%13: i64):  // 2 preds: ^bb4, ^bb6
-// CHECK-NEXT: %14 = "neura.cast"(%13) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT: %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
-// CHECK-NEXT: neura.cond_br %15 : i1 then to ^bb6 else to ^bb7
-// CHECK-NEXT: ^bb6:  // pred: ^bb5
-// CHECK-NEXT: %16 = neura.load_indexed %arg0[%3, %6, %14 : index, index, index] memref<?x128x768xf32> : f32
-// CHECK-NEXT: %17 = neura.load_indexed %arg1[%3, %14, %10 : index, index, index] memref<?x768x768xf32> : f32
-// CHECK-NEXT: %18 = neura.load_indexed %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
-// CHECK-NEXT: %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32
-// CHECK-NEXT: %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32
-// CHECK-NEXT: neura.store_indexed %20 to %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
-// CHECK-NEXT: %21 = "neura.add"(%14, %1) : (index, index) -> index
-// CHECK-NEXT: %22 = "neura.cast"(%21) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %22 : i64 to ^bb5
-// CHECK-NEXT: ^bb7:  // pred: ^bb5
-// CHECK-NEXT: %23 = "neura.add"(%10, %1) : (index, index) -> index
-// CHECK-NEXT: %24 = "neura.cast"(%23) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %24 : i64 to ^bb3
-// CHECK-NEXT: ^bb8:  // pred: ^bb3
-// CHECK-NEXT: %25 = "neura.add"(%6, %1) : (index, index) -> index
-// CHECK-NEXT: %26 = "neura.cast"(%25) <{cast_type = "index_to_int"}> : (index) -> i64
-// CHECK-NEXT: neura.br %26 : i64 to ^bb1
-// CHECK-NEXT: ^bb9:  // pred: ^bb1
-// CHECK-NEXT: "neura.return"() : () -> ()
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura"} {
+// CHECK-NEXT:     %0 = "neura.constant"() <{value = 768 : index}> : () -> index
+// CHECK-NEXT:     %1 = "neura.constant"() <{value = 1 : index}> : () -> index
+// CHECK-NEXT:     %2 = "neura.constant"() <{value = 128 : index}> : () -> index
+// CHECK-NEXT:     %3 = "neura.constant"() <{value = 0 : index}> : () -> index
+// CHECK-NEXT:     %4 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %4 : i64 to ^bb1
+// CHECK-NEXT:   ^bb1(%5: i64):  // 2 preds: ^bb0, ^bb8
+// CHECK-NEXT:     %6 = "neura.cast"(%5) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:     %7 = "neura.icmp"(%6, %2) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     neura.cond_br %7 : i1 then to ^bb2 else to ^bb9
+// CHECK-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-NEXT:     %8 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %8 : i64 to ^bb3
+// CHECK-NEXT:   ^bb3(%9: i64):  // 2 preds: ^bb2, ^bb7
+// CHECK-NEXT:     %10 = "neura.cast"(%9) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:     %11 = "neura.icmp"(%10, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     neura.cond_br %11 : i1 then to ^bb4 else to ^bb8
+// CHECK-NEXT:   ^bb4:  // pred: ^bb3
+// CHECK-NEXT:     %12 = "neura.cast"(%3) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %12 : i64 to ^bb5
+// CHECK-NEXT:   ^bb5(%13: i64):  // 2 preds: ^bb4, ^bb6
+// CHECK-NEXT:     %14 = "neura.cast"(%13) <{cast_type = "int_to_index"}> : (i64) -> index
+// CHECK-NEXT:     %15 = "neura.icmp"(%14, %0) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     neura.cond_br %15 : i1 then to ^bb6 else to ^bb7
+// CHECK-NEXT:   ^bb6:  // pred: ^bb5
+// CHECK-NEXT:     %16 = neura.load_indexed %arg0[%3, %6, %14 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT:     %17 = neura.load_indexed %arg1[%3, %14, %10 : index, index, index] memref<?x768x768xf32> : f32
+// CHECK-NEXT:     %18 = neura.load_indexed %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT:     %19 = "neura.fmul"(%16, %17) : (f32, f32) -> f32
+// CHECK-NEXT:     %20 = "neura.fadd"(%18, %19) : (f32, f32) -> f32
+// CHECK-NEXT:     neura.store_indexed %20 to %arg2[%3, %6, %10 : index, index, index] memref<?x128x768xf32> : f32
+// CHECK-NEXT:     %21 = "neura.add"(%14, %1) : (index, index) -> index
+// CHECK-NEXT:     %22 = "neura.cast"(%21) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %22 : i64 to ^bb5
+// CHECK-NEXT:   ^bb7:  // pred: ^bb5
+// CHECK-NEXT:     %23 = "neura.add"(%10, %1) : (index, index) -> index
+// CHECK-NEXT:     %24 = "neura.cast"(%23) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %24 : i64 to ^bb3
+// CHECK-NEXT:   ^bb8:  // pred: ^bb3
+// CHECK-NEXT:     %25 = "neura.add"(%6, %1) : (index, index) -> index
+// CHECK-NEXT:     %26 = "neura.cast"(%25) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     neura.br %26 : i64 to ^bb1
+// CHECK-NEXT:   ^bb9:  // pred: ^bb1
+// CHECK-NEXT:     "neura.return"() : () -> ()
+// CHECK-NEXT:   }
+// CHECK-NEXT: }
 
-// CTRL2DATA:      func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura", dataflow_mode = "predicate"} {
+// CTRL2DATA:      module {
+// CTRL2DATA-NEXT:   func.func @_Z11bert_node28PA128_A768_KfPA768_S0_PA128_A768_f(%arg0: memref<?x128x768xf32>, %arg1: memref<?x768x768xf32>, %arg2: memref<?x128x768xf32>) attributes {accelerator = "neura", dataflow_mode = "predicate"} {
 // CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<memref<?x128x768xf32>, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<memref<?x128x768xf32>, i1>) -> !neura.data<memref<?x128x768xf32>, i1>
 // CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<memref<?x768x768xf32>, i1>
@@ -236,5 +240,8 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %104 -> %81 : !neura.data<memref<?x128x768xf32>, i1> !neura.data<memref<?x128x768xf32>, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %105 -> %79 : !neura.data<i64, i1> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %107 -> %77 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
+// CTRL2DATA-NEXT:     %125 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %126 = "neura.grant_once"(%125) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%126) : (!neura.data<i1, i1>) -> ()
 // CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT: }
diff --git a/test/controflow_fuse/complex_nested/complex_nested.mlir b/test/controflow_fuse/complex_nested/complex_nested.mlir
index e8ecf0ae..0af4b086 100644
--- a/test/controflow_fuse/complex_nested/complex_nested.mlir
+++ b/test/controflow_fuse/complex_nested/complex_nested.mlir
@@ -66,7 +66,8 @@ module attributes {} {
   }
 }
 
-// CHECK:   func.func @_Z14complex_nestedPA32_A32_iPS_(%arg0: memref<?x32x32xi32>, %arg1: memref<?x32xi32>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z14complex_nestedPA32_A32_iPS_(%arg0: memref<?x32x32xi32>, %arg1: memref<?x32xi32>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
 // CHECK-NEXT:     %0 = "neura.constant"() <{value = 1 : index}> : () -> index
 // CHECK-NEXT:     %1 = "neura.constant"() <{value = 32 : index}> : () -> index
 // CHECK-NEXT:     %2 = "neura.constant"() <{value = 128 : i32}> : () -> i32
@@ -175,8 +176,10 @@ module attributes {} {
 // CHECK-NEXT:   ^bb23:  // pred: ^bb1
 // CHECK-NEXT:     "neura.return"() : () -> ()
 // CHECK-NEXT:   }
+// CHECK-NEXT: }
 
-// CTRL2DATA:        func.func @_Z14complex_nestedPA32_A32_iPS_(%arg0: memref<?x32x32xi32>, %arg1: memref<?x32xi32>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
+// CTRL2DATA:      module {
+// CTRL2DATA-NEXT:   func.func @_Z14complex_nestedPA32_A32_iPS_(%arg0: memref<?x32x32xi32>, %arg1: memref<?x32xi32>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
 // CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<memref<?x32x32xi32>, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<memref<?x32x32xi32>, i1>) -> !neura.data<memref<?x32x32xi32>, i1>
 // CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<memref<?x32xi32>, i1>
@@ -668,5 +671,8 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %384 -> %354 : !neura.data<i32, i1> !neura.data<i32, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %385 -> %352 : !neura.data<i32, i1> !neura.data<i32, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %386 -> %350 : !neura.data<i32, i1> !neura.data<i32, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
+// CTRL2DATA-NEXT:     %404 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %405 = "neura.grant_once"(%404) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%405) : (!neura.data<i1, i1>) -> ()
 // CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT: }
diff --git a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir
index 760ae0ce..5bb764d6 100644
--- a/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir
+++ b/test/controflow_fuse/non_perfect_nested/non_perfect_nested.mlir
@@ -66,7 +66,8 @@ module attributes {} {
   }
 }
 
-// CHECK:   func.func @_Z29non_perfect_extra_computationPA128_iS0_(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi32>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z29non_perfect_extra_computationPA128_iS0_(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi32>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
 // CHECK-NEXT:     %0 = "neura.constant"() <{value = 4 : index}> : () -> index
 // CHECK-NEXT:     %1 = "neura.constant"() <{value = 3 : index}> : () -> index
 // CHECK-NEXT:     %2 = "neura.constant"() <{value = 2 : index}> : () -> index
@@ -137,9 +138,11 @@ module attributes {} {
 // CHECK-NEXT:   ^bb10:  // pred: ^bb1
 // CHECK-NEXT:     "neura.return"() : () -> ()
 // CHECK-NEXT:   }
+// CHECK-NEXT: }
 
 
-// CTRL2DATA:        func.func @_Z29non_perfect_extra_computationPA128_iS0_(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi32>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
+// CTRL2DATA:      module {
+// CTRL2DATA-NEXT:   func.func @_Z29non_perfect_extra_computationPA128_iS0_(%arg0: memref<?x128xi32>, %arg1: memref<?x128xi32>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
 // CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<memref<?x128xi32>, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<memref<?x128xi32>, i1>) -> !neura.data<memref<?x128xi32>, i1>
 // CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<memref<?x128xi32>, i1>
@@ -429,5 +432,8 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %146 -> %89 : !neura.data<i32, i1> !neura.data<i32, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %147 -> %87 : !neura.data<i32, i1> !neura.data<i32, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %148 -> %85 : !neura.data<i32, i1> !neura.data<i32, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
-// CTRL2DATA-NEXT:   }
\ No newline at end of file
+// CTRL2DATA-NEXT:     %246 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %247 = "neura.grant_once"(%246) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%247) : (!neura.data<i1, i1>) -> ()
+// CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT: }
\ No newline at end of file
diff --git a/test/controflow_fuse/perfect_nested/perfect_nested.mlir b/test/controflow_fuse/perfect_nested/perfect_nested.mlir
index 22c6f982..520f2f82 100644
--- a/test/controflow_fuse/perfect_nested/perfect_nested.mlir
+++ b/test/controflow_fuse/perfect_nested/perfect_nested.mlir
@@ -60,48 +60,51 @@ module attributes {} {
 }
 
 
-// CHECK:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
-// CHECK-NEXT:     %0 = "neura.constant"() <{value = 1 : index}> : () -> index
-// CHECK-NEXT:     %1 = "neura.constant"() <{value = 128 : index}> : () -> index
-// CHECK-NEXT:     %2 = "neura.constant"() <{value = 0 : index}> : () -> index
-// CHECK-NEXT:     %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
+// CHECK-NEXT:     %0 = "neura.constant"() <{value = 1 : index}> {{.*}}: () -> index
+// CHECK-NEXT:     %1 = "neura.constant"() <{value = 128 : index}> {{.*}}: () -> index
+// CHECK-NEXT:     %2 = "neura.constant"() <{value = 0 : index}> {{.*}}: () -> index
+// CHECK-NEXT:     %3 = "neura.cast"(%2) <{cast_type = "index_to_int"}> {{.*}}: (index) -> i64
 // CHECK-NEXT:     neura.br %3 : i64 to ^bb1
 // CHECK-NEXT:   ^bb1(%4: i64):  // 2 preds: ^bb0, ^bb5
-// CHECK-NEXT:     %5 = "neura.cast"(%4) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT:     %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     %5 = "neura.cast"(%4) <{cast_type = "int_to_index"}> {{.*}}: (i64) -> index
+// CHECK-NEXT:     %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> {{.*}}: (index, index) -> i1
 // CHECK-NEXT:     neura.cond_br %6 : i1 then to ^bb2 else to ^bb6
 // CHECK-NEXT:   ^bb2:  // pred: ^bb1
-// CHECK-NEXT:     %7 = "neura.cast"(%2) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     %7 = "neura.cast"(%2) <{cast_type = "index_to_int"}> {{.*}}: (index) -> i64
 // CHECK-NEXT:     neura.br %7 : i64 to ^bb3
 // CHECK-NEXT:   ^bb3(%8: i64):  // 2 preds: ^bb2, ^bb4
-// CHECK-NEXT:     %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> : (i64) -> index
-// CHECK-NEXT:     %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> : (index, index) -> i1
+// CHECK-NEXT:     %9 = "neura.cast"(%8) <{cast_type = "int_to_index"}> {{.*}}: (i64) -> index
+// CHECK-NEXT:     %10 = "neura.icmp"(%9, %1) <{cmpType = "slt"}> {{.*}}: (index, index) -> i1
 // CHECK-NEXT:     neura.cond_br %10 : i1 then to ^bb4 else to ^bb5
 // CHECK-NEXT:   ^bb4:  // pred: ^bb3
 // CHECK-NEXT:     %11 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x1x1x1x128xi8> : i8
 // CHECK-NEXT:     neura.store_indexed %11 to %arg1[%2, %2, %5, %2, %2, %9 : index, index, index, index, index, index] memref<?x1x128x1x1x128xi8> : i8
 // CHECK-NEXT:     %12 = "neura.add"(%9, %0) : (index, index) -> index
-// CHECK-NEXT:     %13 = "neura.cast"(%12) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     %13 = "neura.cast"(%12) <{cast_type = "index_to_int"}> {{.*}}: (index) -> i64
 // CHECK-NEXT:     neura.br %13 : i64 to ^bb3
 // CHECK-NEXT:   ^bb5:  // pred: ^bb3
 // CHECK-NEXT:     %14 = "neura.add"(%5, %0) : (index, index) -> index
-// CHECK-NEXT:     %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> : (index) -> i64
+// CHECK-NEXT:     %15 = "neura.cast"(%14) <{cast_type = "index_to_int"}> {{.*}}: (index) -> i64
 // CHECK-NEXT:     neura.br %15 : i64 to ^bb1
 // CHECK-NEXT:   ^bb6:  // pred: ^bb1
 // CHECK-NEXT:     "neura.return"() : () -> ()
 // CHECK-NEXT:   }
-// CAST:     func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
-// CAST-NEXT:     %0 = "neura.constant"() <{value = 1 : i64}> : () -> i64
-// CAST-NEXT:     %1 = "neura.constant"() <{value = 128 : i64}> : () -> i64
-// CAST-NEXT:     %2 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-NEXT: }
+// CAST:      module {
+// CAST-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", llvm.linkage = #llvm.linkage<external>} {
+// CAST-NEXT:     %0 = "neura.constant"() <{value = 1 : i64}> {{.*}}: () -> i64
+// CAST-NEXT:     %1 = "neura.constant"() <{value = 128 : i64}> {{.*}}: () -> i64
+// CAST-NEXT:     %2 = "neura.constant"() <{value = 0 : i64}> {{.*}}: () -> i64
 // CAST-NEXT:     neura.br %2 : i64 to ^bb1
 // CAST-NEXT:   ^bb1(%3: i64):  // 2 preds: ^bb0, ^bb5
-// CAST-NEXT:     %4 = "neura.icmp"(%3, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
+// CAST-NEXT:     %4 = "neura.icmp"(%3, %1) <{cmpType = "slt"}> {{.*}}: (i64, i64) -> i1
 // CAST-NEXT:     neura.cond_br %4 : i1 then to ^bb2 else to ^bb6
 // CAST-NEXT:   ^bb2:  // pred: ^bb1
 // CAST-NEXT:     neura.br %2 : i64 to ^bb3
 // CAST-NEXT:   ^bb3(%5: i64):  // 2 preds: ^bb2, ^bb4
-// CAST-NEXT:     %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> : (i64, i64) -> i1
+// CAST-NEXT:     %6 = "neura.icmp"(%5, %1) <{cmpType = "slt"}> {{.*}}: (i64, i64) -> i1
 // CAST-NEXT:     neura.cond_br %6 : i1 then to ^bb4 else to ^bb5
 // CAST-NEXT:   ^bb4:  // pred: ^bb3
 // CAST-NEXT:     %7 = neura.load_indexed %arg0[%2, %2, %2, %2, %2, %5 : i64, i64, i64, i64, i64, i64] memref<?x1x1x1x1x128xi8> : i8
@@ -114,18 +117,20 @@ module attributes {} {
 // CAST-NEXT:   ^bb6:  // pred: ^bb1
 // CAST-NEXT:     "neura.return"() : () -> ()
 // CAST-NEXT:   }
+// CAST-NEXT: }
 
 
-// CTRL2DATA:        func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
-// CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
+// CTRL2DATA:      module {
+// CTRL2DATA-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>} {
+// CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> {{.*}}: () -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<memref<?x1x1x1x1x128xi8>, i1>) -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
-// CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
+// CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> {{.*}}: () -> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     %3 = "neura.grant_once"(%2) : (!neura.data<memref<?x1x128x1x1x128xi8>, i1>) -> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
-// CTRL2DATA-NEXT:     %4 = "neura.constant"() <{value = 1 : i64}> : () -> !neura.data<i64, i1>
+// CTRL2DATA-NEXT:     %4 = "neura.constant"() <{value = 1 : i64}> {{.*}}: () -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %5 = "neura.grant_once"(%4) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     %6 = "neura.constant"() <{value = 128 : i64}> : () -> !neura.data<i64, i1>
+// CTRL2DATA-NEXT:     %6 = "neura.constant"() <{value = 128 : i64}> {{.*}}: () -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %7 = "neura.grant_once"(%6) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     %8 = "neura.constant"() <{value = 0 : i64}> : () -> !neura.data<i64, i1>
+// CTRL2DATA-NEXT:     %8 = "neura.constant"() <{value = 0 : i64}> {{.*}}: () -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %9 = "neura.grant_once"(%8) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %10 = neura.reserve : !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %11 = neura.phi_start %5, %10 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
@@ -139,7 +144,7 @@ module attributes {} {
 // CTRL2DATA-NEXT:     %19 = neura.phi_start %7, %18 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %20 = neura.reserve : !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %21 = neura.phi_start %9, %20 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     %22 = "neura.icmp"(%21, %19) <{cmpType = "slt"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %22 = "neura.icmp"(%21, %19) <{cmpType = "slt"}> {{.*}}: (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
 // CTRL2DATA-NEXT:     %23 = neura.grant_predicate %17, %22 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %24 = neura.grant_predicate %19, %22 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %25 = neura.grant_predicate %15, %22 : !neura.data<memref<?x1x1x1x1x128xi8>, i1>, !neura.data<i1, i1> -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
@@ -160,7 +165,7 @@ module attributes {} {
 // CTRL2DATA-NEXT:     %40 = neura.phi_start %24, %39 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %41 = neura.reserve : !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %42 = neura.phi_start %23, %41 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     %43 = "neura.icmp"(%42, %40) <{cmpType = "slt"}> : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %43 = "neura.icmp"(%42, %40) <{cmpType = "slt"}> {{.*}}: (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<i1, i1>
 // CTRL2DATA-NEXT:     %44 = neura.grant_predicate %38, %43 : !neura.data<memref<?x1x1x1x1x128xi8>, i1>, !neura.data<i1, i1> -> !neura.data<memref<?x1x1x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     %45 = neura.grant_predicate %36, %43 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %46 = neura.grant_predicate %42, %43 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
@@ -192,7 +197,83 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %47 -> %33 : !neura.data<memref<?x1x128x1x1x128xi8>, i1> !neura.data<memref<?x1x128x1x1x128xi8>, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %48 -> %31 : !neura.data<i64, i1> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %49 -> %29 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
+// CTRL2DATA-NEXT:     %61 = "neura.constant"() <{value = true}> {{.*}}: () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %62 = "neura.grant_once"(%61) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%62) : (!neura.data<i1, i1>) -> ()
 // CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT: }
 
-// MAPPING:      func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 10 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 8 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} {
\ No newline at end of file
+// MAPPING:      module {
+// MAPPING-NEXT:   func.func @_Z10bert_node1PA1_A1_A1_A1_A128_bPA1_A128_S1_(%arg0: memref<?x1x1x1x1x128xi8>, %arg1: memref<?x1x128x1x1x128xi8>) attributes {{.*}} {
+// MAPPING-NEXT:     %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {{.*}}: () -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %1 = neura.reserve {{.*}}: !neura.data<i64, i1>
+// MAPPING-NEXT:     %2 = "neura.data_mov"(%0) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %3 = neura.phi_start %2, %1 {{.*}}: !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %4 = neura.reserve {{.*}}: !neura.data<i64, i1>
+// MAPPING-NEXT:     %5 = "neura.data_mov"(%0) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %6 = neura.phi_start %5, %4 {{.*}}: !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %7 = "neura.data_mov"(%6) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %8 = "neura.icmp"(%7) <{cmpType = "slt"}> {{.*}}rhs_value = 128 : i64{{.*}}: (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %9 = "neura.data_mov"(%3) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %10 = "neura.data_mov"(%8) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %11 = neura.grant_predicate %9, %10 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %12 = "neura.data_mov"(%6) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %13 = "neura.data_mov"(%8) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %14 = neura.grant_predicate %12, %13 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %15 = neura.reserve {{.*}}: !neura.data<i64, i1>
+// MAPPING-NEXT:     %16 = "neura.data_mov"(%14) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %17 = neura.phi_start %16, %15 {{.*}}: !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %18 = neura.reserve {{.*}}: !neura.data<i64, i1>
+// MAPPING-NEXT:     %19 = "neura.data_mov"(%11) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %20 = neura.phi_start %19, %18 {{.*}}: !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %21 = neura.reserve {{.*}}: !neura.data<i64, i1>
+// MAPPING-NEXT:     %22 = "neura.data_mov"(%11) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %23 = neura.phi_start %22, %21 {{.*}}: !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %24 = "neura.data_mov"(%23) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %25 = "neura.icmp"(%24) <{cmpType = "slt"}> {{.*}}rhs_value = 128 : i64{{.*}}: (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %26 = "neura.data_mov"(%20) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %27 = "neura.data_mov"(%25) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %28 = neura.grant_predicate %26, %27 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %29 = "neura.data_mov"(%23) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %30 = "neura.data_mov"(%25) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %31 = neura.grant_predicate %29, %30 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %32 = "neura.data_mov"(%17) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %33 = "neura.data_mov"(%25) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %34 = neura.grant_predicate %32, %33 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %35 = "neura.data_mov"(%25) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %36 = "neura.not"(%35) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %37 = "neura.data_mov"(%17) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %38 = "neura.data_mov"(%36) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %39 = neura.grant_predicate %37, %38 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %40 = "neura.data_mov"(%20) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %41 = "neura.data_mov"(%36) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %42 = neura.grant_predicate %40, %41 {{.*}}: !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %43 = "neura.data_mov"(%39) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %44 = "neura.add"(%43) {{.*}}rhs_value = 1 : i64{{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %44 -> %4 {{.*}}: !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %42 -> %1 {{.*}}: !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     %45 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %46 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %47 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %48 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %49 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %50 = "neura.data_mov"(%31) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %51 = neura.load_indexed [%45, %46, %47, %48, %49, %50 : !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>]  {{.*}}lhs_value = "%arg0"{{.*}}: !neura.data<i8, i1>
+// MAPPING-NEXT:     %52 = "neura.data_mov"(%51) {{.*}}: (!neura.data<i8, i1>) -> !neura.data<i8, i1>
+// MAPPING-NEXT:     %53 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %54 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %55 = "neura.data_mov"(%34) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %56 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %57 = "neura.data_mov"(%28) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %58 = "neura.data_mov"(%31) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.store_indexed %52 to [%53, %54, %55, %56, %57, %58 : !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>] {{.*}}rhs_value = "%arg1"{{.*}}: !neura.data<i8, i1>
+// MAPPING-NEXT:     %59 = "neura.data_mov"(%31) {{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %60 = "neura.add"(%59) {{.*}}rhs_value = 1 : i64{{.*}}: (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %60 -> %21 {{.*}}: !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %28 -> %18 {{.*}}: !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %34 -> %15 {{.*}}: !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     %61 = "neura.grant_once"() <{constant_value = true}> {{.*}}: () -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %62 = "neura.data_mov"(%61) {{.*}}: (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     "neura.return"(%62) {{.*}}: (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT:   }
+// MAPPING-NEXT: }
\ No newline at end of file
diff --git a/test/controflow_fuse/simple_loop/simple_loop.mlir b/test/controflow_fuse/simple_loop/simple_loop.mlir
index 7d191587..9ec56e8b 100644
--- a/test/controflow_fuse/simple_loop/simple_loop.mlir
+++ b/test/controflow_fuse/simple_loop/simple_loop.mlir
@@ -182,7 +182,9 @@ module attributes {} {
 // CTRL2DATA-NEXT:     neura.ctrl_mov %32 -> %18 : !neura.data<i32, i1> !neura.data<i32, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %33 -> %16 : !neura.data<memref<?xi32>, i1> !neura.data<memref<?xi32>, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %34 -> %14 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
+// CTRL2DATA-NEXT:     %40 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %41 = "neura.grant_once"(%40) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%41) : (!neura.data<i1, i1>) -> ()
 // CTRL2DATA-NEXT:   }
 
 
@@ -193,7 +195,8 @@ module attributes {} {
 // FUSE-NEXT:     %2 = "neura.mul"(%1) {rhs_value = 2 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
 // FUSE-NEXT:     %3 = "neura.add"(%2) {rhs_value = 1 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
 // FUSE-NEXT:     neura.store_indexed %3 to [%nextindex : !neura.data<i64, i1>]  {rhs_value = "%arg1"} : !neura.data<i32, i1>
-// FUSE-NEXT:     "neura.return"() : () -> ()
+// FUSE-NEXT:     %4 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// FUSE-NEXT:     "neura.return"(%4) : (!neura.data<i1, i1>) -> ()
 // FUSE-NEXT:   }
 
 // FUSE-MAPPING:  func.func @_Z11simple_loopPiS_(%arg0: memref<?xi32>, %arg1: memref<?xi32>) attributes {accelerator = "neura", dataflow_mode = "predicate", llvm.linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 1 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 1 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}} {
\ No newline at end of file
diff --git a/test/e2e/bicg/bicg_kernel.mlir b/test/e2e/bicg/bicg_kernel.mlir
index fe2b0f9b..cb3556c4 100644
--- a/test/e2e/bicg/bicg_kernel.mlir
+++ b/test/e2e/bicg/bicg_kernel.mlir
@@ -189,7 +189,7 @@
 // YAML:             - index_per_ii: 1
 // YAML:               operations:
 // YAML:                 - opcode: "ICMP_SGT"
-// YAML:                   id: 24
+// YAML:                   id: 25
 // YAML:                   time_step: 1
 // YAML:                   invalid_iterations: 0
 // YAML:                   src_operands:
@@ -213,7 +213,59 @@
 // YAML:                     - operand: "$1"
 // YAML:                       color: "RED"
 // YAML:                     - operand: "$3"
-
+// YAML:                       color: "RED"
+// YAML:             - index_per_ii: 3
+// YAML:               operations:
+// YAML:                 - opcode: "GRANT_ONCE"
+// YAML:                   id: 28
+// YAML:                   time_step: 3
+// YAML:                   invalid_iterations: 0
+// YAML:                   src_operands:
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                   dst_operands:
+// YAML:                     - operand: "EAST"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "NORTH"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "$2"
+// YAML:                       color: "RED"
+// YAML:             - index_per_ii: 4
+// YAML:               operations:
+// YAML:                 - opcode: "GRANT_PREDICATE"
+// YAML:                   id: 37
+// YAML:                   time_step: 4
+// YAML:                   invalid_iterations: 0
+// YAML:                   src_operands:
+// YAML:                     - operand: "$1"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                   dst_operands:
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "$1"
+// YAML:                       color: "RED"
+// YAML:             - index_per_ii: 5
+// YAML:               operations:
+// YAML:                 - opcode: "ICMP_SGT"
+// YAML:                   id: 54
+// YAML:                   time_step: 5
+// YAML:                   invalid_iterations: 0
+// YAML:                   src_operands:
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "#0"
+// YAML:                       color: "RED"
+// YAML:                   dst_operands:
+// YAML:                     - operand: "$0"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "EAST"
+// YAML:                       color: "RED"
+// YAML:                     - operand: "NORTH"
+// YAML:                       color: "RED"
 
 // ASM:      # Compiled II: 11
 // ASM:      PE(0,0):
@@ -236,9 +288,6 @@
 // ASM-NEXT:   ICMP_SGT, [$0], [#0] -> [$0], [EAST, RED], [NORTH, RED] (t=5, inv_iters=0)
 // ASM-NEXT: } (idx_per_ii=5)
 // ASM-NEXT: {
-// ASM-NEXT:   GRANT_PREDICATE, [NORTH, RED], [EAST, RED] -> [NORTH, RED] (t=17, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=6)
-// ASM-NEXT: {
 // ASM-NEXT:   GRANT_PREDICATE, [$1], [$0] -> [EAST, RED] (t=7, inv_iters=0)
 // ASM-NEXT: } (idx_per_ii=7)
 // ASM-NEXT: {
@@ -269,11 +318,11 @@
 // ASM-NEXT: {
 // ASM-NEXT:   DATA_MOV, [WEST, RED] -> [NORTH, RED] (t=4, inv_iters=0)
 // ASM-NEXT:   GRANT_PREDICATE, [$4], [$2] -> [$1] (t=15, inv_iters=1)
+// ASM-NEXT:   DATA_MOV, [EAST, RED] -> [$0] (t=15, inv_iters=1)
 // ASM-NEXT: } (idx_per_ii=4)
 // ASM-NEXT: {
 // ASM-NEXT:   DATA_MOV, [WEST, RED] -> [EAST, RED] (t=5, inv_iters=0)
-// ASM-NEXT:   GRANT_PREDICATE, [$5], [NORTH, RED] -> [$2] (t=16, inv_iters=1)
-// ASM-NEXT:   DATA_MOV, [EAST, RED] -> [WEST, RED] (t=16, inv_iters=1)
+// ASM-NEXT:   GRANT_PREDICATE, [$5], [$0] -> [$2] (t=16, inv_iters=1)
 // ASM-NEXT: } (idx_per_ii=5)
 // ASM-NEXT: {
 // ASM-NEXT:   DATA_MOV, [WEST, RED] -> [NORTH, RED] (t=6, inv_iters=0)
@@ -291,7 +340,6 @@
 // ASM-NEXT:   DATA_MOV, [WEST, RED] -> [$1] (t=10, inv_iters=0)
 // ASM-NEXT: } (idx_per_ii=10)
 
-
 // RUN: mlir-neura-opt %t-kernel.mlir \
 // RUN:   --assign-accelerator \
 // RUN:   --lower-llvm-to-neura \
@@ -306,4 +354,4 @@
 // RUN: dot -Tjson bicg_kernel.dot -o bicg_kernel.json
 // RUN: FileCheck %s --input-file=bicg_kernel.dot -check-prefix=DOT
 
-// DOT: digraph G {
+// DOT: digraph G {
\ No newline at end of file
diff --git a/test/e2e/histogram/histogram_kernel.mlir b/test/e2e/histogram/histogram_kernel.mlir
index a381e8e0..35975374 100644
--- a/test/e2e/histogram/histogram_kernel.mlir
+++ b/test/e2e/histogram/histogram_kernel.mlir
@@ -23,129 +23,88 @@
 // RUN: FileCheck %s --input-file=tmp-generated-instructions.asm --check-prefix=ASM
 
 
-// MAPPING: func.func
-// MAPPING-SAME: compiled_ii = 5
-// MAPPING-SAME: mapping_mode = "spatial-temporal"
-// MAPPING-SAME: mapping_strategy = "heuristic"
-// MAPPING-SAME: rec_mii = 5
-// MAPPING-SAME: res_mii = 2
-// MAPPING-SAME: x_tiles = 4
-// MAPPING-SAME: y_tiles = 4
-// MAPPING-NEXT:    %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i64, i1>
-// MAPPING-NEXT:    %2 = "neura.data_mov"(%0) {dfg_id = 3 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %3 = neura.phi_start %2, %1 {dfg_id = 4 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %4 = "neura.data_mov"(%3) {dfg_id = 6 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 8 : i32, lhs_value = "%arg0", mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:    %6 = "neura.data_mov"(%5) {dfg_id = 11 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:    %7 = "neura.load"(%6) {dfg_id = 13 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %8 = "neura.data_mov"(%7) {dfg_id = 15 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %9 = "neura.mul"(%8) {dfg_id = 17 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 5 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %10 = "neura.data_mov"(%9) {dfg_id = 19 : i32, mapping_locs = [{id = 37 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %11 = "neura.add"(%10) {dfg_id = 21 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 3 : i32}], rhs_value = -5 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %12 = "neura.data_mov"(%11) {dfg_id = 23 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %13 = "neura.div"(%12) {dfg_id = 24 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 3 : i32}], rhs_value = 18 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %14 = "neura.data_mov"(%13) {dfg_id = 25 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %15 = neura.sext %14 {dfg_id = 26 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 3 : i32}]} : !neura.data<i32, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %16 = "neura.data_mov"(%15) {dfg_id = 27 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 28 : i32, lhs_value = "%arg1", mapping_locs = [{id = 15 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:    %18 = "neura.data_mov"(%17) {dfg_id = 30 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:    %19 = "neura.load"(%18) {dfg_id = 31 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %20 = "neura.data_mov"(%19) {dfg_id = 32 : i32, mapping_locs = [{id = 46 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %21 = "neura.add"(%20) {dfg_id = 33 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 3 : i32}], rhs_value = 1 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %22 = "neura.data_mov"(%21) {dfg_id = 34 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:    %23 = "neura.data_mov"(%17) {dfg_id = 29 : i32, mapping_locs = [{id = 46 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 449 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 449 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:    "neura.store"(%22, %23) {dfg_id = 35 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT:    %24 = "neura.data_mov"(%3) {dfg_id = 5 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %25 = "neura.add"(%24) {dfg_id = 7 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %26 = "neura.data_mov"(%25) {dfg_id = 10 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %27 = "neura.icmp"(%26) <{cmpType = "eq"}> {dfg_id = 12 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 20 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:    %28 = "neura.data_mov"(%27) {dfg_id = 14 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:    %29 = "neura.not"(%28) {dfg_id = 16 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:    %30 = "neura.data_mov"(%25) {dfg_id = 9 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:    %31 = "neura.data_mov"(%29) {dfg_id = 18 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:    %32 = neura.grant_predicate %30, %31 {dfg_id = 20 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:    neura.ctrl_mov %32 -> %1 {dfg_id = 22 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT:    "neura.return"() {dfg_id = 2 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 2 : i32}]} : () -> ()
+// MAPPING:      module attributes {{.*}}
+// MAPPING-NEXT:   func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
+// MAPPING-NEXT:     %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i64, i1>
+// MAPPING-NEXT:     %2 = "neura.data_mov"(%0) {dfg_id = 3 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %3 = neura.phi_start %2, %1 {dfg_id = 5 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %4 = "neura.data_mov"(%3) {dfg_id = 8 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 10 : i32, lhs_value = "%arg0", mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %6 = "neura.data_mov"(%5) {dfg_id = 13 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %7 = "neura.load"(%6) {dfg_id = 15 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %8 = "neura.data_mov"(%7) {dfg_id = 17 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %9 = "neura.mul"(%8) {dfg_id = 19 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 5 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %10 = "neura.data_mov"(%9) {dfg_id = 21 : i32, mapping_locs = [{id = 37 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %11 = "neura.add"(%10) {dfg_id = 23 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 3 : i32}], rhs_value = -5 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %12 = "neura.data_mov"(%11) {dfg_id = 25 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %13 = "neura.div"(%12) {dfg_id = 26 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 3 : i32}], rhs_value = 18 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %14 = "neura.data_mov"(%13) {dfg_id = 27 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %15 = neura.sext %14 {dfg_id = 28 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 3 : i32}]} : !neura.data<i32, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %16 = "neura.data_mov"(%15) {dfg_id = 29 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 30 : i32, lhs_value = "%arg1", mapping_locs = [{id = 15 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %18 = "neura.data_mov"(%17) {dfg_id = 32 : i32, mapping_locs = [{id = 480 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %19 = "neura.load"(%18) {dfg_id = 33 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 3 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %20 = "neura.data_mov"(%19) {dfg_id = 34 : i32, mapping_locs = [{id = 46 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %21 = "neura.add"(%20) {dfg_id = 35 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 3 : i32}], rhs_value = 1 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %22 = "neura.data_mov"(%21) {dfg_id = 36 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %23 = "neura.data_mov"(%17) {dfg_id = 31 : i32, mapping_locs = [{id = 46 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 449 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 449 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     "neura.store"(%22, %23) {dfg_id = 37 : i32, mapping_locs = [{id = 14 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT:     %24 = "neura.data_mov"(%3) {dfg_id = 7 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %25 = "neura.add"(%24) {dfg_id = 9 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %26 = "neura.data_mov"(%25) {dfg_id = 12 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %27 = "neura.icmp"(%26) <{cmpType = "eq"}> {dfg_id = 14 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 20 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %28 = "neura.data_mov"(%27) {dfg_id = 16 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %29 = "neura.not"(%28) {dfg_id = 18 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %30 = "neura.data_mov"(%25) {dfg_id = 11 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %31 = "neura.data_mov"(%29) {dfg_id = 20 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %32 = neura.grant_predicate %30, %31 {dfg_id = 22 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %32 -> %1 {dfg_id = 24 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     %33 = "neura.grant_once"() <{constant_value = true}> {dfg_id = 2 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 0 : i32}]} : () -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %34 = "neura.data_mov"(%33) {dfg_id = 4 : i32, mapping_locs = [{id = 64 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     "neura.return"(%34) {dfg_id = 6 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 0 : i32}]} : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT:   }
+// MAPPING-NEXT: }
 
-// YAML:     array_config:
-// YAML-NEXT:  columns: 4
-// YAML-NEXT:  rows: 4
-// YAML-NEXT:  compiled_ii: 5
-// YAML-NEXT:  cores:
-// YAML-NEXT:    - column: 2
-// YAML-NEXT:      row: 2
-// YAML-NEXT:      core_id: "10"
-// YAML-NEXT:      entries:
-// YAML-NEXT:        - entry_id: "entry0"
-// YAML-NEXT:          instructions:
-// YAML-NEXT:            - index_per_ii: 0
-// YAML-NEXT:              operations:
-// YAML-NEXT:                - opcode: "GRANT_PREDICATE"
-// YAML-NEXT:                  id: 20
-// YAML-NEXT:                  time_step: 5
-// YAML-NEXT:                  invalid_iterations: 1
-// YAML-NEXT:                  src_operands:
-// YAML-NEXT:                    - operand: "$0"
-// YAML-NEXT:                      color: "RED"
-// YAML-NEXT:                    - operand: "$1"
-// YAML-NEXT:                      color: "RED"
-// YAML-NEXT:                  dst_operands:
-// YAML-NEXT:                    - operand: "EAST"
-// YAML-NEXT:                      color: "RED"
-// YAML-NEXT:            - index_per_ii: 1
-// YAML-NEXT:              operations:
-// YAML-NEXT:                - opcode: "RETURN"
-// YAML-NEXT:                  id: 2
-// YAML-NEXT:                  time_step: 11
-// YAML-NEXT:                  invalid_iterations: 2
-// YAML-NEXT:            - index_per_ii: 2
-// YAML-NEXT:              operations:
-// YAML-NEXT:                - opcode: "GEP"
-// YAML-NEXT:                  id: 8
-// YAML-NEXT:                  time_step: 2
-// YAML-NEXT:                  invalid_iterations: 0
-// YAML-NEXT:                  src_operands:
-// YAML-NEXT:                    - operand: "EAST"
-// YAML-NEXT:                      color: "RED"
-// YAML-NEXT:                  dst_operands:
-// YAML-NEXT:                    - operand: "$0"
-// YAML-NEXT:                      color: "RED"
+// YAML:      array_config:
+// YAML-NEXT:   columns: 4
+// YAML-NEXT:   rows: 4
+// YAML-NEXT:   compiled_ii: 5
+// YAML-NEXT:   cores:
+// YAML-NEXT:     - column: 2
+// YAML-NEXT:       row: 0
+// YAML-NEXT:       core_id: "2"
+// YAML-NEXT:       entries:
+// YAML-NEXT:         - entry_id: "entry0"
+// YAML-NEXT:           instructions:
+// YAML-NEXT:             - index_per_ii: 0
+// YAML-NEXT:               operations:
+// YAML-NEXT:                 - opcode: "GRANT_ONCE"
+// YAML-NEXT:                   id: 2
+// YAML-NEXT:                   time_step: 10
+// YAML-NEXT:                   invalid_iterations: 2
+// YAML-NEXT:                   src_operands:
+// YAML-NEXT:                     - operand: "#-1"
+// YAML-NEXT:                       color: "RED"
 
 // ASM:      # Compiled II: 5
-// ASM:      PE(2,2):
+// ASM:      PE(2,0):
 // ASM-NEXT: {
-// ASM-NEXT:   GRANT_PREDICATE, [$0], [$1] -> [EAST, RED] (t=5, inv_iters=1)
+// ASM-NEXT:   GRANT_ONCE, [#-1] -> [$0] (t=10, inv_iters=2)
 // ASM-NEXT: } (idx_per_ii=0)
 // ASM-NEXT: {
-// ASM-NEXT:   RETURN (t=11, inv_iters=2)
+// ASM-NEXT:   RETURN, [$0] (t=11, inv_iters=2)
 // ASM-NEXT: } (idx_per_ii=1)
+// ASM:      PE(2,2):
+// ASM-NEXT: {
+// ASM-NEXT:   GRANT_PREDICATE, [$0], [$1] -> [EAST, RED] (t=5, inv_iters=1)
+// ASM-NEXT: } (idx_per_ii=0)
 // ASM-NEXT: {
 // ASM-NEXT:   GEP, [EAST, RED] -> [$0] (t=2, inv_iters=0)
 // ASM-NEXT: } (idx_per_ii=2)
 // ASM-NEXT: {
 // ASM-NEXT:   LOAD, [$0] -> [EAST, RED] (t=3, inv_iters=0)
 // ASM-NEXT:   DATA_MOV, [EAST, RED] -> [$0] (t=3, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=3)
-// ASM-NEXT: {
-// ASM-NEXT:   NOT, [EAST, RED] -> [$1] (t=4, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=4)
-// ASM:      PE(3,2):
-// ASM-NEXT: {
-// ASM-NEXT:   GRANT_ONCE, [#0] -> [$0] (t=0, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=0)
-// ASM-NEXT: {
-// ASM-NEXT:   PHI_START, [$0], [WEST, RED] -> [WEST, RED], [$0] (t=1, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=1)
-// ASM-NEXT: {
-// ASM-NEXT:   ADD, [$0], [#1] -> [$0], [WEST, RED] (t=2, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=2)
-// ASM-NEXT: {
-// ASM-NEXT:   ICMP_EQ, [$0], [#20] -> [WEST, RED] (t=3, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=3)
-// ASM-NEXT: {
-// ASM-NEXT:   MUL, [WEST, RED], [#5] -> [NORTH, RED] (t=4, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=4)
 
 // RUN: mlir-neura-opt %t-kernel.mlir \
 // RUN:   --assign-accelerator \
@@ -161,4 +120,23 @@
 // RUN: dot -Tjson histogram_kernel.dot -o histogram_kernel.json
 // RUN: FileCheck %s --input-file=histogram_kernel.dot -check-prefix=DOT
 
-// DOT: digraph G {
+// DOT:      digraph G {
+// DOT-NEXT:   compound = true;
+// DOT-NEXT:   subgraph cluster_1 {
+// DOT-NEXT:     v2 [label = " ", shape = plain];
+// DOT-NEXT: label = "builtin.module {{.*}}";
+// DOT-NEXT:     subgraph cluster_3 {
+// DOT-NEXT:       v4 [label = " ", shape = plain];
+// DOT-NEXT:       label = "";
+// DOT-NEXT:       subgraph cluster_5 {
+// DOT-NEXT:         v6 [label = " ", shape = plain];
+// DOT-NEXT:         label = "func.func : ()\n\nCConv: #llvm.cconv<ccc>\naccelerator: \"neura\"\narg_attrs: [{llvm.nocapture, ll...\ndataflow_mode: \"predicate\"\nfunction_type: (!llvm.ptr, !llvm.pt...\nlinkage: #llvm.linkage<extern...\nmemory_effects: #llvm.memory_effects...\nno_unwind: unit\npassthrough: [\"mustprogress\", \"no...\nsym_name: \"_Z6kernelPiS_\"\ntarget_cpu: \"x86-64\"\ntarget_features: #llvm.target_feature...\ntune_cpu: \"generic\"\nunnamed_addr: 1 : i64\nvisibility_: 0 : i64";
+// DOT-NEXT:         subgraph cluster_7 {
+// DOT-NEXT:           v8 [label = " ", shape = plain];
+// DOT-NEXT:           label = "";
+// DOT-NEXT:           v9 [label = "arg0", shape = ellipse];
+// DOT-NEXT:           v10 [label = "arg1", shape = ellipse];
+// DOT-NEXT:           v11 [fillcolor = "0.000000 1.0 1.0", label = "neura.grant_once : (!neura.data<i64, i1>)\n\nconstant_value: 0 : i64", shape = ellipse, style = filled];
+// DOT-NEXT:           v12 [fillcolor = "0.058824 1.0 1.0", label = "neura.reserve : (!neura.data<i64, i1>)\n", shape = ellipse, style = filled];
+// DOT-NEXT:           v13 [fillcolor = "0.117647 1.0 1.0", label = "neura.phi_start : (!neura.data<i64, i1>)\n", shape = ellipse, style = filled];
+// DOT-NEXT:           v14 [fillcolor = "0.176471 1.0 1.0", label = "neura.gep : (!neura.data<!llvm.pt...)\n\nlhs_value: \"%arg0\"\noperandSegmentSizes: array<i32: 0, 1>", shape = ellipse, style = filled];
diff --git a/test/e2e/relu/relu_kernel.mlir b/test/e2e/relu/relu_kernel.mlir
index 45d2248b..f8e391f3 100644
--- a/test/e2e/relu/relu_kernel.mlir
+++ b/test/e2e/relu/relu_kernel.mlir
@@ -30,56 +30,60 @@
 //
 // Check the mapped MLIR contains proper structure and neura operations.
 // RUN: FileCheck %s --input-file=%t-mapping.mlir -check-prefix=MAPPING
-// MAPPING:      func.func @kernel(%arg0: i32 {llvm.noundef}, %arg1: i32 {llvm.noundef}, %arg2: i32 {llvm.noundef}, %arg3: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.writeonly}, %arg4: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg5: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readnone}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
-// MAPPING-NEXT:   %0 = "neura.grant_once"() <{constant_value = 0 : i32}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i32, i1>
-// MAPPING-NEXT:   %2 = "neura.data_mov"(%0) {dfg_id = 4 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 31 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 288 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 288 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 {dfg_id = 6 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i32, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %4 = neura.reserve {dfg_id = 2 : i32} : !neura.data<i32, i1>
-// MAPPING-NEXT:   %5 = "neura.data_mov"(%0) {dfg_id = 5 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %6 = neura.phi_start %5, %4 {dfg_id = 7 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i32, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %7 = "neura.data_mov"(%6) {dfg_id = 11 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %8 = "neura.cast"(%7) <{cast_type = "trunc"}> {dfg_id = 13 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %9 = "neura.data_mov"(%8) {dfg_id = 17 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 321 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %10 = "neura.div"(%9) {dfg_id = 20 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 70 : i16} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %11 = "neura.data_mov"(%8) {dfg_id = 16 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %12 = "neura.rem"(%11) {dfg_id = 19 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 70 : i16} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %13 = "neura.data_mov"(%10) {dfg_id = 23 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}, {id = 321 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %14 = neura.zext %13 {dfg_id = 26 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i16, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %15 = "neura.data_mov"(%12) {dfg_id = 22 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
-// MAPPING-NEXT:   %16 = neura.zext %15 {dfg_id = 25 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i16, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %17 = "neura.data_mov"(%14) {dfg_id = 32 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %18 = "neura.data_mov"(%16) {dfg_id = 30 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 192 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %19 = "neura.gep"(%17, %18) <{operandSegmentSizes = array<i32: 0, 2>}> {dfg_id = 36 : i32, lhs_value = "%arg4", mapping_locs = [{id = 6 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %20 = "neura.data_mov"(%19) {dfg_id = 40 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %21 = "neura.load"(%20) {dfg_id = 41 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %22 = "neura.data_mov"(%21) {dfg_id = 43 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %23 = "neura.icmp"(%22) <{cmpType = "sge"}> {dfg_id = 44 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 1 : i32}], rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %24 = "neura.data_mov"(%23) {dfg_id = 45 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %25 = "neura.data_mov"(%21) {dfg_id = 42 : i32, mapping_locs = [{id = 193 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}, {id = 193 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %26 = "neura.data_mov"(%3) {dfg_id = 9 : i32, mapping_locs = [{id = 28 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 33 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 194 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %27 = "neura.sel"(%24, %25, %26) {dfg_id = 46 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i1, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %28 = "neura.data_mov"(%14) {dfg_id = 31 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 448 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %29 = "neura.data_mov"(%16) {dfg_id = 29 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 449 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}, {id = 449 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %30 = "neura.gep"(%28, %29) <{operandSegmentSizes = array<i32: 0, 2>}> {dfg_id = 35 : i32, lhs_value = "%arg3", mapping_locs = [{id = 14 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %31 = "neura.data_mov"(%27) {dfg_id = 47 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %32 = "neura.data_mov"(%30) {dfg_id = 39 : i32, mapping_locs = [{id = 45 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 33 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}, {id = 193 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   "neura.store"(%31, %32) {dfg_id = 48 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT:   %33 = "neura.data_mov"(%6) {dfg_id = 10 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %34 = "neura.add"(%33) {dfg_id = 12 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %35 = "neura.data_mov"(%34) {dfg_id = 15 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %36 = "neura.icmp"(%35) <{cmpType = "eq"}> {dfg_id = 18 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 4200 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %37 = "neura.data_mov"(%36) {dfg_id = 21 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %38 = "neura.not"(%37) {dfg_id = 24 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %39 = "neura.data_mov"(%34) {dfg_id = 14 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 224 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 224 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %40 = "neura.data_mov"(%38) {dfg_id = 28 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %41 = neura.grant_predicate %39, %40 {dfg_id = 34 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 1 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:   neura.ctrl_mov %41 -> %4 {dfg_id = 38 : i32, mapping_locs = [{id = 23 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i32, i1> !neura.data<i32, i1>
-// MAPPING-NEXT:   %42 = "neura.data_mov"(%3) {dfg_id = 8 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}, {id = 289 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %43 = "neura.data_mov"(%38) {dfg_id = 27 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 31 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 290 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 6 : i32}, {id = 290 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 7 : i32}, {id = 290 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %44 = neura.grant_predicate %42, %43 {dfg_id = 33 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:   neura.ctrl_mov %44 -> %1 {dfg_id = 37 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}, {id = 289 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 11 : i32}]} : !neura.data<i32, i1> !neura.data<i32, i1>
-// MAPPING-NEXT:   "neura.return"() {dfg_id = 3 : i32, mapping_locs = [{id = 2 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 0 : i32}]} : () -> ()
+// MAPPING:      module attributes {{.*}}
+// MAPPING-NEXT:   func.func @kernel(%arg0: i32 {llvm.noundef}, %arg1: i32 {llvm.noundef}, %arg2: i32 {llvm.noundef}, %arg3: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.writeonly}, %arg4: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg5: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readnone}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
+// MAPPING-NEXT:     %0 = "neura.grant_once"() <{constant_value = 0 : i32}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i32, i1>
+// MAPPING-NEXT:     %2 = "neura.data_mov"(%0) {dfg_id = 4 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 0 : i32}, {id = 31 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 288 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 288 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 288 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}, {id = 288 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 288 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %3 = neura.phi_start %2, %1 {dfg_id = 7 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i32, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %4 = neura.reserve {dfg_id = 2 : i32} : !neura.data<i32, i1>
+// MAPPING-NEXT:     %5 = "neura.data_mov"(%0) {dfg_id = 5 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %6 = neura.phi_start %5, %4 {dfg_id = 8 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i32, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %7 = "neura.data_mov"(%6) {dfg_id = 13 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %8 = "neura.cast"(%7) <{cast_type = "trunc"}> {dfg_id = 15 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %9 = "neura.data_mov"(%8) {dfg_id = 19 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 2 : i32}, {id = 321 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %10 = "neura.div"(%9) {dfg_id = 22 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 70 : i16} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %11 = "neura.data_mov"(%8) {dfg_id = 18 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %12 = "neura.rem"(%11) {dfg_id = 21 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}], rhs_value = 70 : i16} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %13 = "neura.data_mov"(%10) {dfg_id = 25 : i32, mapping_locs = [{id = 321 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}, {id = 321 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %14 = neura.zext %13 {dfg_id = 28 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i16, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %15 = "neura.data_mov"(%12) {dfg_id = 24 : i32, mapping_locs = [{id = 320 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 320 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i16, i1>) -> !neura.data<i16, i1>
+// MAPPING-NEXT:     %16 = neura.zext %15 {dfg_id = 27 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i16, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %17 = "neura.data_mov"(%14) {dfg_id = 34 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %18 = "neura.data_mov"(%16) {dfg_id = 32 : i32, mapping_locs = [{id = 33 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 192 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %19 = "neura.gep"(%17, %18) <{operandSegmentSizes = array<i32: 0, 2>}> {dfg_id = 38 : i32, lhs_value = "%arg4", mapping_locs = [{id = 6 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %20 = "neura.data_mov"(%19) {dfg_id = 42 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %21 = "neura.load"(%20) {dfg_id = 43 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %22 = "neura.data_mov"(%21) {dfg_id = 45 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %23 = "neura.icmp"(%22) <{cmpType = "sge"}> {dfg_id = 46 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 2 : i32, y = 1 : i32}], rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %24 = "neura.data_mov"(%23) {dfg_id = 47 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %25 = "neura.data_mov"(%21) {dfg_id = 44 : i32, mapping_locs = [{id = 193 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}, {id = 193 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %26 = "neura.data_mov"(%3) {dfg_id = 11 : i32, mapping_locs = [{id = 28 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 33 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 194 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %27 = "neura.sel"(%24, %25, %26) {dfg_id = 48 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i1, i1>, !neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %28 = "neura.data_mov"(%14) {dfg_id = 33 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 448 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %29 = "neura.data_mov"(%16) {dfg_id = 31 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 449 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}, {id = 449 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %30 = "neura.gep"(%28, %29) <{operandSegmentSizes = array<i32: 0, 2>}> {dfg_id = 37 : i32, lhs_value = "%arg3", mapping_locs = [{id = 14 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 2 : i32, y = 3 : i32}]} : (!neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %31 = "neura.data_mov"(%27) {dfg_id = 49 : i32, mapping_locs = [{id = 192 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %32 = "neura.data_mov"(%30) {dfg_id = 41 : i32, mapping_locs = [{id = 45 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}, {id = 33 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}, {id = 193 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     "neura.store"(%31, %32) {dfg_id = 50 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 11 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT:     %33 = "neura.data_mov"(%6) {dfg_id = 12 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %34 = "neura.add"(%33) {dfg_id = 14 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i32} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %35 = "neura.data_mov"(%34) {dfg_id = 17 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %36 = "neura.icmp"(%35) <{cmpType = "eq"}> {dfg_id = 20 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 4200 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %37 = "neura.data_mov"(%36) {dfg_id = 23 : i32, mapping_locs = [{id = 352 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %38 = "neura.not"(%37) {dfg_id = 26 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %39 = "neura.data_mov"(%34) {dfg_id = 16 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 224 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 224 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %40 = "neura.data_mov"(%38) {dfg_id = 30 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %41 = neura.grant_predicate %39, %40 {dfg_id = 36 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 1 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %41 -> %4 {dfg_id = 40 : i32, mapping_locs = [{id = 23 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i32, i1> !neura.data<i32, i1>
+// MAPPING-NEXT:     %42 = "neura.data_mov"(%3) {dfg_id = 10 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}, {id = 289 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %43 = "neura.data_mov"(%38) {dfg_id = 29 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 31 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}, {id = 290 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 6 : i32}, {id = 290 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 7 : i32}, {id = 290 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 2 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %44 = neura.grant_predicate %42, %43 {dfg_id = 35 : i32, mapping_locs = [{id = 9 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 1 : i32, y = 2 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %44 -> %1 {dfg_id = 39 : i32, mapping_locs = [{id = 289 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 9 : i32}, {id = 289 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 10 : i32}, {id = 289 : i32, index_per_ii = 1 : i32, invalid_iterations = 2 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 11 : i32}]} : !neura.data<i32, i1> !neura.data<i32, i1>
+// MAPPING-NEXT:     %45 = "neura.grant_once"() <{constant_value = true}> {dfg_id = 3 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 0 : i32, y = 3 : i32}]} : () -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %46 = "neura.data_mov"(%45) {dfg_id = 6 : i32, mapping_locs = [{id = 384 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     "neura.return"(%46) {dfg_id = 9 : i32, mapping_locs = [{id = 12 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 0 : i32, y = 3 : i32}]} : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT:   }
 // MAPPING-NEXT: }
 
 
@@ -88,80 +92,31 @@
 // YAML-NEXT:   rows: 4
 // YAML-NEXT:   compiled_ii: 5
 // YAML-NEXT:   cores:
-// YAML-NEXT:     - column: 2
-// YAML-NEXT:       row: 0
-// YAML-NEXT:       core_id: "2"
+// YAML-NEXT:     - column: {{.*}}
+// YAML-NEXT:       row: {{.*}}
+// YAML-NEXT:       core_id: "{{.*}}"
 // YAML-NEXT:       entries:
 // YAML-NEXT:         - entry_id: "entry0"
-// YAML-NEXT:           instructions:
-// YAML-NEXT:             - index_per_ii: 4
-// YAML-NEXT:               operations:
-// YAML-NEXT:                 - opcode: "RETURN"
-// YAML-NEXT:                   id: 3
-// YAML-NEXT:                   time_step: 9
-// YAML-NEXT:                   invalid_iterations: 1
-// YAML-NEXT:     - column: 2
-// YAML-NEXT:       row: 1
-// YAML-NEXT:       core_id: "6"
-// YAML-NEXT:       entries:
-// YAML-NEXT:         - entry_id: "entry0"
-// YAML-NEXT:           instructions:
-// YAML-NEXT:             - index_per_ii: 0
-// YAML-NEXT:               operations:
-// YAML-NEXT:                 - opcode: "SEL"
-// YAML-NEXT:                   id: 46
-// YAML-NEXT:                   time_step: 10
-// YAML-NEXT:                   invalid_iterations: 2
-// YAML-NEXT:                   src_operands:
-// YAML-NEXT:                     - operand: "$0"
-// YAML-NEXT:                       color: "RED"
-// YAML-NEXT:                     - operand: "$1"
-// YAML-NEXT:                       color: "RED"
-// YAML-NEXT:                     - operand: "$2"
-// YAML-NEXT:                       color: "RED"
-// YAML-NEXT:                   dst_operands:
-// YAML-NEXT:                     - operand: "$0"
-// YAML-NEXT:                       color: "RED"
-// YAML-NEXT:                 - opcode: "DATA_MOV"
-// YAML-NEXT:                   id: 39
-// YAML-NEXT:                   time_step: 10
-// YAML-NEXT:                   invalid_iterations: 2
-// YAML-NEXT:                   src_operands:
-// YAML-NEXT:                     - operand: "NORTH"
-// YAML-NEXT:                       color: "RED"
-// YAML-NEXT:                   dst_operands:
-// YAML-NEXT:                     - operand: "$1"
-// YAML-NEXT:                       color: "RED"
 
 
-// ASM: # Compiled II: 5
-// ASM: PE(2,0):
-// ASM-NEXT: {
-// ASM-NEXT:   RETURN (t=9, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=4)
-// ASM:      PE(2,1):
-// ASM-NEXT: {
-// ASM-NEXT:   SEL, [$0], [$1], [$2] -> [$0] (t=10, inv_iters=2)
-// ASM-NEXT:   DATA_MOV, [NORTH, RED] -> [$1] (t=10, inv_iters=2)
-// ASM-NEXT: } (idx_per_ii=0)
-// ASM-NEXT: {
-// ASM-NEXT:   DATA_MOV, [NORTH, RED] -> [$0] (t=6, inv_iters=1)
-// ASM-NEXT:   STORE, [$0], [$1] (t=11, inv_iters=2)
-// ASM-NEXT: } (idx_per_ii=1)
-// ASM-NEXT: {
-// ASM-NEXT:   GEP, [NORTH, RED], [$0] -> [$0] (t=7, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=2)
-// ASM-NEXT: {
-// ASM-NEXT:   LOAD, [$0] -> [$0], [$1] (t=8, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=3)
-// ASM-NEXT: {
-// ASM-NEXT:   ICMP_SGE, [$0], [#0] -> [$0] (t=9, inv_iters=1)
-// ASM-NEXT:   DATA_MOV, [NORTH, RED] -> [$2] (t=9, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=4)
-// ASM:      PE(3,1):
-// ASM-NEXT: {
-// ASM-NEXT:   GRANT_PREDICATE, [$0], [NORTH, RED] -> [NORTH, RED] (t=5, inv_iters=1)
-// ASM-NEXT: } (idx_per_ii=0)
-// ASM-NEXT: {
-// ASM-NEXT:   DATA_MOV, [NORTH, RED] -> [$0] (t=3, inv_iters=0)
-// ASM-NEXT: } (idx_per_ii=3)
\ No newline at end of file
+// ASM:      # Compiled II: {{.*}}
+// ASM:      PE({{.*}}):
+// ASM:      {
+// ASM:        SEL, [$0], [$1], [$2] -> [$0] (t={{.*}}, inv_iters={{.*}})
+// ASM:        DATA_MOV, [{{.*}}] -> [$1] (t={{.*}}, inv_iters={{.*}})
+// ASM:      } (idx_per_ii={{.*}})
+// ASM:      {
+// ASM:        DATA_MOV, [{{.*}}] -> [$0] (t={{.*}}, inv_iters={{.*}})
+// ASM:        STORE, [$0], [$1] (t={{.*}}, inv_iters={{.*}})
+// ASM:      } (idx_per_ii={{.*}})
+// ASM:      PE({{.*}}):
+// ASM:      {
+// ASM:        GRANT_PREDICATE, [{{.*}}] (t={{.*}}, inv_iters={{.*}})
+// ASM:      } (idx_per_ii={{.*}})
+// ASM:      PE({{.*}}):
+// ASM:      {
+// ASM:        GRANT_ONCE, [{{.*}}] -> [$0] (t={{.*}}, inv_iters={{.*}})
+// ASM:      } (idx_per_ii={{.*}})
+// ASM:      {
+// ASM:        RETURN, [$0] (t={{.*}}, inv_iters={{.*}})
+// ASM:      } (idx_per_ii={{.*}})
diff --git a/test/neura/for_loop/kernel_test.mlir b/test/neura/for_loop/kernel_test.mlir
index 51a15b07..004c3bf7 100644
--- a/test/neura/for_loop/kernel_test.mlir
+++ b/test/neura/for_loop/kernel_test.mlir
@@ -35,8 +35,39 @@
 // RUN:   --insert-data-mov \
 // RUN:  | FileCheck %s --check-prefix=CHECK-MOV
 
-// CHECK:       func.func @_Z6kernelPfS_S_
-// CHECK:       accelerator = "neura"
+// CHECK:      module attributes {{.*}}
+// CHECK-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<1.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-NEXT:   llvm.mlir.global external local_unnamed_addr @coefficients(dense<[2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00]> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output: %f\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// CHECK-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// CHECK-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// CHECK-NEXT:     %1 = llvm.mlir.addressof @coefficients : !llvm.ptr
+// CHECK-NEXT:     %2 = llvm.mlir.addressof @input : !llvm.ptr
+// CHECK-NEXT:     %3 = llvm.mlir.addressof @output : !llvm.ptr
+// CHECK-NEXT:     %4 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-NEXT:     %5 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+// CHECK-NEXT:     %6 = "neura.constant"() <{value = 32 : i64}> : () -> i64
+// CHECK-NEXT:     %7 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT:     %8 = "neura.load"(%3) : (!llvm.ptr) -> f32
+// CHECK-NEXT:     neura.br %4, %8 : i64, f32 to ^bb1
+// CHECK-NEXT:   ^bb1(%9: i64, %10: f32):  // 2 preds: ^bb0, ^bb1
+// CHECK-NEXT:     %11 = "neura.gep"(%2, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %12 = "neura.load"(%11) : (!llvm.ptr) -> f32
+// CHECK-NEXT:     %13 = "neura.gep"(%1, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %14 = "neura.load"(%13) : (!llvm.ptr) -> f32
+// CHECK-NEXT:     %15 = "neura.fmul"(%12, %14) : (f32, f32) -> f32
+// CHECK-NEXT:     %16 = "neura.fadd"(%10, %15) : (f32, f32) -> f32
+// CHECK-NEXT:     %17 = "neura.add"(%9, %5) : (i64, i64) -> i64
+// CHECK-NEXT:     %18 = "neura.icmp"(%17, %6) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CHECK-NEXT:     neura.cond_br %18 : i1 then to ^bb2 else %17, %16 : i64, f32 to ^bb1
+// CHECK-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-NEXT:     "neura.store"(%16, %3) : (f32, !llvm.ptr) -> ()
+// CHECK-NEXT:     %19 = llvm.fpext %16 : f32 to f64
+// CHECK-NEXT:     %20 = llvm.call tail @printf(%0, %19) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, f64) -> i32
+// CHECK-NEXT:     "neura.return"(%7) : (i32) -> ()
+// CHECK-NEXT:   }
+// CHECK-NEXT:   func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
 // CHECK-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-NEXT:     %1 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-NEXT:     %2 = "neura.constant"() <{value = "%arg2"}> : () -> !neura.data<!llvm.ptr, i1>
@@ -59,10 +90,38 @@
 // CHECK-NEXT:   ^bb2:  // pred: ^bb1
 // CHECK-NEXT:     "neura.return"() : () -> ()
 // CHECK-NEXT:   }
+// CHECK-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
+// CHECK-NEXT: }
 
 // Verifies the neura ops are generated. And fusion happens.
-// CHECK-FUSED:        func.func @_Z6kernelPfS_S_
-// CHECK-FUSED-SAME:   accelerator = "neura"
+// CHECK-FUSED:      module attributes {{.*}}
+// CHECK-FUSED-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<1.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-FUSED-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-FUSED-NEXT:   llvm.mlir.global external local_unnamed_addr @coefficients(dense<[2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00]> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-FUSED-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output: %f\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// CHECK-FUSED-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// CHECK-FUSED-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// CHECK-FUSED-NEXT:     %1 = llvm.mlir.addressof @coefficients : !llvm.ptr
+// CHECK-FUSED-NEXT:     %2 = llvm.mlir.addressof @input : !llvm.ptr
+// CHECK-FUSED-NEXT:     %3 = llvm.mlir.addressof @output : !llvm.ptr
+// CHECK-FUSED-NEXT:     %4 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-FUSED-NEXT:     %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-FUSED-NEXT:     %6 = "neura.load"(%3) : (!llvm.ptr) -> f32
+// CHECK-FUSED-NEXT:     neura.br %4, %6 : i64, f32 to ^bb1
+// CHECK-FUSED-NEXT:   ^bb1(%7: i64, %8: f32):  // 2 preds: ^bb0, ^bb1
+// CHECK-FUSED-NEXT:     %9 = neura.load_indexed %2[%7 : i64] !llvm.ptr : f32
+// CHECK-FUSED-NEXT:     %10 = neura.load_indexed %1[%7 : i64] !llvm.ptr : f32
+// CHECK-FUSED-NEXT:     %11 = "neura.fmul_fadd"(%9, %10, %8) : (f32, f32, f32) -> f32
+// CHECK-FUSED-NEXT:     %12 = "neura.add"(%7) {rhs_value = 1 : i64} : (i64) -> i64
+// CHECK-FUSED-NEXT:     %13 = "neura.icmp"(%12) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
+// CHECK-FUSED-NEXT:     neura.cond_br %13 : i1 then to ^bb2 else %12, %11 : i64, f32 to ^bb1
+// CHECK-FUSED-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-FUSED-NEXT:     "neura.store"(%11, %3) : (f32, !llvm.ptr) -> ()
+// CHECK-FUSED-NEXT:     %14 = llvm.fpext %11 : f32 to f64
+// CHECK-FUSED-NEXT:     %15 = llvm.call tail @printf(%0, %14) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, f64) -> i32
+// CHECK-FUSED-NEXT:     "neura.return"(%5) : (i32) -> ()
+// CHECK-FUSED-NEXT:   }
+// CHECK-FUSED-NEXT:   func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
 // CHECK-FUSED-NEXT:     %0 = "neura.grant_once"() <{constant_value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-FUSED-NEXT:     %1 = "neura.grant_once"() <{constant_value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-FUSED-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
@@ -107,11 +166,58 @@
 // CHECK-FUSED-NEXT:     neura.ctrl_mov %34 -> %11 : !neura.data<i64, i1> !neura.data<i64, i1>
 // CHECK-FUSED-NEXT:     %35 = neura.grant_predicate %10, %28 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CHECK-FUSED-NEXT:     neura.ctrl_mov %35 -> %9 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CHECK-FUSED-NEXT:     "neura.return"() : () -> ()
+// CHECK-FUSED-NEXT:     %36 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// CHECK-FUSED-NEXT:     "neura.return"(%36) : (!neura.data<i1, i1>) -> ()
 // CHECK-FUSED-NEXT:   }
+// CHECK-FUSED-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
+// CHECK-FUSED-NEXT: }
 
-// CHECK-MOV:        func.func @_Z6kernelPfS_S_
-// CHECK-MOV-SAME:   accelerator = "neura"
+// CHECK-MOV:      module attributes {{.*}}
+// CHECK-MOV-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<1.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-MOV-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0.000000e+00> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-MOV-NEXT:   llvm.mlir.global external local_unnamed_addr @coefficients(dense<[2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00, 2.500000e-01, 1.500000e+00, 3.750000e+00, -2.250000e+00, 5.000000e-01, 7.500000e-01, -3.000000e+00, 1.250000e+00]> : tensor<32xf32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x f32>
+// CHECK-MOV-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output: %f\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// CHECK-MOV-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// CHECK-MOV-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// CHECK-MOV-NEXT:     %1 = llvm.mlir.addressof @coefficients : !llvm.ptr
+// CHECK-MOV-NEXT:     %2 = llvm.mlir.addressof @input : !llvm.ptr
+// CHECK-MOV-NEXT:     %3 = llvm.mlir.addressof @output : !llvm.ptr
+// CHECK-MOV-NEXT:     %4 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-MOV-NEXT:     %5 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-MOV-NEXT:     %6 = "neura.data_mov"(%3) : (!llvm.ptr) -> !llvm.ptr
+// CHECK-MOV-NEXT:     %7 = "neura.load"(%6) : (!llvm.ptr) -> f32
+// CHECK-MOV-NEXT:     %8 = "neura.data_mov"(%4) : (i64) -> i64
+// CHECK-MOV-NEXT:     %9 = "neura.data_mov"(%7) : (f32) -> f32
+// CHECK-MOV-NEXT:     neura.br %8, %9 : i64, f32 to ^bb1
+// CHECK-MOV-NEXT:   ^bb1(%10: i64, %11: f32):  // 2 preds: ^bb0, ^bb1
+// CHECK-MOV-NEXT:     %12 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
+// CHECK-MOV-NEXT:     %13 = "neura.data_mov"(%10) : (i64) -> i64
+// CHECK-MOV-NEXT:     %14 = neura.load_indexed %12[%13 : i64] !llvm.ptr : f32
+// CHECK-MOV-NEXT:     %15 = "neura.data_mov"(%1) : (!llvm.ptr) -> !llvm.ptr
+// CHECK-MOV-NEXT:     %16 = "neura.data_mov"(%10) : (i64) -> i64
+// CHECK-MOV-NEXT:     %17 = neura.load_indexed %15[%16 : i64] !llvm.ptr : f32
+// CHECK-MOV-NEXT:     %18 = "neura.data_mov"(%14) : (f32) -> f32
+// CHECK-MOV-NEXT:     %19 = "neura.data_mov"(%17) : (f32) -> f32
+// CHECK-MOV-NEXT:     %20 = "neura.data_mov"(%11) : (f32) -> f32
+// CHECK-MOV-NEXT:     %21 = "neura.fmul_fadd"(%18, %19, %20) : (f32, f32, f32) -> f32
+// CHECK-MOV-NEXT:     %22 = "neura.data_mov"(%10) : (i64) -> i64
+// CHECK-MOV-NEXT:     %23 = "neura.add"(%22) {rhs_value = 1 : i64} : (i64) -> i64
+// CHECK-MOV-NEXT:     %24 = "neura.data_mov"(%23) : (i64) -> i64
+// CHECK-MOV-NEXT:     %25 = "neura.icmp"(%24) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
+// CHECK-MOV-NEXT:     %26 = "neura.data_mov"(%25) : (i1) -> i1
+// CHECK-MOV-NEXT:     %27 = "neura.data_mov"(%23) : (i64) -> i64
+// CHECK-MOV-NEXT:     %28 = "neura.data_mov"(%21) : (f32) -> f32
+// CHECK-MOV-NEXT:     neura.cond_br %26 : i1 then to ^bb2 else %27, %28 : i64, f32 to ^bb1
+// CHECK-MOV-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-MOV-NEXT:     %29 = "neura.data_mov"(%21) : (f32) -> f32
+// CHECK-MOV-NEXT:     %30 = "neura.data_mov"(%3) : (!llvm.ptr) -> !llvm.ptr
+// CHECK-MOV-NEXT:     "neura.store"(%29, %30) : (f32, !llvm.ptr) -> ()
+// CHECK-MOV-NEXT:     %31 = llvm.fpext %21 : f32 to f64
+// CHECK-MOV-NEXT:     %32 = llvm.call tail @printf(%0, %31) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, f64) -> i32
+// CHECK-MOV-NEXT:     %33 = "neura.data_mov"(%5) : (i32) -> i32
+// CHECK-MOV-NEXT:     "neura.return"(%33) : (i32) -> ()
+// CHECK-MOV-NEXT:   }
+// CHECK-MOV-NEXT:   func.func @_Z6kernelPfS_S_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}, %arg2: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
 // CHECK-MOV-NEXT:     %0 = "neura.grant_once"() <{constant_value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-MOV-NEXT:     %1 = "neura.grant_once"() <{constant_value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
 // CHECK-MOV-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
@@ -193,5 +299,9 @@
 // CHECK-MOV-NEXT:     %71 = "neura.data_mov"(%51) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
 // CHECK-MOV-NEXT:     %72 = neura.grant_predicate %70, %71 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CHECK-MOV-NEXT:     neura.ctrl_mov %72 -> %11 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CHECK-MOV-NEXT:     "neura.return"() : () -> ()
-// CHECK-MOV-NEXT:   }
\ No newline at end of file
+// CHECK-MOV-NEXT:     %73 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// CHECK-MOV-NEXT:     %74 = "neura.data_mov"(%73) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CHECK-MOV-NEXT:     "neura.return"(%74) : (!neura.data<i1, i1>) -> ()
+// CHECK-MOV-NEXT:   }
+// CHECK-MOV-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
+// CHECK-MOV-NEXT: }
\ No newline at end of file
diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir
index 951f89c4..2c00cb6a 100644
--- a/test/neura/for_loop/relu_test.mlir
+++ b/test/neura/for_loop/relu_test.mlir
@@ -31,38 +31,123 @@
 // RUN:   --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized" \
 // RUN:  | FileCheck %s --check-prefix=MAPPING
 
-// CHECK:      func.func @_Z6kernelPiS_
-// CHECK-SAME: accelerator = "neura"
-// CHECK-NEXT:   %0 = "neura.constant"() <{value = "%arg0"}> : () -> !llvm.ptr
-// CHECK-NEXT:   %1 = "neura.constant"() <{value = "%arg1"}> : () -> !llvm.ptr
-// CHECK-NEXT:   %2 = "neura.constant"() <{value = 0 : i64}> : () -> i64
-// CHECK-NEXT:   %3 = "neura.constant"() <{value = 0 : i32}> : () -> i32
-// CHECK-NEXT:   %4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
-// CHECK-NEXT:   %5 = "neura.constant"() <{value = 32 : i64}> : () -> i64
-// CHECK-NEXT:   neura.br %2, %0, %3, %1, %4, %5 : i64, !llvm.ptr, i32, !llvm.ptr, i64, i64 to ^bb2
-// CHECK-NEXT: ^bb1:  // pred: ^bb4
-// CHECK-NEXT:   "neura.return"() : () -> ()
-// CHECK-NEXT: ^bb2(%6: i64, %7: !llvm.ptr, %8: i32, %9: !llvm.ptr, %10: i64, %11: i64):  // 2 preds: ^bb0, ^bb4
-// CHECK-NEXT:   %12 = "neura.gep"(%7, %6) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
-// CHECK-NEXT:   %13 = "neura.load"(%12) : (!llvm.ptr) -> i32
-// CHECK-NEXT:   %14 = "neura.icmp"(%13, %8) <{cmpType = "sgt"}> : (i32, i32) -> i1
-// CHECK-NEXT:   neura.cond_br %14 : i1 then %9, %6, %13, %10, %11, %7, %8 : !llvm.ptr, i64, i32, i64, i64, !llvm.ptr, i32 to ^bb3 else %10, %11, %7, %8, %9 : i64, i64, !llvm.ptr, i32, !llvm.ptr to ^bb4
-// CHECK-NEXT: ^bb3(%15: !llvm.ptr, %16: i64, %17: i32, %18: i64, %19: i64, %20: !llvm.ptr, %21: i32):  // pred: ^bb2
-// CHECK-NEXT:   %22 = "neura.gep"(%15, %16) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
-// CHECK-NEXT:   %23 = "neura.load"(%22) : (!llvm.ptr) -> i32
-// CHECK-NEXT:   %24 = "neura.add"(%23, %17) : (i32, i32) -> i32
-// CHECK-NEXT:   "neura.store"(%24, %22) : (i32, !llvm.ptr) -> ()
-// CHECK-NEXT:   neura.br %18, %19, %20, %21, %15 : i64, i64, !llvm.ptr, i32, !llvm.ptr to ^bb4
-// CHECK-NEXT: ^bb4(%25: i64, %26: i64, %27: !llvm.ptr, %28: i32, %29: !llvm.ptr):  // 2 preds: ^bb2, ^bb3
-// CHECK-NEXT:   %30 = "neura.add"(%6, %25) : (i64, i64) -> i64
-// CHECK-NEXT:   %31 = "neura.icmp"(%30, %26) <{cmpType = "eq"}> : (i64, i64) -> i1
-// CHECK-NEXT:   neura.cond_br %31 : i1 then to ^bb1 else %30, %27, %28, %29, %25, %26 : i64, !llvm.ptr, i32, !llvm.ptr, i64, i64 to ^bb2
+// CHECK:      module attributes {{.*}}
+// CHECK-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<[1, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31]> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// CHECK-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// CHECK-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output[%d] = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// CHECK-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// CHECK-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// CHECK-NEXT:     %1 = llvm.mlir.addressof @input : !llvm.ptr
+// CHECK-NEXT:     %2 = llvm.mlir.addressof @output : !llvm.ptr
+// CHECK-NEXT:     %3 = "neura.constant"() <{value = 0 : i8}> : () -> i8
+// CHECK-NEXT:     %4 = "neura.constant"() <{value = 128 : i64}> : () -> i64
+// CHECK-NEXT:     %5 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-NEXT:     %6 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT:     %7 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+// CHECK-NEXT:     %8 = "neura.constant"() <{value = 32 : i64}> : () -> i64
+// CHECK-NEXT:     "neura.memset"(%2, %3, %4) <{is_volatile = false}> : (!llvm.ptr, i8, i64) -> ()
+// CHECK-NEXT:     neura.br %5 : i64 to ^bb1
+// CHECK-NEXT:   ^bb1(%9: i64):  // 2 preds: ^bb0, ^bb3
+// CHECK-NEXT:     %10 = "neura.gep"(%1, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %11 = "neura.load"(%10) : (!llvm.ptr) -> i32
+// CHECK-NEXT:     %12 = "neura.icmp"(%11, %6) <{cmpType = "sgt"}> : (i32, i32) -> i1
+// CHECK-NEXT:     neura.cond_br %12 : i1 then to ^bb2 else to ^bb3
+// CHECK-NEXT:   ^bb2:  // pred: ^bb1
+// CHECK-NEXT:     %13 = "neura.gep"(%2, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %14 = "neura.load"(%13) : (!llvm.ptr) -> i32
+// CHECK-NEXT:     %15 = "neura.add"(%14, %11) : (i32, i32) -> i32
+// CHECK-NEXT:     "neura.store"(%15, %13) : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:     neura.br to ^bb3
+// CHECK-NEXT:   ^bb3:  // 2 preds: ^bb1, ^bb2
+// CHECK-NEXT:     %16 = "neura.add"(%9, %7) : (i64, i64) -> i64
+// CHECK-NEXT:     %17 = "neura.icmp"(%16, %8) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CHECK-NEXT:     neura.cond_br %17 : i1 then %5 : i64 to ^bb5 else %16 : i64 to ^bb1
+// CHECK-NEXT:   ^bb4:  // pred: ^bb5
+// CHECK-NEXT:     "neura.return"(%6) : (i32) -> ()
+// CHECK-NEXT:   ^bb5(%18: i64):  // 2 preds: ^bb3, ^bb5
+// CHECK-NEXT:     %19 = "neura.constant"() <{value = 0 : i32}> : () -> index
+// CHECK-NEXT:     %20 = "neura.gep"(%2, %19, %18) <{operandSegmentSizes = array<i32: 1, 2>}> : (!llvm.ptr, index, i64) -> !llvm.ptr
+// CHECK-NEXT:     %21 = "neura.load"(%20) : (!llvm.ptr) -> i32
+// CHECK-NEXT:     %22 = "neura.cast"(%18) <{cast_type = "trunc"}> : (i64) -> i32
+// CHECK-NEXT:     %23 = llvm.call tail @printf(%0, %22, %21) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, i32, i32) -> i32
+// CHECK-NEXT:     %24 = "neura.add"(%18, %7) : (i64, i64) -> i64
+// CHECK-NEXT:     %25 = "neura.icmp"(%24, %8) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CHECK-NEXT:     neura.cond_br %25 : i1 then to ^bb4 else %24 : i64 to ^bb5
+// CHECK-NEXT:   }
+// CHECK-NEXT:   func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
+// CHECK-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !llvm.ptr
+// CHECK-NEXT:     %1 = "neura.constant"() <{value = "%arg1"}> : () -> !llvm.ptr
+// CHECK-NEXT:     %2 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-NEXT:     %3 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CHECK-NEXT:     %4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+// CHECK-NEXT:     %5 = "neura.constant"() <{value = 32 : i64}> : () -> i64
+// CHECK-NEXT:     neura.br %2, %0, %3, %1, %4, %5 : i64, !llvm.ptr, i32, !llvm.ptr, i64, i64 to ^bb2
+// CHECK-NEXT:   ^bb1:  // pred: ^bb4
+// CHECK-NEXT:     "neura.return"() : () -> ()
+// CHECK-NEXT:   ^bb2(%6: i64, %7: !llvm.ptr, %8: i32, %9: !llvm.ptr, %10: i64, %11: i64):  // 2 preds: ^bb0, ^bb4
+// CHECK-NEXT:     %12 = "neura.gep"(%7, %6) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %13 = "neura.load"(%12) : (!llvm.ptr) -> i32
+// CHECK-NEXT:     %14 = "neura.icmp"(%13, %8) <{cmpType = "sgt"}> : (i32, i32) -> i1
+// CHECK-NEXT:     neura.cond_br %14 : i1 then %9, %6, %13, %10, %11, %7, %8 : !llvm.ptr, i64, i32, i64, i64, !llvm.ptr, i32 to ^bb3 else %10, %11, %7, %8, %9 : i64, i64, !llvm.ptr, i32, !llvm.ptr to ^bb4
+// CHECK-NEXT:   ^bb3(%15: !llvm.ptr, %16: i64, %17: i32, %18: i64, %19: i64, %20: !llvm.ptr, %21: i32):  // pred: ^bb2
+// CHECK-NEXT:     %22 = "neura.gep"(%15, %16) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CHECK-NEXT:     %23 = "neura.load"(%22) : (!llvm.ptr) -> i32
+// CHECK-NEXT:     %24 = "neura.add"(%23, %17) : (i32, i32) -> i32
+// CHECK-NEXT:     "neura.store"(%24, %22) : (i32, !llvm.ptr) -> ()
+// CHECK-NEXT:     neura.br %18, %19, %20, %21, %15 : i64, i64, !llvm.ptr, i32, !llvm.ptr to ^bb4
+// CHECK-NEXT:   ^bb4(%25: i64, %26: i64, %27: !llvm.ptr, %28: i32, %29: !llvm.ptr):  // 2 preds: ^bb2, ^bb3
+// CHECK-NEXT:     %30 = "neura.add"(%6, %25) : (i64, i64) -> i64
+// CHECK-NEXT:     %31 = "neura.icmp"(%30, %26) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CHECK-NEXT:     neura.cond_br %31 : i1 then to ^bb1 else %30, %27, %28, %29, %25, %26 : i64, !llvm.ptr, i32, !llvm.ptr, i64, i64 to ^bb2
+// CHECK-NEXT:   }
+// CHECK-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
 // CHECK-NEXT: }
 
 
-// CTRL2DATA:      func.func @_Z6kernelPiS_
-// CTRL2DATA-SAME: accelerator = "neura"
-// CTRL2DATA-SAME: dataflow_mode = "predicate"
+// CTRL2DATA:      module attributes {{.*}}
+// CTRL2DATA-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<[1, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31]> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// CTRL2DATA-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// CTRL2DATA-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output[%d] = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// CTRL2DATA-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// CTRL2DATA-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// CTRL2DATA-NEXT:     %1 = llvm.mlir.addressof @input : !llvm.ptr
+// CTRL2DATA-NEXT:     %2 = llvm.mlir.addressof @output : !llvm.ptr
+// CTRL2DATA-NEXT:     %3 = "neura.constant"() <{value = 0 : i8}> : () -> i8
+// CTRL2DATA-NEXT:     %4 = "neura.constant"() <{value = 128 : i64}> : () -> i64
+// CTRL2DATA-NEXT:     %5 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CTRL2DATA-NEXT:     %6 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// CTRL2DATA-NEXT:     %7 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+// CTRL2DATA-NEXT:     %8 = "neura.constant"() <{value = 32 : i64}> : () -> i64
+// CTRL2DATA-NEXT:     "neura.memset"(%2, %3, %4) <{is_volatile = false}> : (!llvm.ptr, i8, i64) -> ()
+// CTRL2DATA-NEXT:     neura.br %5 : i64 to ^bb1
+// CTRL2DATA-NEXT:   ^bb1(%9: i64):  // 2 preds: ^bb0, ^bb3
+// CTRL2DATA-NEXT:     %10 = "neura.gep"(%1, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CTRL2DATA-NEXT:     %11 = "neura.load"(%10) : (!llvm.ptr) -> i32
+// CTRL2DATA-NEXT:     %12 = "neura.icmp"(%11, %6) <{cmpType = "sgt"}> : (i32, i32) -> i1
+// CTRL2DATA-NEXT:     neura.cond_br %12 : i1 then to ^bb2 else to ^bb3
+// CTRL2DATA-NEXT:   ^bb2:  // pred: ^bb1
+// CTRL2DATA-NEXT:     %13 = "neura.gep"(%2, %9) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// CTRL2DATA-NEXT:     %14 = "neura.load"(%13) : (!llvm.ptr) -> i32
+// CTRL2DATA-NEXT:     %15 = "neura.add"(%14, %11) : (i32, i32) -> i32
+// CTRL2DATA-NEXT:     "neura.store"(%15, %13) : (i32, !llvm.ptr) -> ()
+// CTRL2DATA-NEXT:     neura.br to ^bb3
+// CTRL2DATA-NEXT:   ^bb3:  // 2 preds: ^bb1, ^bb2
+// CTRL2DATA-NEXT:     %16 = "neura.add"(%9, %7) : (i64, i64) -> i64
+// CTRL2DATA-NEXT:     %17 = "neura.icmp"(%16, %8) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CTRL2DATA-NEXT:     neura.cond_br %17 : i1 then %5 : i64 to ^bb5 else %16 : i64 to ^bb1
+// CTRL2DATA-NEXT:   ^bb4:  // pred: ^bb5
+// CTRL2DATA-NEXT:     "neura.return"(%6) : (i32) -> ()
+// CTRL2DATA-NEXT:   ^bb5(%18: i64):  // 2 preds: ^bb3, ^bb5
+// CTRL2DATA-NEXT:     %19 = "neura.constant"() <{value = 0 : i32}> : () -> index
+// CTRL2DATA-NEXT:     %20 = "neura.gep"(%2, %19, %18) <{operandSegmentSizes = array<i32: 1, 2>}> : (!llvm.ptr, index, i64) -> !llvm.ptr
+// CTRL2DATA-NEXT:     %21 = "neura.load"(%20) : (!llvm.ptr) -> i32
+// CTRL2DATA-NEXT:     %22 = "neura.cast"(%18) <{cast_type = "trunc"}> : (i64) -> i32
+// CTRL2DATA-NEXT:     %23 = llvm.call tail @printf(%0, %22, %21) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, i32, i32) -> i32
+// CTRL2DATA-NEXT:     %24 = "neura.add"(%18, %7) : (i64, i64) -> i64
+// CTRL2DATA-NEXT:     %25 = "neura.icmp"(%24, %8) <{cmpType = "eq"}> : (i64, i64) -> i1
+// CTRL2DATA-NEXT:     neura.cond_br %25 : i1 then to ^bb4 else %24 : i64 to ^bb5
+// CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT:   func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
 // CTRL2DATA-NEXT:     %0 = "neura.constant"() <{value = "%arg0"}> : () -> !neura.data<!llvm.ptr, i1>
 // CTRL2DATA-NEXT:     %1 = "neura.grant_once"(%0) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
 // CTRL2DATA-NEXT:     %2 = "neura.constant"() <{value = "%arg1"}> : () -> !neura.data<!llvm.ptr, i1>
@@ -127,8 +212,386 @@
 // CTRL2DATA-NEXT:     neura.ctrl_mov %55 -> %14 : !neura.data<i64, i1> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     %56 = neura.grant_predicate %46, %50 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
 // CTRL2DATA-NEXT:     neura.ctrl_mov %56 -> %12 : !neura.data<i64, i1> !neura.data<i64, i1>
-// CTRL2DATA-NEXT:     "neura.return"() : () -> ()
+// CTRL2DATA-NEXT:     %57 = "neura.constant"() <{value = true}> : () -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     %58 = "neura.grant_once"(%57) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// CTRL2DATA-NEXT:     "neura.return"(%58) : (!neura.data<i1, i1>) -> ()
 // CTRL2DATA-NEXT:   }
+// CTRL2DATA-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
+// CTRL2DATA-NEXT: }
 
 
-// MAPPING: func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 1 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
\ No newline at end of file
+// MAPPING:      [DEBUG] Recurrence cycle (length 3):
+// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
+// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT: [DEBUG] Recurrence cycle (length 5):
+// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
+// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Longest recurrence cycle (length 5):
+// MAPPING-NEXT: %1 = neura.reserve : !neura.data<i64, i1>
+// MAPPING-NEXT: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %1 = neura.reserve : !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 0: 3 ops
+// MAPPING-NEXT:   %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
+// MAPPING-NEXT:   %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 1: 3 ops
+// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 2: 5 ops
+// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:   %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 3: 4 ops
+// MAPPING-NEXT:   %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 4: 5 ops
+// MAPPING-NEXT:   %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 5: 4 ops
+// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:   %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 6: 4 ops
+// MAPPING-NEXT:   %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:   %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 7: 4 ops
+// MAPPING-NEXT:   %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 8: 5 ops
+// MAPPING-NEXT:   %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:   %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:   %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:   %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 9: 2 ops
+// MAPPING-NEXT:   "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT:   "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1> (ALAP level: 0)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %1 = neura.reserve : !neura.data<i64, i1> (ALAP level: 0)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 0)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1> (ALAP level: 1)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 1)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 1)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 2)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 2)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1> (ALAP level: 3)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (ALAP level: 3)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 3)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 3)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 4)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (ALAP level: 5)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (ALAP level: 5)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1> (ALAP level: 5)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 5)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 6)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 6)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 6)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 6)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1> (ALAP level: 7)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1> (ALAP level: 8)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 8)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 8)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 8)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 8)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: "neura.return"(%35) : (!neura.data<i1, i1>) -> () (ALAP level: 9)
+// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> () (ALAP level: 9)
+// MAPPING-NEXT: ---------------------------------------------------------
+// MAPPING-NEXT: [HeuristicMapping] Starting mapping with 39 operations.
+// MAPPING-NEXT: Configuration: MAX Backtrack Depth = 3, MAX Candidate Locations = 5
+// MAPPING-NEXT: [HeuristicMapping] Filtered 22 non-materialized operations, 17 operations require physical mapping.
+// MAPPING-NEXT: [HeuristicMapping] Materialized operations list:
+// MAPPING-NEXT: 0 %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1> (level: 0)
+// MAPPING-NEXT: 1 %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1> (level: 1)
+// MAPPING-NEXT: 2 %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (level: 2)
+// MAPPING-NEXT: 3 %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (level: 2)
+// MAPPING-NEXT: 4 %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1> (level: 3)
+// MAPPING-NEXT: 5 %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (level: 3)
+// MAPPING-NEXT: 6 %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (level: 4)
+// MAPPING-NEXT: 7 %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1> (level: 4)
+// MAPPING-NEXT: 8 %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (level: 5)
+// MAPPING-NEXT: 9 %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (level: 5)
+// MAPPING-NEXT: 10 %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (level: 6)
+// MAPPING-NEXT: 11 %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1> (level: 7)
+// MAPPING-NEXT: 12 %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (level: 7)
+// MAPPING-NEXT: 13 %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1> (level: 8)
+// MAPPING-NEXT: 14 %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1> (level: 8)
+// MAPPING-NEXT: 15 "neura.return"(%35) : (!neura.data<i1, i1>) -> () (level: 9)
+// MAPPING-NEXT: 16 "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> () (level: 9)
+// MAPPING-NEXT: [HeuristicMapping] Found 80 candidate locations for operation: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=0
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 54 candidate locations for operation: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=1
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=0 to Tile#11 @t=1
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 53 candidate locations for operation: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=2
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#11 @t=2
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 52 candidate locations for operation: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=2
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#10 @t=2
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 51 candidate locations for operation: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=3
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=2 to Tile#11 @t=3
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 58 candidate locations for operation: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=3
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=2 to Tile#10 @t=3
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #640
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 49 candidate locations for operation: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=4
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=3 to Tile#10 @t=4
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 56 candidate locations for operation: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=4
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=3 to Tile#11 @t=4
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 1 candidate locations for operation: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/1 at tile#10 @t=5
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=2 to Tile#10 @t=5
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=4 to Tile#10 @t=5
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #641
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=5 to Tile#11 @t=6
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 46 candidate locations for operation: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=5
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#7 @t=5
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=4 to Tile#7 @t=5
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 45 candidate locations for operation: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=6
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=5 to Tile#7 @t=6
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #448
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 64 candidate locations for operation: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#15 @t=7
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=3 to Tile#15 @t=7
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=4 to Tile#15 @t=7
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 45 candidate locations for operation: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=7
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=6 to Tile#7 @t=7
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #448
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 67 candidate locations for operation: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=8
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 28 candidate locations for operation: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=9
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=7 to Tile#7 @t=9
+// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #449
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#15 @t=7 to Tile#7 @t=9
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT: [HeuristicMapping] Found 40 candidate locations for operation: "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#3 @t=9
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=8 to Tile#3 @t=9
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT: [HeuristicMapping] Found 40 candidate locations for operation: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#6 @t=10
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=9 to Tile#6 @t=10
+// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=6 to Tile#6 @t=10
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT: [HeuristicMapping] Successfully mapped all 17 operations.
+// MAPPING-NEXT: module attributes {{.*}}
+// MAPPING-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<[1, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31]> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// MAPPING-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
+// MAPPING-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output[%d] = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
+// MAPPING-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
+// MAPPING-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
+// MAPPING-NEXT:     %1 = llvm.mlir.addressof @input : !llvm.ptr
+// MAPPING-NEXT:     %2 = llvm.mlir.addressof @output : !llvm.ptr
+// MAPPING-NEXT:     %3 = "neura.constant"() <{value = 0 : i8}> : () -> i8
+// MAPPING-NEXT:     %4 = "neura.constant"() <{value = 128 : i64}> : () -> i64
+// MAPPING-NEXT:     %5 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// MAPPING-NEXT:     %6 = "neura.constant"() <{value = 0 : i32}> : () -> i32
+// MAPPING-NEXT:     %7 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %8 = "neura.data_mov"(%3) : (i8) -> i8
+// MAPPING-NEXT:     %9 = "neura.data_mov"(%4) : (i64) -> i64
+// MAPPING-NEXT:     "neura.memset"(%7, %8, %9) <{is_volatile = false}> : (!llvm.ptr, i8, i64) -> ()
+// MAPPING-NEXT:     %10 = "neura.data_mov"(%5) : (i64) -> i64
+// MAPPING-NEXT:     neura.br %10 : i64 to ^bb1
+// MAPPING-NEXT:   ^bb1(%11: i64):  // 2 preds: ^bb0, ^bb3
+// MAPPING-NEXT:     %12 = "neura.data_mov"(%1) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %13 = "neura.data_mov"(%11) : (i64) -> i64
+// MAPPING-NEXT:     %14 = "neura.gep"(%12, %13) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// MAPPING-NEXT:     %15 = "neura.data_mov"(%14) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %16 = "neura.load"(%15) : (!llvm.ptr) -> i32
+// MAPPING-NEXT:     %17 = "neura.data_mov"(%16) : (i32) -> i32
+// MAPPING-NEXT:     %18 = "neura.icmp"(%17) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (i32) -> i1
+// MAPPING-NEXT:     %19 = "neura.data_mov"(%18) : (i1) -> i1
+// MAPPING-NEXT:     neura.cond_br %19 : i1 then to ^bb2 else to ^bb3
+// MAPPING-NEXT:   ^bb2:  // pred: ^bb1
+// MAPPING-NEXT:     %20 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %21 = "neura.data_mov"(%11) : (i64) -> i64
+// MAPPING-NEXT:     %22 = "neura.gep"(%20, %21) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
+// MAPPING-NEXT:     %23 = "neura.data_mov"(%22) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %24 = "neura.load"(%23) : (!llvm.ptr) -> i32
+// MAPPING-NEXT:     %25 = "neura.data_mov"(%24) : (i32) -> i32
+// MAPPING-NEXT:     %26 = "neura.data_mov"(%16) : (i32) -> i32
+// MAPPING-NEXT:     %27 = "neura.add"(%25, %26) : (i32, i32) -> i32
+// MAPPING-NEXT:     %28 = "neura.data_mov"(%27) : (i32) -> i32
+// MAPPING-NEXT:     %29 = "neura.data_mov"(%22) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     "neura.store"(%28, %29) : (i32, !llvm.ptr) -> ()
+// MAPPING-NEXT:     neura.br to ^bb3
+// MAPPING-NEXT:   ^bb3:  // 2 preds: ^bb1, ^bb2
+// MAPPING-NEXT:     %30 = "neura.data_mov"(%11) : (i64) -> i64
+// MAPPING-NEXT:     %31 = "neura.add"(%30) {rhs_value = 1 : i64} : (i64) -> i64
+// MAPPING-NEXT:     %32 = "neura.data_mov"(%31) : (i64) -> i64
+// MAPPING-NEXT:     %33 = "neura.icmp"(%32) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
+// MAPPING-NEXT:     %34 = "neura.data_mov"(%33) : (i1) -> i1
+// MAPPING-NEXT:     %35 = "neura.data_mov"(%5) : (i64) -> i64
+// MAPPING-NEXT:     %36 = "neura.data_mov"(%31) : (i64) -> i64
+// MAPPING-NEXT:     neura.cond_br %34 : i1 then %35 : i64 to ^bb5 else %36 : i64 to ^bb1
+// MAPPING-NEXT:   ^bb4:  // pred: ^bb5
+// MAPPING-NEXT:     %37 = "neura.data_mov"(%6) : (i32) -> i32
+// MAPPING-NEXT:     "neura.return"(%37) : (i32) -> ()
+// MAPPING-NEXT:   ^bb5(%38: i64):  // 2 preds: ^bb3, ^bb5
+// MAPPING-NEXT:     %39 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %40 = "neura.data_mov"(%38) : (i64) -> i64
+// MAPPING-NEXT:     %41 = "neura.gep"(%39, %40) <{operandSegmentSizes = array<i32: 1, 1>}> {operand_1_value = 0 : i32} : (!llvm.ptr, i64) -> !llvm.ptr
+// MAPPING-NEXT:     %42 = "neura.data_mov"(%41) : (!llvm.ptr) -> !llvm.ptr
+// MAPPING-NEXT:     %43 = "neura.load"(%42) : (!llvm.ptr) -> i32
+// MAPPING-NEXT:     %44 = "neura.data_mov"(%38) : (i64) -> i64
+// MAPPING-NEXT:     %45 = "neura.cast"(%44) <{cast_type = "trunc"}> : (i64) -> i32
+// MAPPING-NEXT:     %46 = llvm.call tail @printf(%0, %45, %43) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, i32, i32) -> i32
+// MAPPING-NEXT:     %47 = "neura.data_mov"(%38) : (i64) -> i64
+// MAPPING-NEXT:     %48 = "neura.add"(%47) {rhs_value = 1 : i64} : (i64) -> i64
+// MAPPING-NEXT:     %49 = "neura.data_mov"(%48) : (i64) -> i64
+// MAPPING-NEXT:     %50 = "neura.icmp"(%49) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
+// MAPPING-NEXT:     %51 = "neura.data_mov"(%50) : (i1) -> i1
+// MAPPING-NEXT:     %52 = "neura.data_mov"(%48) : (i64) -> i64
+// MAPPING-NEXT:     neura.cond_br %51 : i1 then to ^bb4 else %52 : i64 to ^bb5
+// MAPPING-NEXT:   }
+// MAPPING-NEXT:   func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
+// MAPPING-NEXT:     %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i64, i1>
+// MAPPING-NEXT:     %2 = "neura.data_mov"(%0) {dfg_id = 3 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %3 = neura.phi_start %2, %1 {dfg_id = 5 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %4 = "neura.data_mov"(%3) {dfg_id = 9 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 11 : i32, lhs_value = "%arg0", mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %6 = "neura.data_mov"(%5) {dfg_id = 14 : i32, mapping_locs = [{id = 640 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %7 = "neura.load"(%6) {dfg_id = 16 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %8 = "neura.data_mov"(%7) {dfg_id = 19 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {dfg_id = 21 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %10 = "neura.data_mov"(%3) {dfg_id = 8 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 448 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 448 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 448 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %11 = "neura.data_mov"(%9) {dfg_id = 24 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %12 = neura.grant_predicate %10, %11 {dfg_id = 27 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 1 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %13 = "neura.data_mov"(%7) {dfg_id = 18 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 44 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 960 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 960 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %14 = "neura.data_mov"(%9) {dfg_id = 23 : i32, mapping_locs = [{id = 37 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 961 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 961 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %15 = neura.grant_predicate %13, %14 {dfg_id = 26 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 3 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %16 = "neura.data_mov"(%12) {dfg_id = 30 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 31 : i32, lhs_value = "%arg1", mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %18 = "neura.data_mov"(%17) {dfg_id = 33 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     %19 = "neura.load"(%18) {dfg_id = 34 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %20 = "neura.data_mov"(%19) {dfg_id = 35 : i32, mapping_locs = [{id = 449 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}, {id = 449 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %21 = "neura.data_mov"(%15) {dfg_id = 29 : i32, mapping_locs = [{id = 47 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 36 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %22 = "neura.add"(%20, %21) {dfg_id = 36 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %23 = "neura.data_mov"(%22) {dfg_id = 37 : i32, mapping_locs = [{id = 21 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
+// MAPPING-NEXT:     %24 = "neura.data_mov"(%17) {dfg_id = 32 : i32, mapping_locs = [{id = 21 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 384 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}, {id = 384 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}, {id = 384 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
+// MAPPING-NEXT:     "neura.store"(%23, %24) {dfg_id = 38 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
+// MAPPING-NEXT:     %25 = "neura.data_mov"(%3) {dfg_id = 7 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %26 = "neura.add"(%25) {dfg_id = 10 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %27 = "neura.data_mov"(%26) {dfg_id = 13 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {dfg_id = 15 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %29 = "neura.data_mov"(%28) {dfg_id = 17 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %30 = "neura.not"(%29) {dfg_id = 20 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %31 = "neura.data_mov"(%26) {dfg_id = 12 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 640 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 640 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
+// MAPPING-NEXT:     %32 = "neura.data_mov"(%30) {dfg_id = 22 : i32, mapping_locs = [{id = 641 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %33 = neura.grant_predicate %31, %32 {dfg_id = 25 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
+// MAPPING-NEXT:     neura.ctrl_mov %33 -> %1 {dfg_id = 28 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i64, i1> !neura.data<i64, i1>
+// MAPPING-NEXT:     %34 = "neura.grant_once"() <{constant_value = true}> {dfg_id = 2 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 3 : i32, y = 1 : i32}]} : () -> !neura.data<i1, i1>
+// MAPPING-NEXT:     %35 = "neura.data_mov"(%34) {dfg_id = 4 : i32, mapping_locs = [{id = 22 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
+// MAPPING-NEXT:     "neura.return"(%35) {dfg_id = 6 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 0 : i32}]} : (!neura.data<i1, i1>) -> ()
+// MAPPING-NEXT:   }
+// MAPPING-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
+// MAPPING-NEXT: }
\ No newline at end of file
diff --git a/test/neura/fusion/test.mlir b/test/neura/fusion/test.mlir
index f3b1dc32..68abb040 100644
--- a/test/neura/fusion/test.mlir
+++ b/test/neura/fusion/test.mlir
@@ -46,27 +46,27 @@
 // RUN:           --iter-merge-pattern="min-support=3 max-iter=4" %t-kernel.mlir \
 // RUN: | FileCheck %s --check-prefix=CHECK-ITER-MERGE-PATTERN
 
-// CHECK-ITER-MERGE-PATTERN:       %11:2 = "neura.fused_op"(%10) <{frequency = 4 : i64, pattern_id = 9 : i64, pattern_name = "grant_once->phi_start"}> ({
+// CHECK-ITER-MERGE-PATTERN:      %11:2 = "neura.fused_op"(%10) <{frequency = 4 : i64, pattern_id = 9 : i64, pattern_name = "grant_once->phi_start"}> ({
 // CHECK-ITER-MERGE-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<i64, i1>):
-// CHECK-ITER-MERGE-PATTERN-NEXT:      %61 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:      %62 = neura.phi_start %61, %arg5 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:      neura.yield %61, %62 : !neura.data<i64, i1>, !neura.data<i64, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %62 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %63 = neura.phi_start %62, %arg5 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      neura.yield %62, %63 : !neura.data<i64, i1>, !neura.data<i64, i1>
 // CHECK-ITER-MERGE-PATTERN-NEXT:    }) : (!neura.data<i64, i1>) -> (!neura.data<i64, i1>, !neura.data<i64, i1>)
 // CHECK-ITER-MERGE-PATTERN:       %15:3 = "neura.fused_op"(%11#0, %14, %4, %13) <{frequency = 3 : i64, pattern_id = 6 : i64, pattern_name = "phi_start->fused_op:phi_start->fused_op:gep->load"}> ({
-// CHECK-ITER-MERGE-PATTERN-NEXT:   ^bb0(%arg5: !neura.data<i64, i1>, %arg6: !neura.data<i64, i1>, %arg7: !neura.data<!llvm.ptr, i1>, %arg8: !neura.data<!llvm.ptr, i1>):
-// CHECK-ITER-MERGE-PATTERN-NEXT:     %61 = neura.phi_start %arg5, %arg6 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:     %62 = neura.phi_start %arg7, %arg8 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1> -> !neura.data<!llvm.ptr, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:     %63 = "neura.gep"(%62, %61) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:     %64 = "neura.load"(%63) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:     neura.yield %61, %62, %64 : !neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
-// CHECK-ITER-MERGE-PATTERN-NEXT:   }) : (!neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> (!neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
- // CHECK-ITER-MERGE-PATTERN:      %16:3 = "neura.fused_op"(%2, %12, %15#0) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({
- // CHECK-ITER-MERGE-PATTERN-NEXT:   ^bb0(%arg5: !neura.data<!llvm.ptr, i1>, %arg6: !neura.data<!llvm.ptr, i1>, %arg7: !neura.data<i64, i1>):
- // CHECK-ITER-MERGE-PATTERN-NEXT:     %61 = neura.phi_start %arg5, %arg6 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1> -> !neura.data<!llvm.ptr, i1>
- // CHECK-ITER-MERGE-PATTERN-NEXT:     %62 = "neura.gep"(%61, %arg7) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
- // CHECK-ITER-MERGE-PATTERN-NEXT:     %63 = "neura.load"(%62) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
- // CHECK-ITER-MERGE-PATTERN-NEXT:     neura.yield %61, %62, %63 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
- // CHECK-ITER-MERGE-PATTERN-NEXT:   }) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
+// CHECK-ITER-MERGE-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<i64, i1>, %arg6: !neura.data<i64, i1>, %arg7: !neura.data<!llvm.ptr, i1>, %arg8: !neura.data<!llvm.ptr, i1>):
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %62 = neura.phi_start %arg5, %arg6 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %63 = neura.phi_start %arg7, %arg8 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1> -> !neura.data<!llvm.ptr, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %64 = "neura.gep"(%63, %62) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %65 = "neura.load"(%64) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      neura.yield %62, %63, %65 : !neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:    }) : (!neura.data<i64, i1>, !neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>) -> (!neura.data<i64, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
+// CHECK-ITER-MERGE-PATTERN:      %16:3 = "neura.fused_op"(%2, %12, %15#0) <{frequency = 8 : i64, pattern_id = 10 : i64, pattern_name = "phi_start->fused_op:gep->load"}> ({
+// CHECK-ITER-MERGE-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<!llvm.ptr, i1>, %arg6: !neura.data<!llvm.ptr, i1>, %arg7: !neura.data<i64, i1>):
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %62 = neura.phi_start %arg5, %arg6 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1> -> !neura.data<!llvm.ptr, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %63 = "neura.gep"(%62, %arg7) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      %64 = "neura.load"(%63) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:      neura.yield %62, %63, %64 : !neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
+// CHECK-ITER-MERGE-PATTERN-NEXT:    }) : (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> (!neura.data<!llvm.ptr, i1>, !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
 
 // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \
 // RUN:           --assign-accelerator \
@@ -80,17 +80,17 @@
 // RUN:           --fold-constant \
 // RUN:           --init-pattern %t-kernel.mlir | FileCheck %s --check-prefix=CHECK-INIT-PATTERN
 
-// CHECK-INIT-PATTERN:         %21:2 = "neura.fused_op"(%16, %20) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({
+// CHECK-INIT-PATTERN:    %43:2 = "neura.fused_op"(%38, %30, %42) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({
+// CHECK-INIT-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<!llvm.ptr, i1>, %arg6: !neura.data<i64, i1>, %arg7: !neura.data<i64, i1>):
+// CHECK-INIT-PATTERN-NEXT:      %75 = "neura.gep"(%arg5, %arg6, %arg7) <{operandSegmentSizes = array<i32: 1, 2>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// CHECK-INIT-PATTERN-NEXT:      %76 = "neura.load"(%75) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// CHECK-INIT-PATTERN-NEXT:      neura.yield %75, %76 : !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
+// CHECK-INIT-PATTERN-NEXT:    }) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>, !neura.data<i64, i1>) -> (!neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
+// CHECK-INIT-PATTERN-NEXT:    %44 = "neura.fused_op"(%36, %42) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({
 // CHECK-INIT-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<!llvm.ptr, i1>, %arg6: !neura.data<i64, i1>):
-// CHECK-INIT-PATTERN-NEXT:      %74 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// CHECK-INIT-PATTERN-NEXT:      %75 = "neura.load"(%74) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// CHECK-INIT-PATTERN-NEXT:      neura.yield %74, %75 : !neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>
-// CHECK-INIT-PATTERN-NEXT:    }) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> (!neura.data<!llvm.ptr, i1>, !neura.data<i32, i1>)
-// CHECK-INIT-PATTERN-NEXT:    %22 = "neura.fused_op"(%18, %20) <{frequency = 6 : i64, pattern_id = 2 : i64, pattern_name = "gep->load"}> ({
-// CHECK-INIT-PATTERN-NEXT:    ^bb0(%arg5: !neura.data<!llvm.ptr, i1>, %arg6: !neura.data<i64, i1>):
-// CHECK-INIT-PATTERN-NEXT:      %74 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// CHECK-INIT-PATTERN-NEXT:      %75 = "neura.load"(%74) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// CHECK-INIT-PATTERN-NEXT:      neura.yield %75 : !neura.data<i32, i1>
+// CHECK-INIT-PATTERN-NEXT:      %75 = "neura.gep"(%arg5, %arg6) <{operandSegmentSizes = array<i32: 1, 1>}> : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
+// CHECK-INIT-PATTERN-NEXT:      %76 = "neura.load"(%75) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
+// CHECK-INIT-PATTERN-NEXT:      neura.yield %76 : !neura.data<i32, i1>
 // CHECK-INIT-PATTERN-NEXT:    }) : (!neura.data<!llvm.ptr, i1>, !neura.data<i64, i1>) -> !neura.data<i32, i1>
 
 // RUN: mlir-neura-opt --architecture-spec=%S/../../arch_spec/architecture.yaml --verify-each=true --mlir-print-ir-after-failure \
diff --git a/test/neura/steer_ctrl/loop_without_return_value.mlir b/test/neura/steer_ctrl/loop_without_return_value.mlir
index b20ceb53..1a2caf02 100644
--- a/test/neura/steer_ctrl/loop_without_return_value.mlir
+++ b/test/neura/steer_ctrl/loop_without_return_value.mlir
@@ -28,29 +28,33 @@ module attributes {} {
   }
 }
 
-// CHECK:      func.func @_Z11simple_loopPiS_(%arg0: memref<?xi32>, %arg1: memref<?xi32>) attributes {accelerator = "neura", dataflow_mode = "steering", llvm.linkage = #llvm.linkage<external>} {
-// CHECK-NEXT:   %0 = neura.reserve : i64
-// CHECK-NEXT:   %1 = neura.reserve : i1
-// CHECK-NEXT:   %2 = "neura.constant"() <{value = "%arg0"}> : () -> memref<?xi32>
-// CHECK-NEXT:   %3 = "neura.constant"() <{value = "%arg1"}> : () -> memref<?xi32>
-// CHECK-NEXT:   %4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
-// CHECK-NEXT:   %5 = "neura.constant"() <{value = 128 : i64}> : () -> i64
-// CHECK-NEXT:   %6 = "neura.constant"() <{value = 1 : i32}> : () -> i32
-// CHECK-NEXT:   %7 = "neura.constant"() <{value = 2 : i32}> : () -> i32
-// CHECK-NEXT:   %8 = "neura.constant"() <{value = 0 : i64}> : () -> i64
-// CHECK-NEXT:   %9 = neura.invariant %4, %1 : i64, i1 -> i64
-// CHECK-NEXT:   %10 = neura.invariant %3, %1 : memref<?xi32>, i1 -> memref<?xi32>
-// CHECK-NEXT:   %11 = neura.invariant %6, %1 : i32, i1 -> i32
-// CHECK-NEXT:   %12 = neura.invariant %7, %1 : i32, i1 -> i32
-// CHECK-NEXT:   %13 = neura.invariant %2, %1 : memref<?xi32>, i1 -> memref<?xi32>
-// CHECK-NEXT:   %14 = neura.invariant %5, %1 : i64, i1 -> i64
-// CHECK-NEXT:   %15 = neura.carry %8, %1, %0 : i64, i1, i64 -> i64
-// CHECK-NEXT:   %16 = "neura.icmp"(%15, %14) <{cmpType = "slt"}> : (i64, i64) -> i1
-// CHECK-NEXT:   neura.ctrl_mov %16 -> %1 : i1 i1
-// CHECK-NEXT:   %17 = neura.load_indexed %13[%15 : i64] memref<?xi32> : i32
-// CHECK-NEXT:   %18 = "neura.mul"(%17, %12) : (i32, i32) -> i32
-// CHECK-NEXT:   %19 = "neura.add"(%18, %11) : (i32, i32) -> i32
-// CHECK-NEXT:   neura.store_indexed %19 to %10[%15 : i64] memref<?xi32> : i32
-// CHECK-NEXT:   %20 = "neura.add"(%15, %9) : (i64, i64) -> i64
-// CHECK-NEXT:   neura.ctrl_mov %20 -> %0 : i64 i64
-// CHECK-NEXT:   "neura.return"() : () -> ()
+// CHECK:      module {
+// CHECK-NEXT:   func.func @_Z11simple_loopPiS_(%arg0: memref<?xi32>, %arg1: memref<?xi32>) attributes {accelerator = "neura", dataflow_mode = "steering", llvm.linkage = #llvm.linkage<external>} {
+// CHECK-NEXT:     %0 = neura.reserve : i64
+// CHECK-NEXT:     %1 = neura.reserve : i1
+// CHECK-NEXT:     %2 = "neura.constant"() <{value = "%arg0"}> : () -> memref<?xi32>
+// CHECK-NEXT:     %3 = "neura.constant"() <{value = "%arg1"}> : () -> memref<?xi32>
+// CHECK-NEXT:     %4 = "neura.constant"() <{value = 1 : i64}> : () -> i64
+// CHECK-NEXT:     %5 = "neura.constant"() <{value = 128 : i64}> : () -> i64
+// CHECK-NEXT:     %6 = "neura.constant"() <{value = 1 : i32}> : () -> i32
+// CHECK-NEXT:     %7 = "neura.constant"() <{value = 2 : i32}> : () -> i32
+// CHECK-NEXT:     %8 = "neura.constant"() <{value = 0 : i64}> : () -> i64
+// CHECK-NEXT:     %9 = neura.invariant %4, %1 : i64, i1 -> i64
+// CHECK-NEXT:     %10 = neura.invariant %3, %1 : memref<?xi32>, i1 -> memref<?xi32>
+// CHECK-NEXT:     %11 = neura.invariant %6, %1 : i32, i1 -> i32
+// CHECK-NEXT:     %12 = neura.invariant %7, %1 : i32, i1 -> i32
+// CHECK-NEXT:     %13 = neura.invariant %2, %1 : memref<?xi32>, i1 -> memref<?xi32>
+// CHECK-NEXT:     %14 = neura.invariant %5, %1 : i64, i1 -> i64
+// CHECK-NEXT:     %15 = neura.carry %8, %1, %0 : i64, i1, i64 -> i64
+// CHECK-NEXT:     %16 = "neura.icmp"(%15, %14) <{cmpType = "slt"}> : (i64, i64) -> i1
+// CHECK-NEXT:     neura.ctrl_mov %16 -> %1 : i1 i1
+// CHECK-NEXT:     %17 = neura.load_indexed %13[%15 : i64] memref<?xi32> : i32
+// CHECK-NEXT:     %18 = "neura.mul"(%17, %12) : (i32, i32) -> i32
+// CHECK-NEXT:     %19 = "neura.add"(%18, %11) : (i32, i32) -> i32
+// CHECK-NEXT:     neura.store_indexed %19 to %10[%15 : i64] memref<?xi32> : i32
+// CHECK-NEXT:     %20 = "neura.add"(%15, %9) : (i64, i64) -> i64
+// CHECK-NEXT:     neura.ctrl_mov %20 -> %0 : i64 i64
+// CHECK-NEXT:     %21 = "neura.constant"() <{value = true}> : () -> i1
+// CHECK-NEXT:     "neura.return"(%21) : (i1) -> ()
+// CHECK-NEXT:   }
+// CHECK-NEXT: }

From fb09c7cf099a8586afcf5e9231f7fcb357d36f05 Mon Sep 17 00:00:00 2001
From: Shiran Guo <sguoau@connect.ust.hk>
Date: Sun, 28 Dec 2025 12:05:46 +0800
Subject: [PATCH 2/2] Fix ReturnOp exit predicate logic and clean up Neura test
 outputs (addresses issue #209)

---
 .../TransformCtrlToDataFlowPass.cpp           |   6 +-
 test/neura/for_loop/relu_test.mlir            | 376 +-----------------
 2 files changed, 4 insertions(+), 378 deletions(-)

diff --git a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
index 55f2db67..2289e981 100644
--- a/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
+++ b/lib/NeuraDialect/Transforms/TransformCtrlToDataFlowPass.cpp
@@ -608,10 +608,10 @@ void injectExitPredicateForReturn(Region &region, ControlFlowInfo &ctrl_info,
                                    OpBuilder &builder) {
   Block *entry_block = &region.front();
   
-  // Find the ReturnOp
+  // Finds the ReturnOp.
   neura::ReturnOp return_op = nullptr;
   for (Operation &op : *entry_block) {
-    if (auto rt =dyn_cast<neura::ReturnOp>(op)) {
+    if (auto rt = dyn_cast<neura::ReturnOp>(op)) {
       return_op = rt;
       llvm::errs() << "[ctrl2data] ReturnOp found: " << *rt << "\n";
       break;
@@ -753,7 +753,7 @@ struct TransformCtrlToDataFlowPass
       buildControlFlowInfo(*region, ctrlInfo, domInfo);
       transformControlFlowToDataFlow(*region, ctrlInfo, domInfo, builder);
 
-      // Inject exit predicate for void returns
+      // Injects exit predicate for void returns.
       injectExitPredicateForReturn(*region, ctrlInfo, builder);
 
       // Converts phi operations to phi_start operations.
diff --git a/test/neura/for_loop/relu_test.mlir b/test/neura/for_loop/relu_test.mlir
index 2c00cb6a..3fc3c520 100644
--- a/test/neura/for_loop/relu_test.mlir
+++ b/test/neura/for_loop/relu_test.mlir
@@ -220,378 +220,4 @@
 // CTRL2DATA-NEXT: }
 
 
-// MAPPING:      [DEBUG] Recurrence cycle (length 3):
-// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
-// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT: [DEBUG] Recurrence cycle (length 5):
-// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
-// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Longest recurrence cycle (length 5):
-// MAPPING-NEXT: %1 = neura.reserve : !neura.data<i64, i1>
-// MAPPING-NEXT: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %1 = neura.reserve : !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] Topologically sorted op: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 0: 3 ops
-// MAPPING-NEXT:   %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %1 = neura.reserve : !neura.data<i64, i1>
-// MAPPING-NEXT:   %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 1: 3 ops
-// MAPPING-NEXT:   %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 2: 5 ops
-// MAPPING-NEXT:   %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 3: 4 ops
-// MAPPING-NEXT:   %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 4: 5 ops
-// MAPPING-NEXT:   %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 5: 4 ops
-// MAPPING-NEXT:   %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:   neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT:   %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 6: 4 ops
-// MAPPING-NEXT:   %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 7: 4 ops
-// MAPPING-NEXT:   %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 8: 5 ops
-// MAPPING-NEXT:   %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:   %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:   %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:   %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP Bucket Level 9: 2 ops
-// MAPPING-NEXT:   "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
-// MAPPING-NEXT:   "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1> (ALAP level: 0)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %1 = neura.reserve : !neura.data<i64, i1> (ALAP level: 0)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %2 = "neura.data_mov"(%0) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 0)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1> (ALAP level: 1)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %25 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 1)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %4 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 1)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 2)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %31 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %27 = "neura.data_mov"(%26) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 2)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %6 = "neura.data_mov"(%5) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 2)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1> (ALAP level: 3)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (ALAP level: 3)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %29 = "neura.data_mov"(%28) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 3)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %8 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 3)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %10 = "neura.data_mov"(%3) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 4)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %32 = "neura.data_mov"(%30) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %11 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 4)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (ALAP level: 5)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (ALAP level: 5)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: neura.ctrl_mov %33 -> %1 : !neura.data<i64, i1> !neura.data<i64, i1> (ALAP level: 5)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %16 = "neura.data_mov"(%12) : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (ALAP level: 5)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %13 = "neura.data_mov"(%7) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 6)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %14 = "neura.data_mov"(%9) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 6)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 6)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %18 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 6)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1> (ALAP level: 7)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %21 = "neura.data_mov"(%15) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %20 = "neura.data_mov"(%19) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 7)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1> (ALAP level: 8)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %35 = "neura.data_mov"(%34) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (ALAP level: 8)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %24 = "neura.data_mov"(%17) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1> (ALAP level: 8)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 8)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: %23 = "neura.data_mov"(%22) : (!neura.data<i32, i1>) -> !neura.data<i32, i1> (ALAP level: 8)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: "neura.return"(%35) : (!neura.data<i1, i1>) -> () (ALAP level: 9)
-// MAPPING-NEXT: [MapToAcceleratorPass] ALAP sorted op: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> () (ALAP level: 9)
-// MAPPING-NEXT: ---------------------------------------------------------
-// MAPPING-NEXT: [HeuristicMapping] Starting mapping with 39 operations.
-// MAPPING-NEXT: Configuration: MAX Backtrack Depth = 3, MAX Candidate Locations = 5
-// MAPPING-NEXT: [HeuristicMapping] Filtered 22 non-materialized operations, 17 operations require physical mapping.
-// MAPPING-NEXT: [HeuristicMapping] Materialized operations list:
-// MAPPING-NEXT: 0 %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1> (level: 0)
-// MAPPING-NEXT: 1 %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1> (level: 1)
-// MAPPING-NEXT: 2 %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1> (level: 2)
-// MAPPING-NEXT: 3 %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (level: 2)
-// MAPPING-NEXT: 4 %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1> (level: 3)
-// MAPPING-NEXT: 5 %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (level: 3)
-// MAPPING-NEXT: 6 %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1> (level: 4)
-// MAPPING-NEXT: 7 %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1> (level: 4)
-// MAPPING-NEXT: 8 %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (level: 5)
-// MAPPING-NEXT: 9 %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1> (level: 5)
-// MAPPING-NEXT: 10 %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1> (level: 6)
-// MAPPING-NEXT: 11 %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1> (level: 7)
-// MAPPING-NEXT: 12 %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1> (level: 7)
-// MAPPING-NEXT: 13 %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1> (level: 8)
-// MAPPING-NEXT: 14 %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1> (level: 8)
-// MAPPING-NEXT: 15 "neura.return"(%35) : (!neura.data<i1, i1>) -> () (level: 9)
-// MAPPING-NEXT: 16 "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> () (level: 9)
-// MAPPING-NEXT: [HeuristicMapping] Found 80 candidate locations for operation: %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=0
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %0 = "neura.grant_once"() <{constant_value = 0 : i64}> : () -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 54 candidate locations for operation: %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=1
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=0 to Tile#11 @t=1
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %3 = neura.phi_start %2, %1 : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 53 candidate locations for operation: %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=2
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#11 @t=2
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %26 = "neura.add"(%25) {rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 52 candidate locations for operation: %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=2
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#10 @t=2
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg0"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 51 candidate locations for operation: %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=3
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=2 to Tile#11 @t=3
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #704
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 58 candidate locations for operation: %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=3
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=2 to Tile#10 @t=3
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #640
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %7 = "neura.load"(%6) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 49 candidate locations for operation: %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#10 @t=4
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=3 to Tile#10 @t=4
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %30 = "neura.not"(%29) : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 56 candidate locations for operation: %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#11 @t=4
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=3 to Tile#11 @t=4
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 1 candidate locations for operation: %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/1 at tile#10 @t=5
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=2 to Tile#10 @t=5
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=4 to Tile#10 @t=5
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #641
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=5 to Tile#11 @t=6
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %33 = neura.grant_predicate %31, %32 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 46 candidate locations for operation: %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=5
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=1 to Tile#7 @t=5
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=4 to Tile#7 @t=5
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %12 = neura.grant_predicate %10, %11 : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 45 candidate locations for operation: %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=6
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=5 to Tile#7 @t=6
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #448
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {lhs_value = "%arg1"} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 64 candidate locations for operation: %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#15 @t=7
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#10 @t=3 to Tile#15 @t=7
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#11 @t=4 to Tile#15 @t=7
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %15 = neura.grant_predicate %13, %14 : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 45 candidate locations for operation: %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=7
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=6 to Tile#7 @t=7
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #448
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %19 = "neura.load"(%18) : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 67 candidate locations for operation: %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=8
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %34 = "neura.grant_once"() <{constant_value = true}> : () -> !neura.data<i1, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 28 candidate locations for operation: %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#7 @t=9
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=7 to Tile#7 @t=9
-// MAPPING-NEXT: [tryRouteDataMove] Successfully routed on same tile using Register #449
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#15 @t=7 to Tile#7 @t=9
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation %22 = "neura.add"(%20, %21) : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT: [HeuristicMapping] Found 40 candidate locations for operation: "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#3 @t=9
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=8 to Tile#3 @t=9
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation "neura.return"(%35) : (!neura.data<i1, i1>) -> ()
-// MAPPING-NEXT: [HeuristicMapping] Found 40 candidate locations for operation: "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT: [HeuristicMapping] Trying candidate 1/5 at tile#6 @t=10
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=9 to Tile#6 @t=10
-// MAPPING-NEXT: [tryRouteDataMove] Routing from Tile#7 @t=6 to Tile#6 @t=10
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped operation "neura.store"(%23, %24) : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT: [HeuristicMapping] Successfully mapped all 17 operations.
-// MAPPING-NEXT: module attributes {{.*}}
-// MAPPING-NEXT:   llvm.mlir.global external local_unnamed_addr @input(dense<[1, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17, 18, -19, 20, -21, 22, -23, 24, -25, 26, -27, 28, -29, 30, -31]> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
-// MAPPING-NEXT:   llvm.mlir.global external local_unnamed_addr @output(dense<0> : tensor<32xi32>) {addr_space = 0 : i32, alignment = 16 : i64, dso_local} : !llvm.array<32 x i32>
-// MAPPING-NEXT:   llvm.mlir.global private unnamed_addr constant @".str"("output[%d] = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local}
-// MAPPING-NEXT:   llvm.func local_unnamed_addr @main() -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
-// MAPPING-NEXT:     %0 = llvm.mlir.addressof @".str" : !llvm.ptr
-// MAPPING-NEXT:     %1 = llvm.mlir.addressof @input : !llvm.ptr
-// MAPPING-NEXT:     %2 = llvm.mlir.addressof @output : !llvm.ptr
-// MAPPING-NEXT:     %3 = "neura.constant"() <{value = 0 : i8}> : () -> i8
-// MAPPING-NEXT:     %4 = "neura.constant"() <{value = 128 : i64}> : () -> i64
-// MAPPING-NEXT:     %5 = "neura.constant"() <{value = 0 : i64}> : () -> i64
-// MAPPING-NEXT:     %6 = "neura.constant"() <{value = 0 : i32}> : () -> i32
-// MAPPING-NEXT:     %7 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %8 = "neura.data_mov"(%3) : (i8) -> i8
-// MAPPING-NEXT:     %9 = "neura.data_mov"(%4) : (i64) -> i64
-// MAPPING-NEXT:     "neura.memset"(%7, %8, %9) <{is_volatile = false}> : (!llvm.ptr, i8, i64) -> ()
-// MAPPING-NEXT:     %10 = "neura.data_mov"(%5) : (i64) -> i64
-// MAPPING-NEXT:     neura.br %10 : i64 to ^bb1
-// MAPPING-NEXT:   ^bb1(%11: i64):  // 2 preds: ^bb0, ^bb3
-// MAPPING-NEXT:     %12 = "neura.data_mov"(%1) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %13 = "neura.data_mov"(%11) : (i64) -> i64
-// MAPPING-NEXT:     %14 = "neura.gep"(%12, %13) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
-// MAPPING-NEXT:     %15 = "neura.data_mov"(%14) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %16 = "neura.load"(%15) : (!llvm.ptr) -> i32
-// MAPPING-NEXT:     %17 = "neura.data_mov"(%16) : (i32) -> i32
-// MAPPING-NEXT:     %18 = "neura.icmp"(%17) <{cmpType = "sgt"}> {rhs_value = 0 : i32} : (i32) -> i1
-// MAPPING-NEXT:     %19 = "neura.data_mov"(%18) : (i1) -> i1
-// MAPPING-NEXT:     neura.cond_br %19 : i1 then to ^bb2 else to ^bb3
-// MAPPING-NEXT:   ^bb2:  // pred: ^bb1
-// MAPPING-NEXT:     %20 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %21 = "neura.data_mov"(%11) : (i64) -> i64
-// MAPPING-NEXT:     %22 = "neura.gep"(%20, %21) <{operandSegmentSizes = array<i32: 1, 1>}> : (!llvm.ptr, i64) -> !llvm.ptr
-// MAPPING-NEXT:     %23 = "neura.data_mov"(%22) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %24 = "neura.load"(%23) : (!llvm.ptr) -> i32
-// MAPPING-NEXT:     %25 = "neura.data_mov"(%24) : (i32) -> i32
-// MAPPING-NEXT:     %26 = "neura.data_mov"(%16) : (i32) -> i32
-// MAPPING-NEXT:     %27 = "neura.add"(%25, %26) : (i32, i32) -> i32
-// MAPPING-NEXT:     %28 = "neura.data_mov"(%27) : (i32) -> i32
-// MAPPING-NEXT:     %29 = "neura.data_mov"(%22) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     "neura.store"(%28, %29) : (i32, !llvm.ptr) -> ()
-// MAPPING-NEXT:     neura.br to ^bb3
-// MAPPING-NEXT:   ^bb3:  // 2 preds: ^bb1, ^bb2
-// MAPPING-NEXT:     %30 = "neura.data_mov"(%11) : (i64) -> i64
-// MAPPING-NEXT:     %31 = "neura.add"(%30) {rhs_value = 1 : i64} : (i64) -> i64
-// MAPPING-NEXT:     %32 = "neura.data_mov"(%31) : (i64) -> i64
-// MAPPING-NEXT:     %33 = "neura.icmp"(%32) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
-// MAPPING-NEXT:     %34 = "neura.data_mov"(%33) : (i1) -> i1
-// MAPPING-NEXT:     %35 = "neura.data_mov"(%5) : (i64) -> i64
-// MAPPING-NEXT:     %36 = "neura.data_mov"(%31) : (i64) -> i64
-// MAPPING-NEXT:     neura.cond_br %34 : i1 then %35 : i64 to ^bb5 else %36 : i64 to ^bb1
-// MAPPING-NEXT:   ^bb4:  // pred: ^bb5
-// MAPPING-NEXT:     %37 = "neura.data_mov"(%6) : (i32) -> i32
-// MAPPING-NEXT:     "neura.return"(%37) : (i32) -> ()
-// MAPPING-NEXT:   ^bb5(%38: i64):  // 2 preds: ^bb3, ^bb5
-// MAPPING-NEXT:     %39 = "neura.data_mov"(%2) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %40 = "neura.data_mov"(%38) : (i64) -> i64
-// MAPPING-NEXT:     %41 = "neura.gep"(%39, %40) <{operandSegmentSizes = array<i32: 1, 1>}> {operand_1_value = 0 : i32} : (!llvm.ptr, i64) -> !llvm.ptr
-// MAPPING-NEXT:     %42 = "neura.data_mov"(%41) : (!llvm.ptr) -> !llvm.ptr
-// MAPPING-NEXT:     %43 = "neura.load"(%42) : (!llvm.ptr) -> i32
-// MAPPING-NEXT:     %44 = "neura.data_mov"(%38) : (i64) -> i64
-// MAPPING-NEXT:     %45 = "neura.cast"(%44) <{cast_type = "trunc"}> : (i64) -> i32
-// MAPPING-NEXT:     %46 = llvm.call tail @printf(%0, %45, %43) vararg(!llvm.func<i32 (ptr, ...)>) {no_unwind} : (!llvm.ptr, i32, i32) -> i32
-// MAPPING-NEXT:     %47 = "neura.data_mov"(%38) : (i64) -> i64
-// MAPPING-NEXT:     %48 = "neura.add"(%47) {rhs_value = 1 : i64} : (i64) -> i64
-// MAPPING-NEXT:     %49 = "neura.data_mov"(%48) : (i64) -> i64
-// MAPPING-NEXT:     %50 = "neura.icmp"(%49) <{cmpType = "eq"}> {rhs_value = 32 : i64} : (i64) -> i1
-// MAPPING-NEXT:     %51 = "neura.data_mov"(%50) : (i1) -> i1
-// MAPPING-NEXT:     %52 = "neura.data_mov"(%48) : (i64) -> i64
-// MAPPING-NEXT:     neura.cond_br %51 : i1 then to ^bb4 else %52 : i64 to ^bb5
-// MAPPING-NEXT:   }
-// MAPPING-NEXT:   func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
-// MAPPING-NEXT:     %0 = "neura.grant_once"() <{constant_value = 0 : i64}> {dfg_id = 0 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 0 : i32, x = 3 : i32, y = 2 : i32}]} : () -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %1 = neura.reserve {dfg_id = 1 : i32} : !neura.data<i64, i1>
-// MAPPING-NEXT:     %2 = "neura.data_mov"(%0) {dfg_id = 3 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 0 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 0 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %3 = neura.phi_start %2, %1 {dfg_id = 5 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 1 : i32, x = 3 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i64, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %4 = "neura.data_mov"(%3) {dfg_id = 9 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %5 = "neura.gep"(%4) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 11 : i32, lhs_value = "%arg0", mapping_locs = [{id = 10 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:     %6 = "neura.data_mov"(%5) {dfg_id = 14 : i32, mapping_locs = [{id = 640 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:     %7 = "neura.load"(%6) {dfg_id = 16 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %8 = "neura.data_mov"(%7) {dfg_id = 19 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %9 = "neura.icmp"(%8) <{cmpType = "sgt"}> {dfg_id = 21 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 0 : i32} : (!neura.data<i32, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %10 = "neura.data_mov"(%3) {dfg_id = 8 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 1 : i32}, {id = 448 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}, {id = 448 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 448 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %11 = "neura.data_mov"(%9) {dfg_id = 24 : i32, mapping_locs = [{id = 36 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %12 = neura.grant_predicate %10, %11 {dfg_id = 27 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 3 : i32, y = 1 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %13 = "neura.data_mov"(%7) {dfg_id = 18 : i32, mapping_locs = [{id = 34 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}, {id = 44 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 960 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}, {id = 960 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %14 = "neura.data_mov"(%9) {dfg_id = 23 : i32, mapping_locs = [{id = 37 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 4 : i32}, {id = 961 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 5 : i32}, {id = 961 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %15 = neura.grant_predicate %13, %14 {dfg_id = 26 : i32, mapping_locs = [{id = 15 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 3 : i32}]} : !neura.data<i32, i1>, !neura.data<i1, i1> -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %16 = "neura.data_mov"(%12) {dfg_id = 30 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 5 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %17 = "neura.gep"(%16) <{operandSegmentSizes = array<i32: 0, 1>}> {dfg_id = 31 : i32, lhs_value = "%arg1", mapping_locs = [{id = 7 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 6 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:     %18 = "neura.data_mov"(%17) {dfg_id = 33 : i32, mapping_locs = [{id = 448 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 6 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:     %19 = "neura.load"(%18) {dfg_id = 34 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 7 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %20 = "neura.data_mov"(%19) {dfg_id = 35 : i32, mapping_locs = [{id = 449 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 7 : i32}, {id = 449 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %21 = "neura.data_mov"(%15) {dfg_id = 29 : i32, mapping_locs = [{id = 47 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 7 : i32}, {id = 36 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %22 = "neura.add"(%20, %21) {dfg_id = 36 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %23 = "neura.data_mov"(%22) {dfg_id = 37 : i32, mapping_locs = [{id = 21 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 9 : i32}]} : (!neura.data<i32, i1>) -> !neura.data<i32, i1>
-// MAPPING-NEXT:     %24 = "neura.data_mov"(%17) {dfg_id = 32 : i32, mapping_locs = [{id = 21 : i32, index_per_ii = 1 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 6 : i32}, {id = 384 : i32, index_per_ii = 2 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 7 : i32}, {id = 384 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 8 : i32}, {id = 384 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 9 : i32}]} : (!neura.data<!llvm.ptr, i1>) -> !neura.data<!llvm.ptr, i1>
-// MAPPING-NEXT:     "neura.store"(%23, %24) {dfg_id = 38 : i32, mapping_locs = [{id = 6 : i32, index_per_ii = 0 : i32, invalid_iterations = 2 : i32, resource = "tile", time_step = 10 : i32, x = 2 : i32, y = 1 : i32}]} : (!neura.data<i32, i1>, !neura.data<!llvm.ptr, i1>) -> ()
-// MAPPING-NEXT:     %25 = "neura.data_mov"(%3) {dfg_id = 7 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 1 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 1 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %26 = "neura.add"(%25) {dfg_id = 10 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 2 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 1 : i64} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %27 = "neura.data_mov"(%26) {dfg_id = 13 : i32, mapping_locs = [{id = 704 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 2 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %28 = "neura.icmp"(%27) <{cmpType = "eq"}> {dfg_id = 15 : i32, mapping_locs = [{id = 11 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 3 : i32, x = 3 : i32, y = 2 : i32}], rhs_value = 32 : i64} : (!neura.data<i64, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %29 = "neura.data_mov"(%28) {dfg_id = 17 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 3 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %30 = "neura.not"(%29) {dfg_id = 20 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, resource = "tile", time_step = 4 : i32, x = 2 : i32, y = 2 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %31 = "neura.data_mov"(%26) {dfg_id = 12 : i32, mapping_locs = [{id = 35 : i32, index_per_ii = 2 : i32, invalid_iterations = 0 : i32, resource = "link", time_step = 2 : i32}, {id = 640 : i32, index_per_ii = 3 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 3 : i32}, {id = 640 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 0 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i64, i1>) -> !neura.data<i64, i1>
-// MAPPING-NEXT:     %32 = "neura.data_mov"(%30) {dfg_id = 22 : i32, mapping_locs = [{id = 641 : i32, index_per_ii = 4 : i32, invalid_iterations = 0 : i32, per_tile_register_id = 1 : i32, resource = "register", time_step = 4 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %33 = neura.grant_predicate %31, %32 {dfg_id = 25 : i32, mapping_locs = [{id = 10 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 5 : i32, x = 2 : i32, y = 2 : i32}]} : !neura.data<i64, i1>, !neura.data<i1, i1> -> !neura.data<i64, i1>
-// MAPPING-NEXT:     neura.ctrl_mov %33 -> %1 {dfg_id = 28 : i32, mapping_locs = [{id = 32 : i32, index_per_ii = 0 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 5 : i32}]} : !neura.data<i64, i1> !neura.data<i64, i1>
-// MAPPING-NEXT:     %34 = "neura.grant_once"() <{constant_value = true}> {dfg_id = 2 : i32, mapping_locs = [{id = 7 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 8 : i32, x = 3 : i32, y = 1 : i32}]} : () -> !neura.data<i1, i1>
-// MAPPING-NEXT:     %35 = "neura.data_mov"(%34) {dfg_id = 4 : i32, mapping_locs = [{id = 22 : i32, index_per_ii = 3 : i32, invalid_iterations = 1 : i32, resource = "link", time_step = 8 : i32}]} : (!neura.data<i1, i1>) -> !neura.data<i1, i1>
-// MAPPING-NEXT:     "neura.return"(%35) {dfg_id = 6 : i32, mapping_locs = [{id = 3 : i32, index_per_ii = 4 : i32, invalid_iterations = 1 : i32, resource = "tile", time_step = 9 : i32, x = 3 : i32, y = 0 : i32}]} : (!neura.data<i1, i1>) -> ()
-// MAPPING-NEXT:   }
-// MAPPING-NEXT:   llvm.func local_unnamed_addr @printf(!llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, ...) -> (i32 {llvm.noundef}) attributes {no_unwind, passthrough = ["nofree", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"}
-// MAPPING-NEXT: }
\ No newline at end of file
+// MAPPING: func.func @_Z6kernelPiS_(%arg0: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg1: !llvm.ptr {llvm.nocapture, llvm.noundef}) -> !llvm.void attributes {CConv = #llvm.cconv<ccc>, accelerator = "neura", dataflow_mode = "predicate", linkage = #llvm.linkage<external>, mapping_info = {compiled_ii = 5 : i32, mapping_mode = "spatial-temporal", mapping_strategy = "heuristic", rec_mii = 5 : i32, res_mii = 2 : i32, x_tiles = 4 : i32, y_tiles = 4 : i32}, memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic", unnamed_addr = 1 : i64, visibility_ = 0 : i64} {
\ No newline at end of file