From fe46cdbf016f83c6612d2615d36815ac6d81dd85 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Thu, 12 Jun 2025 21:17:06 +0800
Subject: [PATCH 01/13] add integrated operations for memory access and loop
 control

---
 CMakeLists.txt                           |  8 +--
 include/NeuraDialect/NeuraOps.td         | 75 ++++++++++++++++++++++++
 test/affine2neura/gpt2-node27/compile.sh |  3 +
 test/affine2neura/gpt2-node27/node27.cpp | 14 +++++
 test/affine2neura/gpt2-node30/compile.sh |  3 +
 test/affine2neura/gpt2-node30/node30.cpp | 15 +++++
 6 files changed, 114 insertions(+), 4 deletions(-)
 create mode 100755 test/affine2neura/gpt2-node27/compile.sh
 create mode 100644 test/affine2neura/gpt2-node27/node27.cpp
 create mode 100755 test/affine2neura/gpt2-node30/compile.sh
 create mode 100644 test/affine2neura/gpt2-node30/node30.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a83b5b7..adaf6c9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 
 add_compile_options(-g)
 
-# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
-# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
-# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
-# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
+set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
+set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
+set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
+set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
 
 message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 01e54159..0b191516 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -101,6 +101,34 @@ def Neura_StoreOp : Op {
   // let assemblyFormat = "$value `,` $addr `,` $predicate attr-dict";
 }
 
+// Defines a load operation with integrated address calculation.
+def Neura_LoadIndexedOp : Op {
+  let summary = "Load with integrated address calculation for multi-dimensional arrays";
+  let description = [{
+    Calculates the address from the base address and indices, then loads the
+    value at the calculated address.
+    Example:
+      %value = neura.load_indexed memref<?x?xf32> %base [%arg1, %arg2] : f32
+  }];
+  let arguments = (ins Arg:$base, Variadic:$indices, Optional:$predicate);
+  let results = (outs AnyType:$result);
+  let assemblyFormat = "type($base) $base `[` $indices `]` ($predicate^ `:` type($predicate))? attr-dict `:` type($result)";
+}
+
+// Defines a store operation with integrated address calculation.
+def Neura_StoreIndexedOp : Op {
+  let summary = "Store with integrated address calculation for multi-dimensional arrays";
+  let description = [{
+    Calculates the address from the base address and indices, then stores the
+    value at the calculated address.
+    Example:
+      neura.store_indexed %value to memref<?x?xf32> %base [%arg1, %arg2] : f32
+  }];
+  let arguments = (ins AnyType:$value, Arg:$base, Variadic:$indices, Optional:$predicate);
+  let results = (outs);
+  let assemblyFormat = "$value `to` type($base) $base `[` $indices `]` ($predicate^ `:` type($predicate))? attr-dict `:` type($value)";
+}
+
 // Defines a pointer computation operation.
 def Neura_GEP : Op {
   let summary = "Pointer computation using offset indices";
@@ -253,3 +281,50 @@ def Neura_ReserveOp : Op {
   let results = (outs AnyType:$result);
   let assemblyFormat = "attr-dict `:` type($result)";
 }
+
+// ----------------------------------------------------
+// Defines loop-related operations.
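+
+// For illustration, the two memory ops above compose as follows once the
+// lowering is in place (a sketch matching the assembly formats declared
+// above; %src/%dst are memref values and %i/%j are loop indices):
+//   %v = neura.load_indexed memref<4x16xf32> %src [%i, %j] : f32
+//   neura.store_indexed %v to memref<4x16xf32> %dst [%i, %j] : f32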
+
+// Loop iteration operation for index increment and comparison.
+def Neura_LoopIterOp : Op {
+  let summary = "CGRA-optimized loop iteration operation";
+  let description = [{
+    Takes the current loop index, a step value, and an upper bound as inputs.
+    Outputs the next loop index and a boolean condition indicating whether
+    the loop should continue.
+
+    Example:
+      %next_index, %continue = neura.loop_iter current_index: 0, step: 1, bound: 10 : index i1}];
+
+  let arguments = (ins Index:$current_index,
+                       Index:$step,
+                       Index:$bound,
+                       Optional:$loop_type, // 0: <, 1: <=, 2: >, 3: >=
+                       Optional:$predicate);
+  let results = (outs Index:$next_index, I1:$continue_condition);
+  let assemblyFormat = "`current_index` `:` $current_index `,` `step` `:` $step `,` `bound` `:` $bound `:` type($bound) ($loop_type^ `:` type($loop_type))? ($predicate^ `:` type($predicate))? attr-dict `:` type($next_index) type($continue_condition)";
+}
+
+// Loop control operation that integrates loop iteration and control flow.
+def Neura_LoopControlOp : Op {
+  let summary = "Integrated loop control operation for simple loops";
+  let description = [{
+    This operation is an integrated loop control operation that combines
+    loop iteration and control flow. It performs four main actions:
+    1. Calculates the next iteration's index: `next_index = current_index + step`
+    2. Checks whether the loop should continue based on the current index and bound.
+    3. If the loop should continue, branches to the loop body and yields the related values.
+    4. Otherwise, exits the loop.
+  }];
+  let arguments = (ins Index:$current_index, // Current loop index
+                       Index:$step,
+                       Index:$bound,
+                       DefaultValuedAttr:$loop_type, // Loop type: "lt", "le", "gt", "ge", "eq", "ne"
+                       Variadic:$passthrough_args // Additional arguments to pass through to the successors
+  );
+  let results = (outs);
+  let successors = (successor
+    AnySuccessor:$body, // loop body successor
+    AnySuccessor:$exit  // exit successor
+  );
+  let assemblyFormat = "`current_index` `:` $current_index `,` `step` `:` $step `,` `bound` `:` $bound `,` `loop_type` `:` $loop_type (`passthrough` `(` $passthrough_args^ `:` type($passthrough_args) `)`)? `then` $body `else` $exit attr-dict";
+}
\ No newline at end of file
diff --git a/test/affine2neura/gpt2-node27/compile.sh b/test/affine2neura/gpt2-node27/compile.sh
new file mode 100755
index 00000000..e1c6c965
--- /dev/null
+++ b/test/affine2neura/gpt2-node27/compile.sh
@@ -0,0 +1,3 @@
+/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./node27.cpp -S --raise-scf-to-affine -o ./node27.mlir
+/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27.mlir --affine-loop-unroll="unroll-factor=2" -o ./node27_unroll.mlir
+# /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node27_tile.mlir
\ No newline at end of file
diff --git a/test/affine2neura/gpt2-node27/node27.cpp b/test/affine2neura/gpt2-node27/node27.cpp
new file mode 100644
index 00000000..3bcf72c2
--- /dev/null
+++ b/test/affine2neura/gpt2-node27/node27.cpp
@@ -0,0 +1,14 @@
+float input[1][16][4][16];
+float output[1][4][16][16];
+
+int main() {
+  for (int arg2 = 0; arg2 < 1; arg2++) {
+    for (int arg3 = 0; arg3 < 16; arg3++) {
+      for (int arg4 = 0; arg4 < 4; arg4 += 1) {
+        for (int arg5 = 0; arg5 < 16; arg5 += 1) {
+          output[arg2][arg3][arg4][arg5] = input[arg2][arg4][arg3][arg5];
+        }
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/test/affine2neura/gpt2-node30/compile.sh b/test/affine2neura/gpt2-node30/compile.sh
new file mode 100755
index 00000000..28b23b23
--- /dev/null
+++ b/test/affine2neura/gpt2-node30/compile.sh
@@ -0,0 +1,3 @@
+/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./node30.cpp -S --raise-scf-to-affine -o ./node30.mlir
+/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node30.mlir --affine-loop-unroll="unroll-factor=2" -o ./node30_unroll.mlir
+# /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node30_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node30_tile.mlir
\ No newline at end of file
diff --git a/test/affine2neura/gpt2-node30/node30.cpp b/test/affine2neura/gpt2-node30/node30.cpp
new file mode 100644
index 00000000..01177f33
--- /dev/null
+++ b/test/affine2neura/gpt2-node30/node30.cpp
@@ -0,0 +1,15 @@
+float A[1][4][16][64];
+// float B=20.0;
+float C[1][4][16][64];
+
+int main() {
+  for (int arg2 = 0; arg2 < 1; arg2++) {
+    for (int arg3 = 0; arg3 < 4; arg3++) {
+      for (int arg4 = 0; arg4 < 16; arg4++) {
+        for (int arg5 = 0; arg5 < 64; arg5++) {
+          C[arg2][arg3][arg4][arg5] = A[arg2][arg3][arg4][arg5] * 10;
+        }
+      }
+    }
+  }
+}
\ No newline at end of file

From 93955fa0d463fc7964c2a99421a673b513bb25ce Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Fri, 13 Jun 2025 16:45:19 +0800
Subject: [PATCH 02/13] convert affine load/store/for into neura dialect

---
 include/Conversion/ConversionPasses.h               |   1 +
 include/Conversion/ConversionPasses.td              |   6 +
 include/NeuraDialect/NeuraOps.td                    |   2 +-
 .../AffineToNeura/AffineToNeuraPass.cpp             | 317 ++++++++++++++++++
 lib/Conversion/AffineToNeura/CMakeLists.txt         |  15 +
 .../ArithToNeura/ArithToNeuraPass.cpp               |   2 +-
 lib/Conversion/CMakeLists.txt                       |   1 +
 .../LlvmToNeura/LlvmToNeuraPass.cpp                 |   2 +-
 test/affine2neura/gpt2-node11/compile.sh            |   3 +
 test/affine2neura/gpt2-node11/node11.cpp            |  12 +
 test/affine2neura/simpleloop/compile.sh             |   3 +
 test/affine2neura/simpleloop/simple.cpp             |  12 +
 tools/mlir-neura-opt/mlir-neura-opt.cpp             |   3 +
 13 files changed, 376 insertions(+), 3 deletions(-)
 create mode 100644 lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
 create mode 100644 lib/Conversion/AffineToNeura/CMakeLists.txt
 create mode 100755 test/affine2neura/gpt2-node11/compile.sh
 create mode 100644 test/affine2neura/gpt2-node11/node11.cpp
 create mode 100755 test/affine2neura/simpleloop/compile.sh
 create mode 100644 test/affine2neura/simpleloop/simple.cpp

diff --git a/include/Conversion/ConversionPasses.h b/include/Conversion/ConversionPasses.h
index 2477bb3d..36e5db18 100644
--- a/include/Conversion/ConversionPasses.h
+++ b/include/Conversion/ConversionPasses.h
@@ -19,6 +19,7 @@ namespace mlir {
 // Conversion passes.
 std::unique_ptr<Pass> createLowerArithToNeuraPass();
 std::unique_ptr<Pass> createLowerLlvmToNeuraPass();
+std::unique_ptr<Pass> createLowerAffineToNeuraPass();
 
 #define GEN_PASS_REGISTRATION
 #include "Conversion/ConversionPasses.h.inc"
diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 7fca77bb..77ee2ef1 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -20,4 +20,10 @@ def LowerLlvmToNeura : Pass<"lower-llvm-to-neura", "ModuleOp">{
   let constructor = "mlir::createLowerLlvmToNeuraPass()";
 }
 
+def LowerAffineToNeura : Pass<"lower-affine-to-neura", "FuncOp">{
+  let summary = "Lower affine to Neura dialect";
+  let description = [{Lower affine dialect operations to Neura dialect operations.}];
+  let constructor = "mlir::createLowerAffineToNeuraPass()";
+}
+
 #endif // CONVERSION_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index 0b191516..bfba1d4f 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -11,7 +11,7 @@ def Neura_ConstantOp : Op {
     OptionalAttr:$predicate // Add optional predicate attribute
   );
   let results = (outs AnyType:$result);
-  // let assemblyFormat = "attr-dict `:` type($result)";
+  let assemblyFormat = "attr-dict `:` type($result)";
 }
 
 // Defines an addition operation.
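(With the assembly format enabled above, neura.constant prints in the short
custom form rather than the generic one — a sketch, assuming an integer
`value` attribute:
    %c10 = neura.constant {value = 10 : i32} : i32
instead of
    %c10 = "neura.constant"() {value = 10 : i32} : () -> i32)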
diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
new file mode 100644
index 00000000..e080da9e
--- /dev/null
+++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
@@ -0,0 +1,317 @@
+#include "Conversion/ConversionPasses.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/IRMapping.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/IR/Region.h"
+#include "mlir/IR/ValueRange.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "mlir/Transforms/RegionUtils.h"
+#include "llvm/Support/raw_ostream.h"
+#include <memory>
+
+using namespace mlir;
+using namespace mlir::neura;
+using namespace mlir::func;
+
+#define GEN_PASS_DEF_LOWERAFFINETONEURA
+#include "Conversion/ConversionPasses.h.inc"
+
+namespace {
+struct AffineLoadLowering : public OpRewritePattern<affine::AffineLoadOp> {
+  using OpRewritePattern<affine::AffineLoadOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(affine::AffineLoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = loadOp.getLoc();
+    auto memref = loadOp.getMemref();
+    AffineMap map = loadOp.getAffineMap();
+    ValueRange mapOperands = loadOp.getMapOperands();
+    // Get the indices for the load operation.
+    SmallVector<Value> newIndices;
+    newIndices.reserve(map.getNumResults());
+    llvm::errs() << "Lowering affine load operation: " << loadOp << "\n";
+    llvm::errs() << "Number of results in affine map: " << map.getNumResults()
+                 << "\n";
+    for (auto expr : map.getResults()) {
+      llvm::errs() << "Map expr: " << expr << "\n";
+    }
+
+    for (AffineExpr expr : map.getResults()) {
+      if (auto constExpr = expr.dyn_cast<AffineConstantExpr>()) {
+        auto indexType = rewriter.getIndexType();
+        auto valueAttr =
+            rewriter.getIntegerAttr(indexType, constExpr.getValue());
+        newIndices.push_back(rewriter.create<neura::ConstantOp>(
+            loc, indexType, valueAttr, nullptr));
+      } else if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) {
+        if (dimExpr.getPosition() >= map.getNumDims() ||
+            dimExpr.getPosition() >=
+                mapOperands
+                    .size()) { // Check against mapOperands size for safety
+          return loadOp.emitError(
+              "affine map dimension out of bounds for map operands");
+        }
+        newIndices.push_back(mapOperands[dimExpr.getPosition()]);
+      } else if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>()) {
+        unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition();
+        if (symbolOperandIndex >= mapOperands.size()) {
+          return loadOp.emitError(
+              "affine map symbol out of bounds for map operands");
+        }
+        newIndices.push_back(mapOperands[symbolOperandIndex]);
+      } else {
+        // For more complex affine expressions (e.g., d0 + c1),
+        // materialize the result using affine.apply.
+        // neura.load_indexed expects individual index values.
+        // This is a temporary workaround for complex expressions.
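+        // For example, a map result like d0 * 16 + d1 is split into its own
+        // single-result map and materialized as (a sketch; %i and %j stand
+        // for the map operands):
+        //   %idx = affine.apply affine_map<(d0, d1) -> (d0 * 16 + d1)>(%i, %j)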
+ llvm::errs() << "Complex affine expression: " << expr << "\n"; + AffineMap singleResultMap = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + Value complexIndex = rewriter.create( + loc, singleResultMap, mapOperands); + newIndices.push_back(complexIndex); + } + } + + auto memRefType = memref.getType().dyn_cast(); + if (!memRefType) { + return loadOp.emitError("base of load is not a MemRefType"); + } + if (newIndices.size() != static_cast(memRefType.getRank())) { + return loadOp.emitError("number of indices from affine map (") + << newIndices.size() << ") does not match memref rank (" + << memRefType.getRank() << ")"; + } + + // Create the neura.load_indexed operation + auto newLoadOp = rewriter.create( + loc, loadOp.getType(), memref, ValueRange{newIndices}, nullptr); + + rewriter.replaceOp(loadOp, newLoadOp.getResult()); + return success(); + } +}; + +struct AffineStoreLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(affine::AffineStoreOp storeOp, + PatternRewriter &rewriter) const override { + auto loc = storeOp.getLoc(); + auto memref = storeOp.getMemref(); + auto value = storeOp.getValueToStore(); + AffineMap map = storeOp.getAffineMap(); + ValueRange mapOperands = storeOp.getMapOperands(); + + SmallVector newIndices; + newIndices.reserve(map.getNumResults()); + + for (AffineExpr expr : map.getResults()) { + if (auto constExpr = expr.dyn_cast()) { + auto indexType = rewriter.getIndexType(); + auto valueAttr = + rewriter.getIntegerAttr(indexType, constExpr.getValue()); + newIndices.push_back(rewriter.create( + loc, indexType, valueAttr, nullptr)); + } else if (auto dimExpr = expr.dyn_cast()) { + if (dimExpr.getPosition() >= map.getNumDims() || + dimExpr.getPosition() >= mapOperands.size()) { + return storeOp.emitError( + "affine map dimension out of bounds for map operands"); + } + newIndices.push_back(mapOperands[dimExpr.getPosition()]); + } else if (auto symExpr = expr.dyn_cast()) { + unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition(); + if (symbolOperandIndex >= mapOperands.size()) { + return storeOp.emitError( + "affine map symbol out of bounds for map operands"); + } + newIndices.push_back(mapOperands[symbolOperandIndex]); + } else { + // For more complex affine expressions, materialize the result using + // affine.apply. This is a temporary workaround for complex expressions. + AffineMap singleResultMap = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + Value complexIndex = rewriter.create( + loc, singleResultMap, mapOperands); + newIndices.push_back(complexIndex); + } + } + + auto memRefType = memref.getType().dyn_cast(); + if (!memRefType) { + return storeOp.emitError("base of store is not a MemRefType"); + } + if (newIndices.size() != static_cast(memRefType.getRank())) { + return storeOp.emitError("number of indices from affine map (") + << newIndices.size() << ") does not match memref rank (" + << memRefType.getRank() << ")"; + } + + rewriter.create(loc, value, memref, + ValueRange{newIndices}, nullptr); + rewriter.eraseOp(storeOp); + return success(); + } +}; + +struct AffineForLowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineForOp forOp, + PatternRewriter &rewriter) const override { + auto loc = forOp.getLoc(); + auto indexType = rewriter.getIndexType(); + + // 1. 
Extract loop parameters (lower bound, upper bound, step) + Value lowerBoundVal; + if (forOp.hasConstantLowerBound()) { + int lowerBoundConstant = forOp.getConstantLowerBound(); + auto lowerBoundAttr = + rewriter.getIntegerAttr(indexType, lowerBoundConstant); + lowerBoundVal = rewriter.create( + loc, indexType, lowerBoundAttr, nullptr); + } else { + // If the lower bound is not constant, we need to use affine.apply + // This is a temporary workaround for non-constant lower bounds. + llvm::errs() << "Using affine.apply for unconstant lower bound\n"; + affine::AffineBound lowerBound = forOp.getLowerBound(); + AffineMap lowerBoundMap = lowerBound.getMap(); + ValueRange lowerBoundOperands = forOp.getLowerBoundOperands(); + lowerBoundVal = rewriter.create( + loc, lowerBoundMap, lowerBoundOperands); + } + + Value upperBoundVal; + if (forOp.hasConstantUpperBound()) { + int upperBoundConstant = forOp.getConstantUpperBound(); + auto upperBoundAttr = + rewriter.getIntegerAttr(indexType, upperBoundConstant); + upperBoundVal = rewriter.create( + loc, indexType, upperBoundAttr, nullptr); + } else { + // For non-constant upper bounds, we also use affine.apply + llvm::errs() << "Using affine.apply for unconstant upper bound\n"; + affine::AffineBound upperBound = forOp.getUpperBound(); + AffineMap upperBoundMap = upperBound.getMap(); + ValueRange upperBoundOperands = forOp.getUpperBoundOperands(); + upperBoundVal = rewriter.create( + loc, upperBoundMap, upperBoundOperands); + } + + auto stepAttr = rewriter.getIntegerAttr(indexType, forOp.getStep()); + Value stepVal = + rewriter.create(loc, indexType, stepAttr, nullptr); + llvm::errs() << "lower bound: " << lowerBoundVal + << ", upper bound: " << upperBoundVal << ", step: " << stepVal + << "\n"; + + // 2. Block structure + Block *originBlock = rewriter.getInsertionBlock(); + auto originPoint = rewriter.getInsertionPoint(); + Region *parentRegion = originBlock->getParent(); + + Block *headerBlock = rewriter.createBlock( + parentRegion, std::next(Region::iterator(originBlock)), {indexType}, + {loc}); + Block *bodyBlock = rewriter.createBlock( + parentRegion, std::next(Region::iterator(headerBlock)), {indexType}, + {loc}); + Block *exitBlock = rewriter.createBlock( + parentRegion, std::next(Region::iterator(bodyBlock))); + Block *continueBlock = rewriter.splitBlock(originBlock, originPoint); + + // 3. origin -> header + rewriter.setInsertionPointToEnd(originBlock); + rewriter.create(loc, ValueRange{lowerBoundVal}, headerBlock); + + // 4. header: loop_control + rewriter.setInsertionPointToEnd(headerBlock); + rewriter.create( + loc, + headerBlock->getArgument(0), // current index + stepVal, upperBoundVal, rewriter.getStringAttr("lt"), + ValueRange{}, // passthrough + bodyBlock, exitBlock); + + // 5. body: clone forOp body, mapping index + rewriter.setInsertionPointToStart(bodyBlock); + Value currentIndex = bodyBlock->getArgument(0); + if (!forOp.getRegion().empty()) { + Block &sourceBlock = forOp.getRegion().front(); + IRMapping mapping; + mapping.map(sourceBlock.getArgument(0), currentIndex); + for (auto &op : llvm::make_range(sourceBlock.begin(), + std::prev(sourceBlock.end()))) { + Operation *clonedOp = rewriter.clone(op, mapping); + for (unsigned i = 0; i < op.getNumResults(); ++i) + mapping.map(op.getResult(i), clonedOp->getResult(i)); + } + } + + // 6. body 结尾跳 header,传当前 index + rewriter.setInsertionPointToEnd(bodyBlock); + rewriter.create(loc, ValueRange{currentIndex}, headerBlock); + + // 7. 
exit 跳 continue + rewriter.setInsertionPointToEnd(exitBlock); + rewriter.create(loc, ValueRange{}, continueBlock); + + // 8. 移除原 affine.for + rewriter.eraseOp(forOp); + + return success(); + } +}; + +struct LowerAffineToNeuraPass + : public PassWrapper> { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass) + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + StringRef getArgument() const override { return "lower-affine-to-neura"; } + StringRef getDescription() const override { + return "Lower affine operations to Neura dialect operations"; + } + + void runOnOperation() override { + FuncOp funcOp = getOperation(); + MLIRContext *context = funcOp.getContext(); + + // ConversionTarget target(*context); + // target.addIllegalOp(); + // target.addLegalDialect(); + + RewritePatternSet patterns(context); + patterns.add( + context); + + if (failed(applyPatternsAndFoldGreedily(getOperation(), + std::move(patterns)))) { + funcOp.emitError("Failed to lower affine operations to Neura dialect"); + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr mlir::createLowerAffineToNeuraPass() { + return std::make_unique(); +} \ No newline at end of file diff --git a/lib/Conversion/AffineToNeura/CMakeLists.txt b/lib/Conversion/AffineToNeura/CMakeLists.txt new file mode 100644 index 00000000..fc71ff70 --- /dev/null +++ b/lib/Conversion/AffineToNeura/CMakeLists.txt @@ -0,0 +1,15 @@ +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +add_mlir_conversion_library(MLIRNeuraAffineToNeuraPass + AffineToNeuraPass.cpp + + DEPENDS + MLIRConversionIncGen + + LINK_LIBS PUBLIC + MLIRIR + MLIRPass + MLIRSupport + MLIRTransforms + # MLIRNeura +) \ No newline at end of file diff --git a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp index ab952519..af926302 100644 --- a/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp +++ b/lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp @@ -24,7 +24,7 @@ using namespace mlir::func; using namespace mlir::neura; #define GEN_PASS_DEF_LOWERARITHTONEURA -#include "NeuraDialect/NeuraPasses.h.inc" +#include "Conversion/ConversionPasses.h.inc" namespace{ diff --git a/lib/Conversion/CMakeLists.txt b/lib/Conversion/CMakeLists.txt index 1dbce29f..2a33d1e2 100644 --- a/lib/Conversion/CMakeLists.txt +++ b/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) add_subdirectory(ArithToNeura) add_subdirectory(LlvmToNeura) +add_subdirectory(AffineToNeura) # add_mlir_library( # MLIRNeuraConversion diff --git a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp index c9c2fe23..39d72b39 100644 --- a/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp +++ b/lib/Conversion/LlvmToNeura/LlvmToNeuraPass.cpp @@ -25,7 +25,7 @@ using namespace mlir; using namespace mlir::neura; #define GEN_PASS_DEF_LOWERLLVMTONEURA -#include "NeuraDialect/NeuraPasses.h.inc" +#include "Conversion/ConversionPasses.h.inc" namespace { diff --git a/test/affine2neura/gpt2-node11/compile.sh b/test/affine2neura/gpt2-node11/compile.sh new file mode 100755 index 00000000..4d3eeaee --- /dev/null +++ b/test/affine2neura/gpt2-node11/compile.sh @@ -0,0 +1,3 @@ +/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./node11.cpp -S --raise-scf-to-affine -o ./node11.mlir +/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node11.mlir --affine-loop-unroll="unroll-factor=2" -o 
./node11_unroll.mlir +# /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node27_tile.mlir \ No newline at end of file diff --git a/test/affine2neura/gpt2-node11/node11.cpp b/test/affine2neura/gpt2-node11/node11.cpp new file mode 100644 index 00000000..45e4262c --- /dev/null +++ b/test/affine2neura/gpt2-node11/node11.cpp @@ -0,0 +1,12 @@ +float input[1][16][64]; +float output[1][16]; + +int main() { + for (int arg2 = 0; arg2 < 1; arg2++) { + for (int arg3 = 0; arg3 < 16; arg3++) { + for (int arg4 = 0; arg4 < 64; arg4+=1) { + output[arg2][arg3] += input[arg2][arg3][arg4]; + } + } + } +} \ No newline at end of file diff --git a/test/affine2neura/simpleloop/compile.sh b/test/affine2neura/simpleloop/compile.sh new file mode 100755 index 00000000..f19caf0e --- /dev/null +++ b/test/affine2neura/simpleloop/compile.sh @@ -0,0 +1,3 @@ +/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./simple.cpp -S --raise-scf-to-affine -o ./simple.mlir +/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./simple.mlir --affine-loop-unroll="unroll-factor=2" -o ./simple_unroll.mlir +# /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node27_tile.mlir \ No newline at end of file diff --git a/test/affine2neura/simpleloop/simple.cpp b/test/affine2neura/simpleloop/simple.cpp new file mode 100644 index 00000000..6078f497 --- /dev/null +++ b/test/affine2neura/simpleloop/simple.cpp @@ -0,0 +1,12 @@ +float A[100]; +float C[100]; + +int main() { + const int size = 100; + for (int i = 0; i < size; ++i) { + float loaded_value = A[i]; // Instruction 1: Load value from A + float multiplied_value = loaded_value * 10.0f; // Instruction 2: Multiply the value + C[i] = multiplied_value; // Instruction 3: Store result into C + } + return 0; +} diff --git a/tools/mlir-neura-opt/mlir-neura-opt.cpp b/tools/mlir-neura-opt/mlir-neura-opt.cpp index d21664fb..5453a29d 100644 --- a/tools/mlir-neura-opt/mlir-neura-opt.cpp +++ b/tools/mlir-neura-opt/mlir-neura-opt.cpp @@ -3,6 +3,7 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/InitAllDialects.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/InitAllPasses.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Support/LogicalResult.h" @@ -20,6 +21,8 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); + registry.insert(); mlir::neura::registerPasses(); mlir::registerPasses(); From 7e165475eee7982a204439828de473d3947c23e1 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Fri, 13 Jun 2025 18:55:43 +0800 Subject: [PATCH 03/13] lower affine.apply to neura.add --- include/NeuraDialect/NeuraOps.td | 5 +- .../AffineToNeura/AffineToNeuraPass.cpp | 54 +++++++++++++++---- 2 files changed, 48 insertions(+), 11 deletions(-) diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index bfba1d4f..1f76a981 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -1,6 +1,7 @@ // NeuraOps.td - Custom operation definitions. include "NeuraDialect/NeuraDialect.td" +include "mlir/IR/CommonTypeConstraints.td" // ---------------------------------------------------- // Defines basic scalar operations. 
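(For context: the AffineApplyLowering pattern added in this patch rewrites a
single-result map of the form d0 + cst into a constant plus an add.
Schematically, assuming the custom constant syntax enabled in the previous
patch,
    %j = affine.apply affine_map<(d0) -> (d0 + 2)>(%i)
becomes
    %c2 = neura.constant {value = 2 : index} : index
    %j  = neura.add %i, %c2 : index
though neura.add has no declared custom format yet, so it would actually
print in the generic form.)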
@@ -18,8 +19,8 @@ def Neura_ConstantOp : Op {
 def Neura_AddOp : Op {
   let summary = "Integer addition operation";
   let opName = "add";
-  let arguments = (ins AnyInteger:$lhs, AnyInteger:$rhs, Optional:$predicate);
-  let results = (outs AnyInteger:$result);
+  let arguments = (ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs, Optional:$predicate);
+  let results = (outs SignlessIntegerLike:$result);
   // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)";
   let traits = [SameOperandsAndResultElementType];
 }
diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
index e080da9e..51cbfe12 100644
--- a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
+++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
@@ -3,6 +3,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IRMapping.h"
@@ -275,6 +276,47 @@ struct AffineForLowering : public OpRewritePattern<affine::AffineForOp> {
   }
 };
 
+struct AffineApplyLowering : public OpRewritePattern<affine::AffineApplyOp> {
+  using OpRewritePattern<affine::AffineApplyOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(affine::AffineApplyOp applyOp,
+                                PatternRewriter &rewriter) const override {
+    AffineMap map = applyOp.getAffineMap();
+    ValueRange operands = applyOp.getMapOperands();
+    auto loc = applyOp.getLoc();
+
+    if (map.getNumResults() != 1) {
+      return applyOp.emitError("AffineApplyOp must have a single result");
+    }
+
+    AffineExpr expr = map.getResult(0);
+    // d0 + cst
+    if (auto binExpr = expr.dyn_cast<AffineBinaryOpExpr>()) {
+      if (binExpr.getKind() == AffineExprKind::Add) {
+        if (auto dim = binExpr.getLHS().dyn_cast<AffineDimExpr>()) {
+          if (auto cst = binExpr.getRHS().dyn_cast<AffineConstantExpr>()) {
+            auto cstVal = rewriter.create<neura::ConstantOp>(
+                loc, rewriter.getIndexType(),
+                rewriter.getIntegerAttr(rewriter.getIndexType(),
+                                        cst.getValue()),
+                nullptr);
+            auto addOp = rewriter.create<neura::AddOp>(
+                loc, cstVal.getType(), operands[dim.getPosition()], cstVal,
+                nullptr);
+            rewriter.replaceOp(applyOp, addOp.getResult());
+            return success();
+          }
+        }
+      }
+    }
+
+    // You can add more cases here for different affine expressions
+    // For now, we will just emit an error for unsupported expressions.
+    return applyOp.emitError("Unsupported affine expression in AffineApplyOp: ")
+           << expr
+           << ". Only simple affine expressions like d0 + cst are supported.";
+  }
+};
+
 struct LowerAffineToNeuraPass
     : public PassWrapper<LowerAffineToNeuraPass, OperationPass<FuncOp>> {
   MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass)
@@ -293,18 +335,12 @@ struct LowerAffineToNeuraPass
     FuncOp funcOp = getOperation();
     MLIRContext *context = funcOp.getContext();
 
-    // ConversionTarget target(*context);
-    // target.addIllegalOp();
-    // target.addLegalDialect();
-
-    RewritePatternSet patterns(context);
-    patterns.add<AffineLoadLowering, AffineStoreLowering, AffineForLowering>(
-        context);
+    RewritePatternSet patterns(context);
+    patterns.add<AffineLoadLowering, AffineStoreLowering, AffineForLowering,
+                 AffineApplyLowering>(context);
 
     if (failed(applyPatternsAndFoldGreedily(getOperation(),
-                                            std::move(patterns)))) {
+                                           std::move(patterns)))) {
       funcOp.emitError("Failed to lower affine operations to Neura dialect");
       signalPassFailure();
     }

From ae6134aeaa507977279754a7e65b88786355fc1d Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Fri, 13 Jun 2025 19:10:43 +0800
Subject: [PATCH 04/13] fix bugs

---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index adaf6c9b..9a83b5b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 
 add_compile_options(-g)
 
-set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
-set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
-set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
-set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
+# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
+# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
+# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
+# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
 
 message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")

From 8c3c6e5943710a6470317b18cd80ae6bcdb51669 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Fri, 13 Jun 2025 21:37:34 +0800
Subject: [PATCH 05/13] fix dyn_cast bugs

---
 CMakeLists.txt                                     |  8 ++--
 include/CMakeLists.txt                             |  3 +-
 include/Compiler/CMakeLists.txt                    |  3 ++
 include/Compiler/CompilerPasses.h                  | 29 +++++++++++++++
 include/Compiler/CompilerPasses.td                 | 17 +++++++++
 include/NeuraDialect/NeuraPasses.h                 |  4 ++
 lib/CMakeLists.txt                                 |  3 +-
 .../AffineToNeura/AffineToNeuraPass.cpp            | 37 ++++++++++++-------
 tools/neura-compiler/neura-compiler.cpp            |  0
 9 files changed, 84 insertions(+), 20 deletions(-)
 create mode 100644 include/Compiler/CMakeLists.txt
 create mode 100644 include/Compiler/CompilerPasses.h
 create mode 100644 include/Compiler/CompilerPasses.td
 create mode 100644 tools/neura-compiler/neura-compiler.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9a83b5b7..adaf6c9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 
 add_compile_options(-g)
 
-# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
-# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
-# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
-# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
+set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
+set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
+set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
+set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
 
 message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index 599a4181..7ed6674c 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(NeuraDialect)
-add_subdirectory(Conversion)
\ No newline at end of file
+add_subdirectory(Conversion)
+# add_subdirectory(Compiler)
\ No newline at end of file
diff --git a/include/Compiler/CMakeLists.txt b/include/Compiler/CMakeLists.txt
new file mode 100644
index 00000000..61aad97c
--- /dev/null
+++ b/include/Compiler/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(LLVM_TARGET_DEFINITIONS CompilerPasses.td)
+mlir_tablegen(CompilerPasses.h.inc --gen-pass-decls)
+add_public_tablegen_target(MLIRCompilerPassesIncGen)
\ No newline at end of file
diff --git a/include/Compiler/CompilerPasses.h b/include/Compiler/CompilerPasses.h
new file mode 100644
index 00000000..36e5db18
--- /dev/null
+++ b/include/Compiler/CompilerPasses.h
@@ -0,0 +1,29 @@
+// ConversionPasses.h - Header file for conversion passes
+
+#ifndef CONVERSION_PASSES_H
+#define CONVERSION_PASSES_H
+
+#include "NeuraDialect/NeuraDialect.h"
+#include "NeuraDialect/NeuraOps.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Pass/PassRegistry.h"
+#include <memory>
+
+namespace mlir {
+
+// Passes defined in GraphPasses.td.
+#define GEN_PASS_DECL
+#include "Conversion/ConversionPasses.h.inc"
+
+// Conversion passes.
+std::unique_ptr<Pass> createLowerArithToNeuraPass();
+std::unique_ptr<Pass> createLowerLlvmToNeuraPass();
+std::unique_ptr<Pass> createLowerAffineToNeuraPass();
+
+#define GEN_PASS_REGISTRATION
+#include "Conversion/ConversionPasses.h.inc"
+
+} // namespace mlir
+
+#endif // CONVERSION_PASSES_H
\ No newline at end of file
diff --git a/include/Compiler/CompilerPasses.td b/include/Compiler/CompilerPasses.td
new file mode 100644
index 00000000..ae1ad574
--- /dev/null
+++ b/include/Compiler/CompilerPasses.td
@@ -0,0 +1,17 @@
+// CompilerPasses.td - Passes for neura compiler
+
+#ifndef COMPILER_PASSES_TD
+#define COMPILER_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+//=========================================================//
+// Passes for the CGRA Mapping
+//=========================================================//
+def GenerateDFG: Pass<"generate-dfg", "ModuleOp">{
+  let summary = "Generates a Data Flow Graph (DFG) for the Neura dialect";
+  let description = [{This pass generates a DFG from the Neura dialect operations.}];
+  let constructor = "neura::createGenerateDFGPass()";
+}
+
+#endif // COMPILER_PASSES_TD
\ No newline at end of file
diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index 9cdeef7f..baa5c6f8 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -16,6 +16,7 @@ namespace neura {
 // Passes defined in GraphPasses.td
 #define GEN_PASS_DECL
 #include "NeuraDialect/NeuraPasses.h.inc"
+// Passes used for neura optimization and transformation
 std::unique_ptr<Pass> createInsertDataMovPass();
 std::unique_ptr<Pass> createInsertCtrlMovPass();
 std::unique_ptr<Pass> createFusePatternsPass();
@@ -23,6 +24,9 @@ std::unique_ptr<Pass> createAssignAcceleratorPass();
 std::unique_ptr<Pass> createTransformCtrlToDataFlowPass();
 std::unique_ptr<Pass> createLeveragePredicatedValuePass();
 
+// Passes used for neura compiler
+std::unique_ptr<Pass> createGenerateDFGPass();
+
 #define GEN_PASS_REGISTRATION
 #include "NeuraDialect/NeuraPasses.h.inc"
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 599a4181..7ed6674c 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(NeuraDialect)
-add_subdirectory(Conversion)
\ No newline at end of file
+add_subdirectory(Conversion)
+# add_subdirectory(Compiler)
\ No newline at end of file
diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
index 51cbfe12..cd5085f7 100644
--- a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
+++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp
@@ -50,13 +50,15 @@ struct AffineLoadLowering : public OpRewritePattern<affine::AffineLoadOp> {
     }
 
     for (AffineExpr expr : map.getResults()) {
-      if (auto constExpr = expr.dyn_cast<AffineConstantExpr>()) {
+      if (expr.isa<AffineConstantExpr>()) {
+        auto constExpr = expr.cast<AffineConstantExpr>();
         auto indexType = rewriter.getIndexType();
         auto valueAttr =
             rewriter.getIntegerAttr(indexType, constExpr.getValue());
         newIndices.push_back(rewriter.create<neura::ConstantOp>(
             loc, indexType, valueAttr, nullptr));
-      } else if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) {
+      } else if (expr.isa<AffineDimExpr>()) {
+        auto dimExpr = expr.cast<AffineDimExpr>();
         if (dimExpr.getPosition() >= map.getNumDims() ||
             dimExpr.getPosition() >=
                 mapOperands
                     .size()) { // Check against mapOperands size for safety
@@ -65,7 +67,8 @@ struct AffineLoadLowering : public OpRewritePattern<affine::AffineLoadOp> {
           return loadOp.emitError(
               "affine map dimension out of bounds for map operands");
         }
         newIndices.push_back(mapOperands[dimExpr.getPosition()]);
-      } else if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>()) {
+      } else if (expr.isa<AffineSymbolExpr>()) {
+        auto symExpr = expr.cast<AffineSymbolExpr>();
         unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition();
         if (symbolOperandIndex >= mapOperands.size()) {
           return loadOp.emitError(
@@ -86,7 +89,7 @@ struct AffineLoadLowering : public OpRewritePattern<affine::AffineLoadOp> {
       }
     }
 
-    auto memRefType = memref.getType().dyn_cast<MemRefType>();
+    auto memRefType = memref.getType().cast<MemRefType>();
     if (!memRefType) {
       return loadOp.emitError("base of load is not a MemRefType");
     }
@@ -119,20 +122,23 @@ struct AffineStoreLowering : public OpRewritePattern<affine::AffineStoreOp> {
     newIndices.reserve(map.getNumResults());
 
     for (AffineExpr expr : map.getResults()) {
-      if (auto constExpr = expr.dyn_cast<AffineConstantExpr>()) {
+      if (expr.isa<AffineConstantExpr>()) {
+        auto constExpr = expr.cast<AffineConstantExpr>();
         auto indexType = rewriter.getIndexType();
         auto valueAttr =
             rewriter.getIntegerAttr(indexType, constExpr.getValue());
         newIndices.push_back(rewriter.create<neura::ConstantOp>(
             loc, indexType, valueAttr, nullptr));
-      } else if (auto dimExpr = expr.dyn_cast<AffineDimExpr>()) {
+      } else if (expr.isa<AffineDimExpr>()) {
+        auto dimExpr = expr.cast<AffineDimExpr>();
         if (dimExpr.getPosition() >= map.getNumDims() ||
             dimExpr.getPosition() >= mapOperands.size()) {
           return storeOp.emitError(
               "affine map dimension out of bounds for map operands");
         }
         newIndices.push_back(mapOperands[dimExpr.getPosition()]);
-      } else if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>()) {
+      } else if (expr.isa<AffineSymbolExpr>()) {
+        auto symExpr = expr.cast<AffineSymbolExpr>();
         unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition();
         if (symbolOperandIndex >= mapOperands.size()) {
           return storeOp.emitError(
@@ -150,7 +156,7 @@ struct AffineStoreLowering : public OpRewritePattern<affine::AffineStoreOp> {
       }
     }
 
-    auto memRefType = memref.getType().dyn_cast<MemRefType>();
+    auto memRefType = memref.getType().cast<MemRefType>();
     if (!memRefType) {
       return storeOp.emitError("base of store is not a MemRefType");
     }
@@ -290,10 +296,13 @@ struct AffineApplyLowering : public OpRewritePattern<affine::AffineApplyOp> {
 
     AffineExpr expr = map.getResult(0);
     // d0 + cst
-    if (auto binExpr = expr.dyn_cast<AffineBinaryOpExpr>()) {
+    if (expr.isa<AffineBinaryOpExpr>()) {
+      auto binExpr = expr.cast<AffineBinaryOpExpr>();
       if (binExpr.getKind() == AffineExprKind::Add) {
-        if (auto dim = binExpr.getLHS().dyn_cast<AffineDimExpr>()) {
-          if (auto cst = binExpr.getRHS().dyn_cast<AffineConstantExpr>()) {
+        if (binExpr.getLHS().isa<AffineDimExpr>()) {
+          auto dim = binExpr.getLHS().cast<AffineDimExpr>();
+          if (binExpr.getRHS().isa<AffineConstantExpr>()) {
+            auto cst = binExpr.getRHS().cast<AffineConstantExpr>();
             auto cstVal = rewriter.create<neura::ConstantOp>(
                 loc, rewriter.getIndexType(),
                 rewriter.getIntegerAttr(rewriter.getIndexType(),
@@ -311,9 +320,9 @@ struct AffineApplyLowering : public OpRewritePattern<affine::AffineApplyOp> {
 
     // You can add more cases here for different affine expressions
     // For now, we will just emit an error for unsupported expressions.
-    return applyOp.emitError("Unsupported affine expression in AffineApplyOp: ")
-           << expr
-           << ". Only simple affine expressions like d0 + cst are supported.";
+    return applyOp.emitError(
+               "Unsupported complex affine expression in AffineApplyOp.\n")
+           << "Only simple affine expressions like d0 + cst are supported.\n";
   }
 };
diff --git a/tools/neura-compiler/neura-compiler.cpp b/tools/neura-compiler/neura-compiler.cpp
new file mode 100644
index 00000000..e69de29b

From 0c2406436d571d43ae1e1be8cbaa53d505159d5c Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Fri, 13 Jun 2025 23:30:49 +0800
Subject: [PATCH 06/13] set correct MLIR path in CMakeLists

---
 CMakeLists.txt                     | 8 ++++----
 include/NeuraDialect/NeuraPasses.h | 2 ++
 lib/NeuraDialect/CMakeLists.txt    | 1 +
 lib/NeuraDialect/NeuraPasses.cpp   | 3 +++
 4 files changed, 10 insertions(+), 4 deletions(-)
 create mode 100644 lib/NeuraDialect/NeuraPasses.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index adaf6c9b..9a83b5b7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES)
 
 add_compile_options(-g)
 
-set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
-set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
-set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
-set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
+# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir)
+# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm)
+# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir)
+# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build)
 
 message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
 message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h
index baa5c6f8..27f039ae 100644
--- a/include/NeuraDialect/NeuraPasses.h
+++ b/include/NeuraDialect/NeuraPasses.h
@@ -13,6 +13,8 @@ namespace mlir {
 
 namespace neura {
 
+
+
 // Passes defined in GraphPasses.td
 #define GEN_PASS_DECL
 #include "NeuraDialect/NeuraPasses.h.inc"
diff --git a/lib/NeuraDialect/CMakeLists.txt b/lib/NeuraDialect/CMakeLists.txt
index 50532491..34f19ac4 100644
--- a/lib/NeuraDialect/CMakeLists.txt
+++ b/lib/NeuraDialect/CMakeLists.txt
@@ -18,6 +18,7 @@ add_public_tablegen_target(MLIRNeuraDialectIncGen)
 add_mlir_dialect_library(MLIRNeura
   Neura.cpp
   NeuraTypes.cpp
+  NeuraPasses.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${PROJECT_SOURCE_DIR}/include/NeuraDialect
diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp
new file mode 100644
index 00000000..72b3a6a6
--- /dev/null
+++ b/lib/NeuraDialect/NeuraPasses.cpp
@@ -0,0 +1,3 @@
+#include "mlir/Pass/PassManager.h"
+#include "mlir/Transforms/Passes.h"
+

From 2f34550f7a2cec9ba67938a8db2701e30ab0d7c4 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Sat, 14 Jun 2025 10:34:25 +0800
Subject: [PATCH 07/13] change the target of arith2neura and affine2neura

---
 include/Conversion/ConversionPasses.td              | 4 ++--
 lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp  | 8 ++++----
 lib/Conversion/ArithToNeura/ArithToNeuraPass.cpp    | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/Conversion/ConversionPasses.td b/include/Conversion/ConversionPasses.td
index 77ee2ef1..cd8eb5ab 100644
--- a/include/Conversion/ConversionPasses.td
+++ b/include/Conversion/ConversionPasses.td
@@ -8,7 +8,7 @@ include "mlir/Pass/PassBase.td"
 
 //=========================================================//
 // Conversion passes
//=========================================================// -def LowerArithToNeura : Pass<"lower-arith-to-neura", "FuncOp">{ +def LowerArithToNeura : Pass<"lower-arith-to-neura", "ModuleOp">{ let summary = "Lower arith to Neura dialect"; let description = [{Lower arith dialect operations to Neura dialect operations.}]; let constructor = "mlir::createLowerArithToNeuraPass()"; @@ -20,7 +20,7 @@ def LowerLlvmToNeura : Pass<"lower-llvm-to-neura", "ModuleOp">{ let constructor = "mlir::createLowerLlvmToNeuraPass()"; } -def LowerAffineToNeura : Pass<"lower-affine-to-neura", "FuncOp">{ +def LowerAffineToNeura : Pass<"lower-affine-to-neura", "ModuleOp">{ let summary = "Lower affine to Neura dialect"; let description = [{Lower affine dialect operations to Neura dialect operations.}]; let constructor = "mlir::createLowerAffineToNeuraPass()"; diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp index cd5085f7..de7e8d19 100644 --- a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp +++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp @@ -327,7 +327,7 @@ struct AffineApplyLowering : public OpRewritePattern { }; struct LowerAffineToNeuraPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass) void getDependentDialects(DialectRegistry ®istry) const override { @@ -341,8 +341,8 @@ struct LowerAffineToNeuraPass } void runOnOperation() override { - FuncOp funcOp = getOperation(); - MLIRContext *context = funcOp.getContext(); + ModuleOp moduleOp = getOperation(); + MLIRContext *context = moduleOp.getContext(); RewritePatternSet patterns(context); patterns.add { }; struct LowerArithToNeuraPass - : public PassWrapper> { + : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerArithToNeuraPass) From 5bc48842fe13e40a0fda5f78e0ee488e313f92c2 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 14 Jun 2025 11:02:48 +0800 Subject: [PATCH 08/13] change the operand type in neura.add --- include/NeuraDialect/NeuraOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 1f76a981..2c1b3235 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -19,8 +19,8 @@ def Neura_ConstantOp : Op { def Neura_AddOp : Op { let summary = "Integer addition operation"; let opName = "add"; - let arguments = (ins SignlessIntegerLike:$lhs, SignlessIntegerLike:$rhs, Optional:$predicate); - let results = (outs SignlessIntegerLike:$result); + let arguments = (ins AnyType:$lhs, AnyType:$rhs, Optional:$predicate); + let results = (outs AnyType:$result); // let assemblyFormat = "$lhs `,` $rhs `,` $predicate attr-dict `:` type($result)"; let traits = [SameOperandsAndResultElementType]; } From c0a861c7f8babf26fc0867ad0612bcafff242ca4 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 14 Jun 2025 12:56:55 +0800 Subject: [PATCH 09/13] change the assembly format in neura.loop_control --- CMakeLists.txt | 8 ++--- include/CMakeLists.txt | 3 +- include/Compiler/CMakeLists.txt | 3 -- include/Compiler/CompilerPasses.h | 29 ----------------- include/Compiler/CompilerPasses.td | 17 ---------- include/NeuraDialect/CMakeLists.txt | 12 +++---- include/NeuraDialect/NeuraOps.td | 4 +-- include/NeuraDialect/NeuraPasses.h | 4 +-- include/NeuraDialect/NeuraPasses.td | 21 ++++++++---- .../AffineToNeura/AffineToNeuraPass.cpp | 6 +++- lib/NeuraDialect/CMakeLists.txt | 32 
++++++++++--------- lib/NeuraDialect/NeuraPasses.cpp | 24 ++++++++++++++ tools/CMakeLists.txt | 3 +- tools/neura-compiler/CMakeLists.txt | 18 +++++++++++ tools/neura-compiler/neura-compiler.cpp | 32 +++++++++++++++++++ 15 files changed, 128 insertions(+), 88 deletions(-) delete mode 100644 include/Compiler/CMakeLists.txt delete mode 100644 include/Compiler/CompilerPasses.h delete mode 100644 include/Compiler/CompilerPasses.td create mode 100644 tools/neura-compiler/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a83b5b7..adaf6c9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES) add_compile_options(-g) -# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir) -# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm) -# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir) -# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build) +set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir) +set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm) +set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir) +set(MLIR_BINARY_DIR /home/lucas/llvm-project/build) message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt index 7ed6674c..599a4181 100644 --- a/include/CMakeLists.txt +++ b/include/CMakeLists.txt @@ -1,3 +1,2 @@ add_subdirectory(NeuraDialect) -add_subdirectory(Conversion) -# add_subdirectory(Compiler) \ No newline at end of file +add_subdirectory(Conversion) \ No newline at end of file diff --git a/include/Compiler/CMakeLists.txt b/include/Compiler/CMakeLists.txt deleted file mode 100644 index 61aad97c..00000000 --- a/include/Compiler/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS CompilerPasses.td) -mlir_tablegen(CompilerPasses.h.inc --gen-pass-decls) -add_public_tablegen_target(MLIRCompilerPassesIncGen) \ No newline at end of file diff --git a/include/Compiler/CompilerPasses.h b/include/Compiler/CompilerPasses.h deleted file mode 100644 index 36e5db18..00000000 --- a/include/Compiler/CompilerPasses.h +++ /dev/null @@ -1,29 +0,0 @@ -// ConversionPasses.h - Header file for conversion passes - -#ifndef CONVERSION_PASSES_H -#define CONVERSION_PASSES_H - -#include "NeuraDialect/NeuraDialect.h" -#include "NeuraDialect/NeuraOps.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Pass/PassRegistry.h" -#include - -namespace mlir { - -// Passes defined in GraphPasses.td. -#define GEN_PASS_DECL -#include "Conversion/ConversionPasses.h.inc" - -// Conversion passes. 
-std::unique_ptr createLowerArithToNeuraPass(); -std::unique_ptr createLowerLlvmToNeuraPass(); -std::unique_ptr createLowerAffineToNeuraPass(); - -#define GEN_PASS_REGISTRATION -#include "Conversion/ConversionPasses.h.inc" - -} // namespace mlir - -#endif // CONVERSION_PASSES_H \ No newline at end of file diff --git a/include/Compiler/CompilerPasses.td b/include/Compiler/CompilerPasses.td deleted file mode 100644 index ae1ad574..00000000 --- a/include/Compiler/CompilerPasses.td +++ /dev/null @@ -1,17 +0,0 @@ -// CompilerPasses.td - Passes for neura compiler - -#ifndef COMPILER_PASSES_TD -#define COMPILER_PASSES_TD - -include "mlir/Pass/PassBase.td" - -//=========================================================// -// Passes for the CGRA Mapping -//=========================================================// -def GenerateDFG: Pass<"generate-dfg", "ModuleOp">{ - let summary = "Generates a Data Flow Graph (DFG) for the Neura dialect"; - let description = [{This pass generates a DFG from the Neura dialect operations.}]; - let constructor = "neura::createGenerateDFGPass()"; -} - -#endif // COMPILER_PASSES_TD \ No newline at end of file diff --git a/include/NeuraDialect/CMakeLists.txt b/include/NeuraDialect/CMakeLists.txt index 1c9b30b5..96d06740 100644 --- a/include/NeuraDialect/CMakeLists.txt +++ b/include/NeuraDialect/CMakeLists.txt @@ -1,10 +1,10 @@ # Set TableGen include paths -set(MLIR_TABLEGEN_INCLUDES - ${PROJECT_SOURCE_DIR}/include - ${PROJECT_SOURCE_DIR}/include/NeuraDialect - ${CMAKE_CURRENT_BINARY_DIR}/include/NeuraDialect - ${MLIR_MAIN_INCLUDE_DIR} - ${MLIR_INCLUDE_DIR}) +# set(MLIR_TABLEGEN_INCLUDES +# ${PROJECT_SOURCE_DIR}/include +# ${PROJECT_SOURCE_DIR}/include/NeuraDialect +# ${CMAKE_CURRENT_BINARY_DIR}/include/NeuraDialect +# ${MLIR_MAIN_INCLUDE_DIR} +# ${MLIR_INCLUDE_DIR}) add_mlir_dialect(Neura neura) diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 2c1b3235..43ded351 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -320,12 +320,12 @@ def Neura_LoopControlOp: Op{ Index:$step, Index:$bound, DefaultValuedAttr:$loop_type, // Loop type: "lt", "le", "gt", "ge", "eq", "ne" - Variadic:$passthrough_args // Additional arguments to pass through to the successors + Variadic:$body_args // Additional arguments to pass through to the successors ); let results = (outs); let successors = (successor AnySuccessor:$body, // loop body successors AnySuccessor:$exit // exit successors ); - let assemblyFormat = "`current_index` `:` $current_index `,` `step` `:` $step `,` `bound` `:` $bound `,` `loop_type` `:` $loop_type (`passthrough` `(` $passthrough_args^ `:` type($passthrough_args) `)`)? `then` $body `else` $exit attr-dict"; + let assemblyFormat = "`current_index` `:` $current_index `,` `step` `:` $step `,` `bound` `:` $bound `,` `loop_type` `:` $loop_type `then` $body(`(`$body_args^ `:` type($body_args)`)`)? 
`else` $exit attr-dict"; } \ No newline at end of file diff --git a/include/NeuraDialect/NeuraPasses.h b/include/NeuraDialect/NeuraPasses.h index 27f039ae..ff168337 100644 --- a/include/NeuraDialect/NeuraPasses.h +++ b/include/NeuraDialect/NeuraPasses.h @@ -13,7 +13,7 @@ namespace mlir { namespace neura { - +void registerNeuraLegalizePassPipeline(); // Passes defined in GraphPasses.td #define GEN_PASS_DECL @@ -27,7 +27,7 @@ std::unique_ptr createTransformCtrlToDataFlowPass(); std::unique_ptr createLeveragePredicatedValuePass(); // Passes used for neura compiler -std::unique_ptr createGenerateDFGPass(); +// std::unique_ptr createGenerateDFGPass(); #define GEN_PASS_REGISTRATION #include "NeuraDialect/NeuraPasses.h.inc" diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index f4ea76a7..6615c2e4 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -36,18 +36,27 @@ def InsertCtrlMov : Pass<"insert-ctrl-mov", "ModuleOp"> { def TransformCtrlToDataFlow : Pass<"transform-ctrl-to-data-flow", "ModuleOp"> { let summary = "Inserts ctrl move operations in the Neura dialect"; - let description = - [{Transform ctrl to predicate-based data flow.}]; + let description = [{Transform ctrl to predicate - based data flow.}]; let constructor = "neura::createTransformCtrlToDataFlowPass()"; } def LeveragePredicatedValue : Pass<"leverage-predicated-value", "ModuleOp"> { let summary = "Convert values to predicated values in Neura dialect"; - let description = [{ - This pass converts regular values to predicated values in Neura dialect operations. - Each value is wrapped in a predicated value type with a default true predicate. - }]; + let description = [{This pass converts regular values to predicated values in + Neura dialect operations + .Each value is wrapped in a predicated value type + with a default true predicate.}]; let constructor = "neura::createLeveragePredicatedValuePass()"; } +//=========================================================// +// Passes for the CGRA Mapping +//=========================================================// +// def GenerateDFG : Pass<"generate-dfg", "ModuleOp"> { +// let summary = "Generates a Data Flow Graph (DFG) for the Neura dialect"; +// let description = +// [{This pass generates a DFG from the Neura dialect operations.}]; +// let constructor = "neura::createGenerateDFGPass()"; +// } + #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp index de7e8d19..e5402667 100644 --- a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp +++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp @@ -245,11 +245,15 @@ struct AffineForLowering : public OpRewritePattern { // 4. header: loop_control rewriter.setInsertionPointToEnd(headerBlock); + SmallVector bodyArgs; + bodyArgs.push_back(headerBlock->getArgument(0)); // current index + // You can add more arguments if needed + rewriter.create( loc, headerBlock->getArgument(0), // current index stepVal, upperBoundVal, rewriter.getStringAttr("lt"), - ValueRange{}, // passthrough + bodyArgs, // passthrough bodyBlock, exitBlock); // 5. 
body: clone forOp body, mapping index diff --git a/lib/NeuraDialect/CMakeLists.txt b/lib/NeuraDialect/CMakeLists.txt index 34f19ac4..d9a626b6 100644 --- a/lib/NeuraDialect/CMakeLists.txt +++ b/lib/NeuraDialect/CMakeLists.txt @@ -1,18 +1,18 @@ -# Set include paths for TableGen -set(MLIR_TABLEGEN_INCLUDES - "-I${PROJECT_SOURCE_DIR}/include" - "-I${PROJECT_SOURCE_DIR}/include/NeuraDialect" - "-I${CMAKE_CURRENT_BINARY_DIR}/include/NeuraDialect") +# # Set include paths for TableGen +# set(MLIR_TABLEGEN_INCLUDES +# "-I${PROJECT_SOURCE_DIR}/include" +# "-I${PROJECT_SOURCE_DIR}/include/NeuraDialect" +# "-I${CMAKE_CURRENT_BINARY_DIR}/include/NeuraDialect") -# Generate TableGen files -set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/include/NeuraDialect/Neura.td) -mlir_tablegen(Neura.h.inc -gen-op-decls ${MLIR_TABLEGEN_INCLUDES}) -mlir_tablegen(Neura.cpp.inc -gen-op-defs ${MLIR_TABLEGEN_INCLUDES}) -mlir_tablegen(NeuraDialect.h.inc -gen-dialect-decls ${MLIR_TABLEGEN_INCLUDES}) -mlir_tablegen(NeuraDialect.cpp.inc -gen-dialect-defs ${MLIR_TABLEGEN_INCLUDES}) -mlir_tablegen(NeuraTypes.h.inc -gen-typedef-decls ${MLIR_TABLEGEN_INCLUDES}) -mlir_tablegen(NeuraTypes.cpp.inc -gen-typedef-defs ${MLIR_TABLEGEN_INCLUDES}) -add_public_tablegen_target(MLIRNeuraDialectIncGen) +# # Generate TableGen files +# set(LLVM_TARGET_DEFINITIONS ${PROJECT_SOURCE_DIR}/include/NeuraDialect/Neura.td) +# mlir_tablegen(Neura.h.inc -gen-op-decls ${MLIR_TABLEGEN_INCLUDES}) +# mlir_tablegen(Neura.cpp.inc -gen-op-defs ${MLIR_TABLEGEN_INCLUDES}) +# mlir_tablegen(NeuraDialect.h.inc -gen-dialect-decls ${MLIR_TABLEGEN_INCLUDES}) +# mlir_tablegen(NeuraDialect.cpp.inc -gen-dialect-defs ${MLIR_TABLEGEN_INCLUDES}) +# mlir_tablegen(NeuraTypes.h.inc -gen-typedef-decls ${MLIR_TABLEGEN_INCLUDES}) +# mlir_tablegen(NeuraTypes.cpp.inc -gen-typedef-defs ${MLIR_TABLEGEN_INCLUDES}) +# add_public_tablegen_target(MLIRNeuraDialectIncGen) # Add the dialect library add_mlir_dialect_library(MLIRNeura @@ -24,7 +24,9 @@ add_mlir_dialect_library(MLIRNeura ${PROJECT_SOURCE_DIR}/include/NeuraDialect DEPENDS - MLIRNeuraDialectIncGen + MLIRNeuraIncGen + MLIRNeuraTransformsIncGen + MLIRConversionIncGen LINK_LIBS PUBLIC MLIRIR diff --git a/lib/NeuraDialect/NeuraPasses.cpp b/lib/NeuraDialect/NeuraPasses.cpp index 72b3a6a6..11b92b13 100644 --- a/lib/NeuraDialect/NeuraPasses.cpp +++ b/lib/NeuraDialect/NeuraPasses.cpp @@ -1,3 +1,27 @@ #include "mlir/Pass/PassManager.h" +#include "mlir/Pass/PassRegistry.h" #include "mlir/Transforms/Passes.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraOps.h" +#include "NeuraDialect/NeuraPasses.h" +#include "NeuraDialect/NeuraTypes.h" +#include "Conversion/ConversionPasses.h" + +// This pass pipeline can convert all the other dialects into the Neura dialect +void mlir::neura::registerNeuraLegalizePassPipeline() { + PassPipelineRegistration<>("neura-legalize", + "Legalize operations to Neura dialect", + [](OpPassManager &pm) { + // Convert all the other dialects into the Neura dialect + pm.addPass(mlir::createLowerAffineToNeuraPass()); + pm.addPass(mlir::createLowerArithToNeuraPass()); + pm.addPass(mlir::createLowerLlvmToNeuraPass()); + + // Insert data and control movement operations + // pm.addPass(mlir::neura::createLeveragePredicatedValuePass()); + // pm.addPass(mlir::neura::createInsertDataMovPass()); + // pm.addPass(mlir::neura::createInsertCtrlMovPass()); + // pm.addPass(mlir::neura::createTransformCtrlToDataFlowPass()); + }); +} \ No newline at end of file diff --git a/tools/CMakeLists.txt 
b/tools/CMakeLists.txt index 2f980553..8390f87c 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(mlir-neura-opt) -add_subdirectory(neura-interpreter) \ No newline at end of file +add_subdirectory(neura-interpreter) +add_subdirectory(neura-compiler) \ No newline at end of file diff --git a/tools/neura-compiler/CMakeLists.txt b/tools/neura-compiler/CMakeLists.txt new file mode 100644 index 00000000..69e78747 --- /dev/null +++ b/tools/neura-compiler/CMakeLists.txt @@ -0,0 +1,18 @@ +add_executable(neura-compiler neura-compiler.cpp) +get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) +get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) +set(LIBS + ${dialect_libs} + ${conversion_libs} + MLIRNeuraTransforms + MLIRConversion + MLIRNeura + MLIRTransforms + MLIROptLib + MLIRPass + MLIRIR + MLIRParser + MLIRSupport + ) + +target_link_libraries(neura-compiler PRIVATE ${LIBS}) \ No newline at end of file diff --git a/tools/neura-compiler/neura-compiler.cpp b/tools/neura-compiler/neura-compiler.cpp index e69de29b..8180709e 100644 --- a/tools/neura-compiler/neura-compiler.cpp +++ b/tools/neura-compiler/neura-compiler.cpp @@ -0,0 +1,32 @@ +// neura-compiler.cpp + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/InitAllDialects.h" +#include "mlir/InitAllPasses.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Tools/mlir-opt/MlirOptMain.h" + +#include "Conversion/ConversionPasses.h" +#include "NeuraDialect/NeuraDialect.h" +#include "NeuraDialect/NeuraPasses.h" + +int main(int argc, char **argv) { + // Registers MLIR dialects. + mlir::DialectRegistry registry; + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + registry.insert(); + + mlir::neura::registerNeuraLegalizePassPipeline(); + + // Runs the MLIR optimizer. 
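// For context, the `neura-legalize` pipeline registered in NeuraPasses.cpp
// above is what this driver exposes. A minimal usage sketch (the flag name
// comes from that PassPipelineRegistration; the invocation shape itself is
// an assumption, not taken from this patch):
//
//   ./bin/neura-compiler input.mlir --neura-legalize -o output.mlir
//
// which runs lower-affine-to-neura, lower-arith-to-neura, and
// lower-llvm-to-neura back to back in a single invocation.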
+ return mlir::asMainReturnCode( + mlir::MlirOptMain(argc, argv, "Neura Dialect Optimizer", registry)); +} \ No newline at end of file From 083aea8e0ad9fab3be31f3dd855e11a26a6f7bdf Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Sat, 14 Jun 2025 12:57:17 +0800 Subject: [PATCH 10/13] change the assembly format in neura.loop_control --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index adaf6c9b..9a83b5b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,10 @@ set(CMAKE_CXX_STANDARD_REQUIRED YES) add_compile_options(-g) -set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir) -set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm) -set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir) -set(MLIR_BINARY_DIR /home/lucas/llvm-project/build) +# set(MLIR_DIR /home/lucas/llvm-project/build/lib/cmake/mlir) +# set(LLVM_DIR /home/lucas/llvm-project/build/lib/cmake/llvm) +# set(MLIR_SRC_DIR /home/lucas/llvm-project/mlir) +# set(MLIR_BINARY_DIR /home/lucas/llvm-project/build) message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") From e917f0a6093d1ad2af86f72470ce4f4cdab7fcf8 Mon Sep 17 00:00:00 2001 From: ShangkunLI Date: Tue, 17 Jun 2025 11:59:53 +0800 Subject: [PATCH 11/13] support lowering affine to neura & add some tests --- .github/workflows/main.yml | 2 +- include/NeuraDialect/NeuraOps.td | 2 +- include/NeuraDialect/NeuraPasses.td | 10 - .../AffineToNeura/AffineToNeuraPass.cpp | 534 +++++++++--------- test/.lit_test_times.txt | 4 + test/affine2neura/deep-nested/deep_nested.cpp | 31 + .../affine2neura/deep-nested/deep_nested.mlir | 47 ++ .../deep-nested/deep_nested_neura.mlir | 125 ++++ test/affine2neura/gpt2-node11/node11.cpp | 6 +- test/affine2neura/gpt2-node11/node11.mlir | 31 + .../gpt2-node11/node11_neura.mlir | 40 ++ test/affine2neura/gpt2-node27/compile.sh | 2 +- test/affine2neura/gpt2-node27/node27.cpp | 2 +- test/affine2neura/gpt2-node27/node27.mlir | 30 + .../gpt2-node27/node27_neura.mlir | 48 ++ .../gpt2-node27/node27_unroll.mlir | 23 + test/affine2neura/gpt2-node30/node30.cpp | 2 +- test/affine2neura/gpt2-node30/node30.mlir | 33 ++ .../gpt2-node30/node30_neura.mlir | 50 ++ .../gpt2-node30/node30_unroll.mlir | 26 + test/affine2neura/simpleloop/compile.sh | 3 - test/affine2neura/simpleloop/simple.cpp | 12 - 22 files changed, 774 insertions(+), 289 deletions(-) create mode 100644 test/.lit_test_times.txt create mode 100644 test/affine2neura/deep-nested/deep_nested.cpp create mode 100644 test/affine2neura/deep-nested/deep_nested.mlir create mode 100644 test/affine2neura/deep-nested/deep_nested_neura.mlir create mode 100644 test/affine2neura/gpt2-node11/node11.mlir create mode 100644 test/affine2neura/gpt2-node11/node11_neura.mlir create mode 100644 test/affine2neura/gpt2-node27/node27.mlir create mode 100644 test/affine2neura/gpt2-node27/node27_neura.mlir create mode 100644 test/affine2neura/gpt2-node27/node27_unroll.mlir create mode 100644 test/affine2neura/gpt2-node30/node30.mlir create mode 100644 test/affine2neura/gpt2-node30/node30_neura.mlir create mode 100644 test/affine2neura/gpt2-node30/node30_unroll.mlir delete mode 100755 test/affine2neura/simpleloop/compile.sh delete mode 100644 test/affine2neura/simpleloop/simple.cpp diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 181c399d..98b116d3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,7 +40,7 @@ jobs: run: | git clone
https://github.com/llvm/llvm-project.git cd llvm-project - git checkout cd70802 + git checkout 6146a88 mkdir build && cd build cmake -G Ninja ../llvm \ -DLLVM_ENABLE_PROJECTS="mlir" \ diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td index 43ded351..686f8dd1 100644 --- a/include/NeuraDialect/NeuraOps.td +++ b/include/NeuraDialect/NeuraOps.td @@ -1,7 +1,6 @@ // NeuraOps.td - Custom operation definitions. include "NeuraDialect/NeuraDialect.td" -include "mlir/IR/CommonTypeConstraints.td" // ---------------------------------------------------- // Defines basic scalar operations. @@ -287,6 +286,7 @@ def Neura_ReserveOp : Op { // Defines loop related operations. // Loop iteration operation for index increament and compare +// TODO: Add support for more complex loop structures using LoopIterOp def Neura_LoopIterOp : Op { let summary = "CGRA-optimized loop iteration operation"; let description = [{ diff --git a/include/NeuraDialect/NeuraPasses.td b/include/NeuraDialect/NeuraPasses.td index 6615c2e4..b488924e 100644 --- a/include/NeuraDialect/NeuraPasses.td +++ b/include/NeuraDialect/NeuraPasses.td @@ -49,14 +49,4 @@ def LeveragePredicatedValue : Pass<"leverage-predicated-value", "ModuleOp"> { let constructor = "neura::createLeveragePredicatedValuePass()"; } -//=========================================================// -// Passes for the CGRA Mapping -//=========================================================// -// def GenerateDFG : Pass<"generate-dfg", "ModuleOp"> { -// let summary = "Generates a Data Flow Graph (DFG) for the Neura dialect"; -// let description = -// [{This pass generates a DFG from the Neura dialect operations.}]; -// let constructor = "neura::createGenerateDFGPass()"; -// } - #endif // NEURA_PASSES_TD \ No newline at end of file diff --git a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp index e5402667..9cf65348 100644 --- a/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp +++ b/lib/Conversion/AffineToNeura/AffineToNeuraPass.cpp @@ -1,3 +1,4 @@ +#include "Common/AcceleratorAttrs.h" #include "Conversion/ConversionPasses.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -5,12 +6,15 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Region.h" #include "mlir/IR/ValueRange.h" +#include "mlir/IR/Visitors.h" #include "mlir/Pass/Pass.h" #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" @@ -20,6 +24,7 @@ #include "NeuraDialect/NeuraDialect.h" #include "NeuraDialect/NeuraOps.h" #include "mlir/Transforms/RegionUtils.h" +#include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" #include @@ -31,291 +36,155 @@ using namespace mlir::func; #include "Conversion/ConversionPasses.h.inc" namespace { +LogicalResult convertAffineMapToIndices(AffineMap map, ValueRange map_operands, + Location loc, PatternRewriter &rewriter, + SmallVector &new_indices) { + new_indices.clear(); + new_indices.reserve(map.getNumResults()); + for (AffineExpr expr : map.getResults()) { + if (AffineConstantExpr const_expr = dyn_cast(expr)) { + IndexType index_type = rewriter.getIndexType(); + IntegerAttr value_attr = + rewriter.getIntegerAttr(index_type,
const_expr.getValue()); + new_indices.push_back(rewriter.create( + loc, index_type, value_attr, nullptr)); // nullptr is for predicated bit + } else if (AffineDimExpr dim_expr = dyn_cast(expr)) { + if (dim_expr.getPosition() >= map.getNumDims() || + dim_expr.getPosition() >= + map_operands + .size()) { // Check against mapOperands size for safety + return failure(); + } + new_indices.push_back(map_operands[dim_expr.getPosition()]); + } else if (AffineSymbolExpr sym_expr = dyn_cast(expr)) { + unsigned symbol_operand_index = map.getNumDims() + sym_expr.getPosition(); + if (symbol_operand_index >= map_operands.size()) { + return failure(); + } + new_indices.push_back(map_operands[symbol_operand_index]); + } else { + // For more complex affine expressions (e.g., d0 + c1), + // materialize the result using affine.apply. + // This is a temporary workaround for complex expressions. + // TODO: Handle more complex expressions. + llvm::errs() << "[affine2neura] Complex affine expression: " << expr + << "\n"; + AffineMap single_result_map = AffineMap::get( + map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); + Value complexIndex = rewriter.create( + loc, single_result_map, map_operands); + new_indices.push_back(complexIndex); + } + } + return success(); +} + struct AffineLoadLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(affine::AffineLoadOp loadOp, + LogicalResult matchAndRewrite(affine::AffineLoadOp load_op, PatternRewriter &rewriter) const override { - auto loc = loadOp.getLoc(); - auto memref = loadOp.getMemref(); - AffineMap map = loadOp.getAffineMap(); - ValueRange mapOperands = loadOp.getMapOperands(); - // Get the indices for the load operation - SmallVector newIndices; - newIndices.reserve(map.getNumResults()); - llvm::errs() << "Lowering affine load operation: " << loadOp << "\n"; - llvm::errs() << "Number of results in affine map: " << map.getNumResults() - << "\n"; - for (auto expr : map.getResults()) { - llvm::errs() << "Map expr: " << expr << "\n"; + Location loc = load_op.getLoc(); + auto memref = load_op.getMemref(); + AffineMap map = load_op.getAffineMap(); + ValueRange map_operands = load_op.getMapOperands(); + // Gets the indices for the load operation + SmallVector new_indices; + if (failed(convertAffineMapToIndices(map, map_operands, loc, rewriter, + new_indices))) { + return load_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); } - for (AffineExpr expr : map.getResults()) { - if (expr.isa()) { - auto constExpr = expr.cast(); - auto indexType = rewriter.getIndexType(); - auto valueAttr = - rewriter.getIntegerAttr(indexType, constExpr.getValue()); - newIndices.push_back(rewriter.create( - loc, indexType, valueAttr, nullptr)); - } else if (expr.isa()) { - auto dimExpr = expr.cast(); - if (dimExpr.getPosition() >= map.getNumDims() || - dimExpr.getPosition() >= - mapOperands - .size()) { // Check against mapOperands size for safety - return loadOp.emitError( - "affine map dimension out of bounds for map operands"); - } - newIndices.push_back(mapOperands[dimExpr.getPosition()]); - } else if (expr.isa()) { - auto symExpr = expr.cast(); - unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition(); - if (symbolOperandIndex >= mapOperands.size()) { - return loadOp.emitError( - "affine map symbol out of bounds for map operands"); - } - newIndices.push_back(mapOperands[symbolOperandIndex]); - } else { - // For more complex affine expressions (e.g., d0 + c1), - // 
materialize the result using affine.apply. - // neura.load_indexed expects individual index values. - // This is a temporary workaround for complex expressions. - llvm::errs() << "Complex affine expression: " << expr << "\n"; - AffineMap singleResultMap = AffineMap::get( - map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); - Value complexIndex = rewriter.create( - loc, singleResultMap, mapOperands); - newIndices.push_back(complexIndex); - } - } - - auto memRefType = memref.getType().cast(); - if (!memRefType) { - return loadOp.emitError("base of load is not a MemRefType"); + MemRefType memref_type = dyn_cast(memref.getType()); + if (!memref_type) { + return load_op.emitError( + "[affine2neura] Base of load is not a MemRefType"); } - if (newIndices.size() != static_cast(memRefType.getRank())) { - return loadOp.emitError("number of indices from affine map (") - << newIndices.size() << ") does not match memref rank (" - << memRefType.getRank() << ")"; + if (new_indices.size() != static_cast(memref_type.getRank())) { + return load_op.emitError( + "[affine2neura] Number of indices from affine map (") + << new_indices.size() << ") does not match memref rank (" + << memref_type.getRank() << ")"; } // Create the neura.load_indexed operation - auto newLoadOp = rewriter.create( - loc, loadOp.getType(), memref, ValueRange{newIndices}, nullptr); + LoadIndexedOp new_load_op = rewriter.create( + loc, load_op.getType(), memref, ValueRange{new_indices}, nullptr); // nullptr is for predicated bit - rewriter.replaceOp(loadOp, newLoadOp.getResult()); + rewriter.replaceOp(load_op, new_load_op.getResult()); return success(); } }; struct AffineStoreLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(affine::AffineStoreOp storeOp, + LogicalResult matchAndRewrite(affine::AffineStoreOp store_op, PatternRewriter &rewriter) const override { - auto loc = storeOp.getLoc(); - auto memref = storeOp.getMemref(); - auto value = storeOp.getValueToStore(); - AffineMap map = storeOp.getAffineMap(); - ValueRange mapOperands = storeOp.getMapOperands(); - - SmallVector newIndices; - newIndices.reserve(map.getNumResults()); - - for (AffineExpr expr : map.getResults()) { - if (expr.isa()) { - auto constExpr = expr.cast(); - auto indexType = rewriter.getIndexType(); - auto valueAttr = - rewriter.getIntegerAttr(indexType, constExpr.getValue()); - newIndices.push_back(rewriter.create( - loc, indexType, valueAttr, nullptr)); - } else if (expr.isa()) { - auto dimExpr = expr.cast(); - if (dimExpr.getPosition() >= map.getNumDims() || - dimExpr.getPosition() >= mapOperands.size()) { - return storeOp.emitError( - "affine map dimension out of bounds for map operands"); - } - newIndices.push_back(mapOperands[dimExpr.getPosition()]); - } else if (expr.isa()) { - auto symExpr = expr.cast(); - unsigned symbolOperandIndex = map.getNumDims() + symExpr.getPosition(); - if (symbolOperandIndex >= mapOperands.size()) { - return storeOp.emitError( - "affine map symbol out of bounds for map operands"); - } - newIndices.push_back(mapOperands[symbolOperandIndex]); - } else { - // For more complex affine expressions, materialize the result using - // affine.apply. This is a temporary workaround for complex expressions. 
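// As a concrete illustration of this workaround (the SSA names here are
// hypothetical), a store through the composite map (d0) -> (d0 + 1) is
// expected to be materialized as:
//
//   %idx = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//   neura.store_indexed %val to memref<16xf32> %base[%idx] : f32
//
// so store_indexed still receives one plain index value per memref
// dimension; the same (d0 + 1) map appears in the unrolled tests
// (node27_unroll.mlir, node30_unroll.mlir) further down.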
- AffineMap singleResultMap = AffineMap::get( - map.getNumDims(), map.getNumSymbols(), expr, rewriter.getContext()); - Value complexIndex = rewriter.create( - loc, singleResultMap, mapOperands); - newIndices.push_back(complexIndex); - } + Location loc = store_op.getLoc(); + auto memref = store_op.getMemref(); + Value value = store_op.getValueToStore(); + AffineMap map = store_op.getAffineMap(); + ValueRange mapOperands = store_op.getMapOperands(); + + SmallVector newIndices; + if (failed(convertAffineMapToIndices(map, mapOperands, loc, rewriter, + newIndices))) { + return store_op.emitError( + "[affine2neura] Failed to convert affine map to indices"); } - auto memRefType = memref.getType().cast(); + MemRefType memRefType = dyn_cast(memref.getType()); if (!memRefType) { - return storeOp.emitError("base of store is not a MemRefType"); + return store_op.emitError( + "[affine2neura] Base of store is not a MemRefType"); } if (newIndices.size() != static_cast(memRefType.getRank())) { - return storeOp.emitError("number of indices from affine map (") + return store_op.emitError( + "[affine2neura] Number of indices from affine map (") << newIndices.size() << ") does not match memref rank (" << memRefType.getRank() << ")"; } rewriter.create(loc, value, memref, - ValueRange{newIndices}, nullptr); - rewriter.eraseOp(storeOp); - return success(); - } -}; - -struct AffineForLowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(affine::AffineForOp forOp, - PatternRewriter &rewriter) const override { - auto loc = forOp.getLoc(); - auto indexType = rewriter.getIndexType(); - - // 1. Extract loop parameters (lower bound, upper bound, step) - Value lowerBoundVal; - if (forOp.hasConstantLowerBound()) { - int lowerBoundConstant = forOp.getConstantLowerBound(); - auto lowerBoundAttr = - rewriter.getIntegerAttr(indexType, lowerBoundConstant); - lowerBoundVal = rewriter.create( - loc, indexType, lowerBoundAttr, nullptr); - } else { - // If the lower bound is not constant, we need to use affine.apply - // This is a temporary workaround for non-constant lower bounds. - llvm::errs() << "Using affine.apply for unconstant lower bound\n"; - affine::AffineBound lowerBound = forOp.getLowerBound(); - AffineMap lowerBoundMap = lowerBound.getMap(); - ValueRange lowerBoundOperands = forOp.getLowerBoundOperands(); - lowerBoundVal = rewriter.create( - loc, lowerBoundMap, lowerBoundOperands); - } - - Value upperBoundVal; - if (forOp.hasConstantUpperBound()) { - int upperBoundConstant = forOp.getConstantUpperBound(); - auto upperBoundAttr = - rewriter.getIntegerAttr(indexType, upperBoundConstant); - upperBoundVal = rewriter.create( - loc, indexType, upperBoundAttr, nullptr); - } else { - // For non-constant upper bounds, we also use affine.apply - llvm::errs() << "Using affine.apply for unconstant upper bound\n"; - affine::AffineBound upperBound = forOp.getUpperBound(); - AffineMap upperBoundMap = upperBound.getMap(); - ValueRange upperBoundOperands = forOp.getUpperBoundOperands(); - upperBoundVal = rewriter.create( - loc, upperBoundMap, upperBoundOperands); - } - - auto stepAttr = rewriter.getIntegerAttr(indexType, forOp.getStep()); - Value stepVal = - rewriter.create(loc, indexType, stepAttr, nullptr); - llvm::errs() << "lower bound: " << lowerBoundVal - << ", upper bound: " << upperBoundVal << ", step: " << stepVal - << "\n"; - - // 2. 
Block structure - Block *originBlock = rewriter.getInsertionBlock(); - auto originPoint = rewriter.getInsertionPoint(); - Region *parentRegion = originBlock->getParent(); - - Block *headerBlock = rewriter.createBlock( - parentRegion, std::next(Region::iterator(originBlock)), {indexType}, - {loc}); - Block *bodyBlock = rewriter.createBlock( - parentRegion, std::next(Region::iterator(headerBlock)), {indexType}, - {loc}); - Block *exitBlock = rewriter.createBlock( - parentRegion, std::next(Region::iterator(bodyBlock))); - Block *continueBlock = rewriter.splitBlock(originBlock, originPoint); - - // 3. origin -> header - rewriter.setInsertionPointToEnd(originBlock); - rewriter.create(loc, ValueRange{lowerBoundVal}, headerBlock); - - // 4. header: loop_control - rewriter.setInsertionPointToEnd(headerBlock); - SmallVector bodyArgs; - bodyArgs.push_back(headerBlock->getArgument(0)); // current index - // You can add more arguments if needed - - rewriter.create( - loc, - headerBlock->getArgument(0), // current index - stepVal, upperBoundVal, rewriter.getStringAttr("lt"), - bodyArgs, // passthrough - bodyBlock, exitBlock); - - // 5. body: clone forOp body, mapping index - rewriter.setInsertionPointToStart(bodyBlock); - Value currentIndex = bodyBlock->getArgument(0); - if (!forOp.getRegion().empty()) { - Block &sourceBlock = forOp.getRegion().front(); - IRMapping mapping; - mapping.map(sourceBlock.getArgument(0), currentIndex); - for (auto &op : llvm::make_range(sourceBlock.begin(), - std::prev(sourceBlock.end()))) { - Operation *clonedOp = rewriter.clone(op, mapping); - for (unsigned i = 0; i < op.getNumResults(); ++i) - mapping.map(op.getResult(i), clonedOp->getResult(i)); - } - } - - // 6. End of body branches back to header, passing the current index - rewriter.setInsertionPointToEnd(bodyBlock); - rewriter.create(loc, ValueRange{currentIndex}, headerBlock); - - // 7. exit branches to continue - rewriter.setInsertionPointToEnd(exitBlock); - rewriter.create(loc, ValueRange{}, continueBlock); - - // 8. Erase the original affine.for - rewriter.eraseOp(forOp); - - return success(); - } -}; - struct AffineApplyLowering : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(affine::AffineApplyOp applyOp, + LogicalResult matchAndRewrite(affine::AffineApplyOp apply_op, PatternRewriter &rewriter) const override { - AffineMap map = applyOp.getAffineMap(); - ValueRange operands = applyOp.getMapOperands(); - auto loc = applyOp.getLoc(); + AffineMap map = apply_op.getAffineMap(); + ValueRange operands = apply_op.getMapOperands(); + Location loc = apply_op.getLoc(); if (map.getNumResults() != 1) { - return applyOp.emitError("AffineApplyOp must have a single result"); + return apply_op.emitError( + "[affine2neura] AffineApplyOp must have a single result"); } AffineExpr expr = map.getResult(0); - // d0 + cst - if (expr.isa()) { - auto binExpr = expr.cast(); - if (binExpr.getKind() == AffineExprKind::Add) { - if (binExpr.getLHS().isa()) { - auto dim = binExpr.getLHS().cast(); - if (binExpr.getRHS().isa()) { - auto cst = binExpr.getRHS().cast(); - auto cstVal = rewriter.create( + // Handle simple affine expressions like d0 + cst + // TODO: Handle more complex expressions + if (isa(expr)) { + AffineBinaryOpExpr bin_expr = dyn_cast(expr); + if (bin_expr.getKind() == AffineExprKind::Add) { + if (isa(bin_expr.getLHS())) { + AffineDimExpr dim = dyn_cast(bin_expr.getLHS()); + if (isa(bin_expr.getRHS())) { + AffineConstantExpr cst = + dyn_cast(bin_expr.getRHS()); + neura::ConstantOp cstVal = rewriter.create( loc, rewriter.getIndexType(), rewriter.getIntegerAttr(rewriter.getIndexType(), cst.getValue()), - nullptr); - auto addOp = rewriter.create( + nullptr); // nullptr is for predicated bit + neura::AddOp addOp = rewriter.create( loc, cstVal.getType(), operands[dim.getPosition()], cstVal, - nullptr); - rewriter.replaceOp(applyOp, addOp.getResult()); + nullptr); // nullptr is for predicated bit + rewriter.replaceOp(apply_op, addOp.getResult()); return success(); } } @@ -324,12 +193,143 @@ struct AffineApplyLowering : public OpRewritePattern { // You can add more cases here for different affine expressions // For now, we will just emit an error for unsupported expressions.
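// Illustrative examples (not taken from the test suite) of what this
// pattern accepts and rejects:
//
//   accepted: %r = affine.apply affine_map<(d0) -> (d0 + 1)>(%i)
//             (lowered to a neura::ConstantOp holding 1 plus a neura::AddOp)
//   rejected: %r = affine.apply affine_map<(d0, d1) -> (d0 * 16 + d1)>(%i, %j)
//
// Any map whose single result is not of the form `dim + constant` falls
// through to the emitError path right below.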
- return applyOp.emitError( - "Unsupported complex affine expression in AffineApplyOp.\n") + return apply_op.emitError("[affine2neura] Unsupported complex affine " + "expression in AffineApplyOp.\n") << "Only simple affine expressions like d0 + cst are supported.\n"; } }; +LogicalResult lowerAffineFor(affine::AffineForOp for_op, OpBuilder &builder, + IRMapping &value_mapping) { + llvm::errs() << "[affine2neura] Lowering AffineForOp: " << for_op << "\n"; + Location loc = for_op.getLoc(); + IndexType index_type = builder.getIndexType(); + + // 1 Extracts loop parameters (lower bound, upper bound, step) + Value lower_bound_val; + if (for_op.hasConstantLowerBound()) { + int64_t lower_bound_constant = for_op.getConstantLowerBound(); + lower_bound_val = builder.create( + loc, index_type, builder.getIndexAttr(lower_bound_constant), nullptr); // nullptr is for predicated bit + } else { + // If the lower bound is not constant, we need to use affine.apply + affine::AffineBound lower_bound = for_op.getLowerBound(); + AffineMap lower_bound_map = lower_bound.getMap(); + ValueRange lower_bound_operands = for_op.getLowerBoundOperands(); + lower_bound_val = builder.create( + loc, lower_bound_map, lower_bound_operands); + } + + Value upper_bound_val; + if (for_op.hasConstantUpperBound()) { + int64_t upper_bound_constant = for_op.getConstantUpperBound(); + upper_bound_val = builder.create( + loc, index_type, builder.getIndexAttr(upper_bound_constant), nullptr); // nullptr is for predicated bit + } else { + // For non-constant upper bounds, we also use affine.apply + affine::AffineBound upper_bound = for_op.getUpperBound(); + AffineMap upper_bound_map = upper_bound.getMap(); + ValueRange upper_bound_operands = for_op.getUpperBoundOperands(); + upper_bound_val = builder.create( + loc, upper_bound_map, upper_bound_operands); + } + + Value step_val = builder.create( + loc, index_type, builder.getIndexAttr(for_op.getStepAsInt()), nullptr); // nullptr is for predicated bit + + // 2 Creates the block structure + Block *origin_block = builder.getInsertionBlock(); + auto origin_point = builder.getInsertionPoint(); + Region *parent_region = origin_block->getParent(); + + // 2.1 Creates the header block + Block *header_block = builder.createBlock( + parent_region, std::next(Region::iterator(origin_block)), {index_type}, + {loc}); + // 2.2 Creates the body block + Block *body_block = builder.createBlock( + parent_region, std::next(Region::iterator(header_block)), {index_type}, + {loc}); + // 2.3 Creates the exit block + Block *exit_block = builder.createBlock( + parent_region, std::next(Region::iterator(body_block))); + // 2.4 Creates the continue block + Block *continue_block = origin_block->splitBlock(origin_point); + + // 3 Connects the blocks + // 3.1 Connects origin_block -> header_block + builder.setInsertionPointToEnd(origin_block); + builder.create(loc, ValueRange{lower_bound_val}, header_block); + + // 3.2 Connects header_block -> body_block + builder.setInsertionPointToEnd(header_block); + SmallVector body_args; + body_args.push_back(header_block->getArgument(0)); // current index + builder.create( + loc, header_block->getArgument(0), step_val, upper_bound_val, + builder.getStringAttr("lt"), body_args, body_block, exit_block); + + // 3.3 Clones the body of the original affine.for operation + // Assumes the body of the affine.for operation is a single block, + // so nested affine.for operations must be handled in order + // (from outermost to innermost). + builder.setInsertionPointToStart(body_block); + Value current_index = body_block->getArgument(0); + if (!for_op.getRegion().empty()) { + Block &source_block = for_op.getRegion().front(); + IRMapping mapping; + mapping.map(source_block.getArgument(0), current_index); + for (Operation &op : llvm::make_range(source_block.begin(), + std::prev(source_block.end()))) { + Operation *cloned_op = builder.clone(op, mapping); + for (unsigned i = 0; i < op.getNumResults(); ++i) + mapping.map(op.getResult(i), cloned_op->getResult(i)); + } + } + + // 3.4 Connects body_block -> header_block + builder.setInsertionPointToEnd(body_block); + builder.create(loc, ValueRange{current_index}, header_block); + + // 3.5 Connects exit_block -> continue_block + builder.setInsertionPointToEnd(exit_block); + builder.create(loc, ValueRange{}, continue_block); + + builder.setInsertionPointToStart(continue_block); + + for_op.erase(); + + return success(); +} + +affine::AffineForOp findOuterMostAffineFor(func::FuncOp &func_op) { + // Finds the outermost affine.for operation + affine::AffineForOp top_for_op = nullptr; + func_op.walk([&](affine::AffineForOp for_op) { + // Checks if this for_op has any AffineForOp parent + Operation *parent_op = for_op->getParentOp(); + bool has_affine_for_parent = false; + + while (parent_op) { + if (isa(parent_op)) { + has_affine_for_parent = true; + break; + } + parent_op = parent_op->getParentOp(); + } + + // If it has no AffineForOp parent, it's a top-level loop + if (!has_affine_for_parent) { + top_for_op = for_op; // Store the found operation + return WalkResult::interrupt(); // Stop walking + } + + return WalkResult::advance(); // Continue walking + }); + + return top_for_op; // Return the found operation +} + struct LowerAffineToNeuraPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LowerAffineToNeuraPass) @@ -345,18 +345,40 @@ struct LowerAffineToNeuraPass } void runOnOperation() override { - ModuleOp moduleOp = getOperation(); - MLIRContext *context = moduleOp.getContext(); - - RewritePatternSet patterns(context); - patterns.add(context); - - if (failed(applyPatternsAndFoldGreedily(getOperation(), - std::move(patterns)))) { - moduleOp.emitError("Failed to lower affine operations to Neura dialect"); - signalPassFailure(); - } + ModuleOp module_op = getOperation(); + MLIRContext *context = module_op.getContext(); + IRMapping mapping; + module_op.walk( + [&](func::FuncOp func_op) { + if (func_op->hasAttr(mlir::accel::kAcceleratorAttr)) { + auto target = func_op->getAttrOfType( + mlir::accel::kAcceleratorAttr); + if (target && target.getValue() == mlir::accel::kNeuraTarget) { + while (affine::AffineForOp outer_for_op = + findOuterMostAffineFor(func_op)) { + llvm::errs() + << "[affine2neura] Found outermost affine.for operation: " + << outer_for_op << "\n"; + OpBuilder builder(outer_for_op); + if (failed(lowerAffineFor(outer_for_op, builder, mapping))) { + outer_for_op.emitError("[affine2neura] Failed to lower " + "outermost affine.for operation"); + signalPassFailure(); + } + } + + RewritePatternSet patterns(context); + patterns.add(context); + + if (failed(applyPatternsGreedily(func_op.getOperation(), + std::move(patterns)))) { + func_op.emitError("[affine2neura] Failed to lower affine " + "operations to Neura dialect"); + signalPassFailure(); + } + } + } + }); } }; } // namespace diff --git a/test/.lit_test_times.txt b/test/.lit_test_times.txt new file mode 100644 index 00000000..961067b1 --- /dev/null +++
b/test/.lit_test_times.txt @@ -0,0 +1,4 @@ +7.853746e-03 affine2neura/gpt2-node27/node27.mlir +1.136017e-02 affine2neura/deep-nested/deep_nested.mlir +7.997274e-03 affine2neura/gpt2-node11/node11.mlir +7.548809e-03 affine2neura/gpt2-node30/node30.mlir diff --git a/test/affine2neura/deep-nested/deep_nested.cpp b/test/affine2neura/deep-nested/deep_nested.cpp new file mode 100644 index 00000000..405e6c5b --- /dev/null +++ b/test/affine2neura/deep-nested/deep_nested.cpp @@ -0,0 +1,31 @@ +int input_data[3][3][3]; +int output_data[3][3][3]; +float weights[3]; + +int deep_nested() { + // 10 nested loops + for (int i0 = 0; i0 < 3; i0++) { + for (int i1 = 0; i1 < 3; i1++) { + for (int i2 = 0; i2 < 3; i2++) { + for (int i3 = 0; i3 < 3; i3++) { + for (int i4 = 0; i4 < 3; i4++) { + for (int i5 = 0; i5 < 3; i5++) { + for (int i6 = 0; i6 < 3; i6++) { + for (int i7 = 0; i7 < 3; i7++) { + for (int i8 = 0; i8 < 3; i8++) { + for (int i9 = 0; i9 < 3; i9++) { + // Assuming some operation on input_data + output_data[i0][i1][i2] += + input_data[i0][i1][i2]; + } + } + } + } + } + } + } + } + } + } + return 0; +} diff --git a/test/affine2neura/deep-nested/deep_nested.mlir b/test/affine2neura/deep-nested/deep_nested.mlir new file mode 100644 index 00000000..cb5c5db9 --- /dev/null +++ b/test/affine2neura/deep-nested/deep_nested.mlir @@ -0,0 +1,47 @@ +// Check that the affine loop nest is correctly transformed to neura.loop_control +// RUN: mlir-neura-opt %s --assign-accelerator --lower-affine-to-neura | FileCheck %s +module attributes {} { + memref.global @input_data : memref<3x3x3xi32> = uninitialized + memref.global @output_data : memref<3x3x3xi32> = uninitialized + func.func @_Z11deep_nestedv() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @output_data : memref<3x3x3xi32> + %1 = memref.get_global @input_data : memref<3x3x3xi32> + affine.for %arg0 = 0 to 3 { + affine.for %arg1 = 0 to 3 { + affine.for %arg2 = 0 to 3 { + affine.for %arg3 = 0 to 3 { + affine.for %arg4 = 0 to 3 { + affine.for %arg5 = 0 to 3 { + affine.for %arg6 = 0 to 3 { + affine.for %arg7 = 0 to 3 { + %2 = affine.load %1[%arg0, %arg1, %arg2] : memref<3x3x3xi32> + affine.for %arg8 = 0 to 3 { + affine.for %arg9 = 0 to 3 { + %3 = affine.load %0[%arg0, %arg1, %arg2] : memref<3x3x3xi32> + %4 = arith.addi %3, %2 : i32 + affine.store %4, %0[%arg0, %arg1, %arg2] : memref<3x3x3xi32> + } + } + } + } + } + } + } + } + } + } + return %c0_i32 : i32 + } +} + +// Verify function signature is preserved +// CHECK-LABEL: func.func @_Z11deep_nestedv() -> i32 + +// Verify all affine operations are eliminated +// CHECK-NOT: affine.for +// CHECK-NOT: affine.load +// CHECK-NOT: affine.store +// CHECK-NOT: affine.apply + +// CHECK-COUNT-10: neura.loop_control diff --git a/test/affine2neura/deep-nested/deep_nested_neura.mlir b/test/affine2neura/deep-nested/deep_nested_neura.mlir new file mode 100644 index 00000000..368b6dc5 --- /dev/null +++ b/test/affine2neura/deep-nested/deep_nested_neura.mlir @@ -0,0 +1,125 @@ +module { + memref.global @input_data : memref<3x3x3xi32> = uninitialized + memref.global @output_data : memref<3x3x3xi32> = uninitialized + func.func @_Z11deep_nestedv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @output_data : memref<3x3x3xi32> + %1 = memref.get_global @input_data : memref<3x3x3xi32> + %2 = neura.constant {value = 0 : index} : index + %3 = neura.constant {value = 3 : index} : index + %4 = 
neura.constant {value = 1 : index} : index + neura.br %2 : index to ^bb2 + ^bb1: // pred: ^bb40 + return %c0_i32 : i32 + ^bb2(%5: index): // 2 preds: ^bb0, ^bb4 + neura.loop_control current_index : %5, step : %4, bound : %3, loop_type : "lt" then ^bb3(%5 : index) else ^bb40 + ^bb3(%6: index): // pred: ^bb2 + %7 = neura.constant {value = 0 : index} : index + %8 = neura.constant {value = 3 : index} : index + %9 = neura.constant {value = 1 : index} : index + neura.br %7 : index to ^bb5 + ^bb4: // pred: ^bb39 + neura.br %6 : index to ^bb2 + ^bb5(%10: index): // 2 preds: ^bb3, ^bb7 + neura.loop_control current_index : %10, step : %9, bound : %8, loop_type : "lt" then ^bb6(%10 : index) else ^bb39 + ^bb6(%11: index): // pred: ^bb5 + %12 = neura.constant {value = 0 : index} : index + %13 = neura.constant {value = 3 : index} : index + %14 = neura.constant {value = 1 : index} : index + neura.br %12 : index to ^bb8 + ^bb7: // pred: ^bb38 + neura.br %11 : index to ^bb5 + ^bb8(%15: index): // 2 preds: ^bb6, ^bb10 + neura.loop_control current_index : %15, step : %14, bound : %13, loop_type : "lt" then ^bb9(%15 : index) else ^bb38 + ^bb9(%16: index): // pred: ^bb8 + %17 = neura.constant {value = 0 : index} : index + %18 = neura.constant {value = 3 : index} : index + %19 = neura.constant {value = 1 : index} : index + neura.br %17 : index to ^bb11 + ^bb10: // pred: ^bb37 + neura.br %16 : index to ^bb8 + ^bb11(%20: index): // 2 preds: ^bb9, ^bb13 + neura.loop_control current_index : %20, step : %19, bound : %18, loop_type : "lt" then ^bb12(%20 : index) else ^bb37 + ^bb12(%21: index): // pred: ^bb11 + %22 = neura.constant {value = 0 : index} : index + %23 = neura.constant {value = 3 : index} : index + %24 = neura.constant {value = 1 : index} : index + neura.br %22 : index to ^bb14 + ^bb13: // pred: ^bb36 + neura.br %21 : index to ^bb11 + ^bb14(%25: index): // 2 preds: ^bb12, ^bb16 + neura.loop_control current_index : %25, step : %24, bound : %23, loop_type : "lt" then ^bb15(%25 : index) else ^bb36 + ^bb15(%26: index): // pred: ^bb14 + %27 = neura.constant {value = 0 : index} : index + %28 = neura.constant {value = 3 : index} : index + %29 = neura.constant {value = 1 : index} : index + neura.br %27 : index to ^bb17 + ^bb16: // pred: ^bb35 + neura.br %26 : index to ^bb14 + ^bb17(%30: index): // 2 preds: ^bb15, ^bb19 + neura.loop_control current_index : %30, step : %29, bound : %28, loop_type : "lt" then ^bb18(%30 : index) else ^bb35 + ^bb18(%31: index): // pred: ^bb17 + %32 = neura.constant {value = 0 : index} : index + %33 = neura.constant {value = 3 : index} : index + %34 = neura.constant {value = 1 : index} : index + neura.br %32 : index to ^bb20 + ^bb19: // pred: ^bb34 + neura.br %31 : index to ^bb17 + ^bb20(%35: index): // 2 preds: ^bb18, ^bb22 + neura.loop_control current_index : %35, step : %34, bound : %33, loop_type : "lt" then ^bb21(%35 : index) else ^bb34 + ^bb21(%36: index): // pred: ^bb20 + %37 = neura.constant {value = 0 : index} : index + %38 = neura.constant {value = 3 : index} : index + %39 = neura.constant {value = 1 : index} : index + neura.br %37 : index to ^bb23 + ^bb22: // pred: ^bb33 + neura.br %36 : index to ^bb20 + ^bb23(%40: index): // 2 preds: ^bb21, ^bb25 + neura.loop_control current_index : %40, step : %39, bound : %38, loop_type : "lt" then ^bb24(%40 : index) else ^bb33 + ^bb24(%41: index): // pred: ^bb23 + %42 = neura.load_indexed memref<3x3x3xi32> %1[%6, %11, %16] : i32 + %43 = neura.constant {value = 0 : index} : index + %44 = neura.constant {value = 3 : index} : index + %45 = 
neura.constant {value = 1 : index} : index + neura.br %43 : index to ^bb26 + ^bb25: // pred: ^bb32 + neura.br %41 : index to ^bb23 + ^bb26(%46: index): // 2 preds: ^bb24, ^bb28 + neura.loop_control current_index : %46, step : %45, bound : %44, loop_type : "lt" then ^bb27(%46 : index) else ^bb32 + ^bb27(%47: index): // pred: ^bb26 + %48 = neura.constant {value = 0 : index} : index + %49 = neura.constant {value = 3 : index} : index + %50 = neura.constant {value = 1 : index} : index + neura.br %48 : index to ^bb29 + ^bb28: // pred: ^bb31 + neura.br %47 : index to ^bb26 + ^bb29(%51: index): // 2 preds: ^bb27, ^bb30 + neura.loop_control current_index : %51, step : %50, bound : %49, loop_type : "lt" then ^bb30(%51 : index) else ^bb31 + ^bb30(%52: index): // pred: ^bb29 + %53 = neura.load_indexed memref<3x3x3xi32> %0[%6, %11, %16] : i32 + %54 = arith.addi %53, %42 : i32 + neura.store_indexed %54 to memref<3x3x3xi32> %0[%6, %11, %16] : i32 + neura.br %52 : index to ^bb29 + ^bb31: // pred: ^bb29 + neura.br : to ^bb28 + ^bb32: // pred: ^bb26 + neura.br : to ^bb25 + ^bb33: // pred: ^bb23 + neura.br : to ^bb22 + ^bb34: // pred: ^bb20 + neura.br : to ^bb19 + ^bb35: // pred: ^bb17 + neura.br : to ^bb16 + ^bb36: // pred: ^bb14 + neura.br : to ^bb13 + ^bb37: // pred: ^bb11 + neura.br : to ^bb10 + ^bb38: // pred: ^bb8 + neura.br : to ^bb7 + ^bb39: // pred: ^bb5 + neura.br : to ^bb4 + ^bb40: // pred: ^bb2 + neura.br : to ^bb1 + } +} + diff --git a/test/affine2neura/gpt2-node11/node11.cpp b/test/affine2neura/gpt2-node11/node11.cpp index 45e4262c..fdd7519f 100644 --- a/test/affine2neura/gpt2-node11/node11.cpp +++ b/test/affine2neura/gpt2-node11/node11.cpp @@ -1,12 +1,12 @@ float input[1][16][64]; float output[1][16]; -int main() { +int node11() { for (int arg2 = 0; arg2 < 1; arg2++) { for (int arg3 = 0; arg3 < 16; arg3++) { - for (int arg4 = 0; arg4 < 64; arg4+=1) { + for (int arg4 = 0; arg4 < 64; arg4+=1) output[arg2][arg3] += input[arg2][arg3][arg4]; - } } } + return 0; } \ No newline at end of file diff --git a/test/affine2neura/gpt2-node11/node11.mlir b/test/affine2neura/gpt2-node11/node11.mlir new file mode 100644 index 00000000..795bb45f --- /dev/null +++ b/test/affine2neura/gpt2-node11/node11.mlir @@ -0,0 +1,31 @@ +// Check that the affine loop nest is correctly transformed to neura.loop_control +// RUN: mlir-neura-opt %s --assign-accelerator --lower-affine-to-neura | FileCheck %s +module attributes {} { + memref.global @input : memref<1x16x64xf32> = uninitialized + memref.global @output : memref<1x16xf32> = uninitialized + func.func @_Z6node11v() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @output : memref<1x16xf32> + %1 = memref.get_global @input : memref<1x16x64xf32> + affine.for %arg0 = 0 to 16 { + affine.for %arg1 = 0 to 64 { + %2 = affine.load %1[0, %arg0, %arg1] : memref<1x16x64xf32> + %3 = affine.load %0[0, %arg0] : memref<1x16xf32> + %4 = arith.addf %3, %2 : f32 + affine.store %4, %0[0, %arg0] : memref<1x16xf32> + } + } + return %c0_i32 : i32 + } +} + +// Verify function signature is preserved +// CHECK-LABEL: func.func @_Z6node11v() -> i32 + +// Verify all affine operations are eliminated +// CHECK-NOT: affine.for +// CHECK-NOT: affine.load +// CHECK-NOT: affine.store +// CHECK-NOT: affine.apply + +// CHECK-COUNT-2: neura.loop_control diff --git a/test/affine2neura/gpt2-node11/node11_neura.mlir b/test/affine2neura/gpt2-node11/node11_neura.mlir new file mode 100644 index 00000000..ccc214c7 --- /dev/null +++ 
b/test/affine2neura/gpt2-node11/node11_neura.mlir @@ -0,0 +1,40 @@ +module { + memref.global @input : memref<1x16x64xf32> = uninitialized + memref.global @output : memref<1x16xf32> = uninitialized + func.func @_Z6node11v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} { + %c0_i32 = arith.constant 0 : i32 + %0 = memref.get_global @output : memref<1x16xf32> + %1 = memref.get_global @input : memref<1x16x64xf32> + %2 = neura.constant {value = 0 : index} : index + %3 = neura.constant {value = 16 : index} : index + %4 = neura.constant {value = 1 : index} : index + neura.br %2 : index to ^bb2 + ^bb1: // pred: ^bb8 + return %c0_i32 : i32 + ^bb2(%5: index): // 2 preds: ^bb0, ^bb4 + neura.loop_control current_index : %5, step : %4, bound : %3, loop_type : "lt" then ^bb3(%5 : index) else ^bb8 + ^bb3(%6: index): // pred: ^bb2 + %7 = neura.constant {value = 0 : index} : index + %8 = neura.constant {value = 64 : index} : index + %9 = neura.constant {value = 1 : index} : index + neura.br %7 : index to ^bb5 + ^bb4: // pred: ^bb7 + neura.br %6 : index to ^bb2 + ^bb5(%10: index): // 2 preds: ^bb3, ^bb6 + neura.loop_control current_index : %10, step : %9, bound : %8, loop_type : "lt" then ^bb6(%10 : index) else ^bb7 + ^bb6(%11: index): // pred: ^bb5 + %12 = neura.constant {value = 0 : index} : index + %13 = neura.load_indexed memref<1x16x64xf32> %1[%12, %6, %11] : f32 + %14 = neura.constant {value = 0 : index} : index + %15 = neura.load_indexed memref<1x16xf32> %0[%14, %6] : f32 + %16 = arith.addf %15, %13 : f32 + %17 = neura.constant {value = 0 : index} : index + neura.store_indexed %16 to memref<1x16xf32> %0[%17, %6] : f32 + neura.br %11 : index to ^bb5 + ^bb7: // pred: ^bb5 + neura.br : to ^bb4 + ^bb8: // pred: ^bb2 + neura.br : to ^bb1 + } +} + diff --git a/test/affine2neura/gpt2-node27/compile.sh b/test/affine2neura/gpt2-node27/compile.sh index e1c6c965..bc268f1a 100755 --- a/test/affine2neura/gpt2-node27/compile.sh +++ b/test/affine2neura/gpt2-node27/compile.sh @@ -1,3 +1,3 @@ -/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./node27_unroll.cpp -S --raise-scf-to-affine -o ./node27.mlir +/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./node27.cpp -S --raise-scf-to-affine -o ./node27.mlir /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27.mlir --affine-loop-unroll="unroll-factor=2" -o ./node27_unroll.mlir # /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node27_tile.mlir \ No newline at end of file diff --git a/test/affine2neura/gpt2-node27/node27.cpp b/test/affine2neura/gpt2-node27/node27.cpp index 3bcf72c2..456aaabd 100644 --- a/test/affine2neura/gpt2-node27/node27.cpp +++ b/test/affine2neura/gpt2-node27/node27.cpp @@ -1,7 +1,7 @@ float input[1][16][4][16]; float output[1][4][16][16]; -int main() { +int node27() { for (int arg2 = 0; arg2 < 1; arg2++) { for (int arg3 = 0; arg3 < 16; arg3++) { for (int arg4 = 0; arg4 < 4; arg4 += 1) { diff --git a/test/affine2neura/gpt2-node27/node27.mlir b/test/affine2neura/gpt2-node27/node27.mlir new file mode 100644 index 00000000..3bc78ff5 --- /dev/null +++ b/test/affine2neura/gpt2-node27/node27.mlir @@ -0,0 +1,30 @@ +// Check that the affine loop nest is correctly transformed to neura.loop_control +// RUN: mlir-neura-opt %s --assign-accelerator --lower-affine-to-neura | FileCheck %s +module attributes {} { + memref.global @input : memref<1x16x4x16xf32> = uninitialized 
+ memref.global @output : memref<1x4x16x16xf32> = uninitialized + func.func @_Z6node27v() -> i32 attributes {llvm.linkage = #llvm.linkage} { + %0 = llvm.mlir.undef : i32 + %1 = memref.get_global @output : memref<1x4x16x16xf32> + %2 = memref.get_global @input : memref<1x16x4x16xf32> + affine.for %arg0 = 0 to 16 { + affine.for %arg1 = 0 to 4 { + affine.for %arg2 = 0 to 16 { + %3 = affine.load %2[0, %arg1, %arg0, %arg2] : memref<1x16x4x16xf32> + affine.store %3, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x16xf32> + } + } + } + return %0 : i32 + } +} +// Verify function signature is preserved +// CHECK-LABEL: func.func @_Z6node27v() -> i32 + +// Verify all affine operations are eliminated +// CHECK-NOT: affine.for +// CHECK-NOT: affine.load +// CHECK-NOT: affine.store +// CHECK-NOT: affine.apply + +// CHECK-COUNT-3: neura.loop_control diff --git a/test/affine2neura/gpt2-node27/node27_neura.mlir b/test/affine2neura/gpt2-node27/node27_neura.mlir new file mode 100644 index 00000000..8680f78c --- /dev/null +++ b/test/affine2neura/gpt2-node27/node27_neura.mlir @@ -0,0 +1,48 @@ +module { + memref.global @input : memref<1x16x4x16xf32> = uninitialized + memref.global @output : memref<1x4x16x16xf32> = uninitialized + func.func @_Z6node27v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} { + %0 = llvm.mlir.undef : i32 + %1 = memref.get_global @output : memref<1x4x16x16xf32> + %2 = memref.get_global @input : memref<1x16x4x16xf32> + %3 = neura.constant {value = 0 : index} : index + %4 = neura.constant {value = 16 : index} : index + %5 = neura.constant {value = 1 : index} : index + neura.br %3 : index to ^bb2 + ^bb1: // pred: ^bb12 + return %0 : i32 + ^bb2(%6: index): // 2 preds: ^bb0, ^bb4 + neura.loop_control current_index : %6, step : %5, bound : %4, loop_type : "lt" then ^bb3(%6 : index) else ^bb12 + ^bb3(%7: index): // pred: ^bb2 + %8 = neura.constant {value = 0 : index} : index + %9 = neura.constant {value = 4 : index} : index + %10 = neura.constant {value = 1 : index} : index + neura.br %8 : index to ^bb5 + ^bb4: // pred: ^bb11 + neura.br %7 : index to ^bb2 + ^bb5(%11: index): // 2 preds: ^bb3, ^bb7 + neura.loop_control current_index : %11, step : %10, bound : %9, loop_type : "lt" then ^bb6(%11 : index) else ^bb11 + ^bb6(%12: index): // pred: ^bb5 + %13 = neura.constant {value = 0 : index} : index + %14 = neura.constant {value = 16 : index} : index + %15 = neura.constant {value = 1 : index} : index + neura.br %13 : index to ^bb8 + ^bb7: // pred: ^bb10 + neura.br %12 : index to ^bb5 + ^bb8(%16: index): // 2 preds: ^bb6, ^bb9 + neura.loop_control current_index : %16, step : %15, bound : %14, loop_type : "lt" then ^bb9(%16 : index) else ^bb10 + ^bb9(%17: index): // pred: ^bb8 + %18 = neura.constant {value = 0 : index} : index + %19 = neura.load_indexed memref<1x16x4x16xf32> %2[%18, %12, %7, %17] : f32 + %20 = neura.constant {value = 0 : index} : index + neura.store_indexed %19 to memref<1x4x16x16xf32> %1[%20, %7, %12, %17] : f32 + neura.br %17 : index to ^bb8 + ^bb10: // pred: ^bb8 + neura.br : to ^bb7 + ^bb11: // pred: ^bb5 + neura.br : to ^bb4 + ^bb12: // pred: ^bb2 + neura.br : to ^bb1 + } +} + diff --git a/test/affine2neura/gpt2-node27/node27_unroll.mlir b/test/affine2neura/gpt2-node27/node27_unroll.mlir new file mode 100644 index 00000000..7708b308 --- /dev/null +++ b/test/affine2neura/gpt2-node27/node27_unroll.mlir @@ -0,0 +1,23 @@ +#map = affine_map<(d0) -> (d0 + 1)> +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : 
vector<2xi32>>, #dlti.dl_entry : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry, dense<64> : vector<4xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i32>, #dlti.dl_entry<"dlti.endianness", "little">>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} {
+  memref.global @input : memref<1x16x4x16xf32> = uninitialized
+  memref.global @output : memref<1x4x16x16xf32> = uninitialized
+  func.func @_Z6node27v() -> i32 attributes {llvm.linkage = #llvm.linkage} {
+    %0 = llvm.mlir.undef : i32
+    %1 = memref.get_global @output : memref<1x4x16x16xf32>
+    %2 = memref.get_global @input : memref<1x16x4x16xf32>
+    affine.for %arg0 = 0 to 16 {
+      affine.for %arg1 = 0 to 4 {
+        affine.for %arg2 = 0 to 16 step 2 {
+          %3 = affine.load %2[0, %arg1, %arg0, %arg2] : memref<1x16x4x16xf32>
+          affine.store %3, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x16xf32>
+          %4 = affine.apply #map(%arg2)
+          %5 = affine.load %2[0, %arg1, %arg0, %4] : memref<1x16x4x16xf32>
+          affine.store %5, %1[0, %arg0, %arg1, %4] : memref<1x4x16x16xf32>
+        }
+      }
+    }
+    return %0 : i32
+  }
+}
+
diff --git a/test/affine2neura/gpt2-node30/node30.cpp b/test/affine2neura/gpt2-node30/node30.cpp
index 01177f33..596450f8 100644
--- a/test/affine2neura/gpt2-node30/node30.cpp
+++ b/test/affine2neura/gpt2-node30/node30.cpp
@@ -2,7 +2,7 @@
 float A[1][4][16][64];
 // float B=20.0;
 float C[1][4][16][64];
-int main() {
+int node30() {
   for (int arg2 = 0; arg2 < 1; arg2++) {
     for (int arg3 = 0; arg3 < 4; arg3++) {
       for (int arg4 = 0; arg4 < 16; arg4++) {
diff --git a/test/affine2neura/gpt2-node30/node30.mlir b/test/affine2neura/gpt2-node30/node30.mlir
new file mode 100644
index 00000000..9d3b77d0
--- /dev/null
+++ b/test/affine2neura/gpt2-node30/node30.mlir
@@ -0,0 +1,33 @@
+// Check that the affine loop nest is correctly transformed to neura.loop_control
+// RUN: mlir-neura-opt %s --assign-accelerator --lower-affine-to-neura | FileCheck %s
+module attributes {} {
+  memref.global @A : memref<1x4x16x64xf32> = uninitialized
+  memref.global @C : memref<1x4x16x64xf32> = uninitialized
+  func.func @_Z6node30v() -> i32 attributes {llvm.linkage = #llvm.linkage} {
+    %cst = arith.constant 1.000000e+01 : f32
+    %0 = llvm.mlir.undef : i32
+    %1 = memref.get_global @C : memref<1x4x16x64xf32>
+    %2 = memref.get_global @A : memref<1x4x16x64xf32>
+    affine.for %arg0 = 0 to 4 {
+      affine.for %arg1 = 0 to 16 {
+        affine.for %arg2 = 0 to 64 {
+          %3 = affine.load %2[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
+          %4 = arith.mulf %3, %cst : f32
+          affine.store %4, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
+        }
+      }
+    }
+    return %0 : i32
+  }
+}
+
+// Verify function signature is preserved
+// CHECK-LABEL: func.func @_Z6node30v() -> i32
+
+// Verify all affine operations are eliminated
+// CHECK-NOT: affine.for
+// CHECK-NOT: affine.load
+// CHECK-NOT: affine.store
+// CHECK-NOT: affine.apply
+
+// CHECK-COUNT-3: neura.loop_control
diff --git a/test/affine2neura/gpt2-node30/node30_neura.mlir b/test/affine2neura/gpt2-node30/node30_neura.mlir
new file mode 100644
index 00000000..7a3c641d
--- /dev/null
+++ b/test/affine2neura/gpt2-node30/node30_neura.mlir
@@ -0,0 +1,50 @@
+module {
+  memref.global @A : memref<1x4x16x64xf32> = uninitialized
+  memref.global @C : memref<1x4x16x64xf32> = uninitialized
+  func.func @_Z6node30v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} {
+    %cst = arith.constant 1.000000e+01 : f32
+    %0 = llvm.mlir.undef : i32
+    %1 = memref.get_global @C : memref<1x4x16x64xf32>
+    %2 = memref.get_global @A : memref<1x4x16x64xf32>
+    %3 = neura.constant {value = 0 : index} : index
+    %4 = neura.constant {value = 4 : index} : index
+    %5 = neura.constant {value = 1 : index} : index
+    neura.br %3 : index to ^bb2
+  ^bb1:  // pred: ^bb12
+    return %0 : i32
+  ^bb2(%6: index):  // 2 preds: ^bb0, ^bb4
+    neura.loop_control current_index : %6, step : %5, bound : %4, loop_type : "lt" then ^bb3(%6 : index) else ^bb12
+  ^bb3(%7: index):  // pred: ^bb2
+    %8 = neura.constant {value = 0 : index} : index
+    %9 = neura.constant {value = 16 : index} : index
+    %10 = neura.constant {value = 1 : index} : index
+    neura.br %8 : index to ^bb5
+  ^bb4:  // pred: ^bb11
+    neura.br %7 : index to ^bb2
+  ^bb5(%11: index):  // 2 preds: ^bb3, ^bb7
+    neura.loop_control current_index : %11, step : %10, bound : %9, loop_type : "lt" then ^bb6(%11 : index) else ^bb11
+  ^bb6(%12: index):  // pred: ^bb5
+    %13 = neura.constant {value = 0 : index} : index
+    %14 = neura.constant {value = 64 : index} : index
+    %15 = neura.constant {value = 1 : index} : index
+    neura.br %13 : index to ^bb8
+  ^bb7:  // pred: ^bb10
+    neura.br %12 : index to ^bb5
+  ^bb8(%16: index):  // 2 preds: ^bb6, ^bb9
+    neura.loop_control current_index : %16, step : %15, bound : %14, loop_type : "lt" then ^bb9(%16 : index) else ^bb10
+  ^bb9(%17: index):  // pred: ^bb8
+    %18 = neura.constant {value = 0 : index} : index
+    %19 = neura.load_indexed memref<1x4x16x64xf32> %2[%18, %7, %12, %17] : f32
+    %20 = arith.mulf %19, %cst : f32
+    %21 = neura.constant {value = 0 : index} : index
+    neura.store_indexed %20 to memref<1x4x16x64xf32> %1[%21, %7, %12, %17] : f32
+    neura.br %17 : index to ^bb8
+  ^bb10:  // pred: ^bb8
+    neura.br : to ^bb7
+  ^bb11:  // pred: ^bb5
+    neura.br : to ^bb4
+  ^bb12:  // pred: ^bb2
+    neura.br : to ^bb1
+  }
+}
+
diff --git a/test/affine2neura/gpt2-node30/node30_unroll.mlir b/test/affine2neura/gpt2-node30/node30_unroll.mlir
new file mode 100644
index 00000000..e55fe54e
--- /dev/null
+++ b/test/affine2neura/gpt2-node30/node30_unroll.mlir
@@ -0,0 +1,26 @@
+#map = affine_map<(d0) -> (d0 + 1)>
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<64> : vector<4xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i32>, #dlti.dl_entry<"dlti.endianness", "little">>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} {
+  memref.global @A : memref<1x4x16x64xf32> = uninitialized
+  memref.global @C : memref<1x4x16x64xf32> = uninitialized
+  func.func @_Z6node30v() -> i32 attributes {llvm.linkage = #llvm.linkage} {
+    %cst = arith.constant 1.000000e+01 : f32
+    %0 = llvm.mlir.undef : i32
+    %1 = memref.get_global @C : memref<1x4x16x64xf32>
+    %2 = memref.get_global @A : memref<1x4x16x64xf32>
+    affine.for %arg0 = 0 to 4 {
+      affine.for %arg1 = 0 to 16 {
+        affine.for %arg2 = 0 to 64 step 2 {
+          %3 = affine.load %2[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
+          %4 = arith.mulf %3, %cst : f32
+          affine.store %4, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
+          %5 = affine.apply #map(%arg2)
+          %6 = affine.load %2[0, %arg0, %arg1, %5] : memref<1x4x16x64xf32>
+          %7 = arith.mulf %6, %cst : f32
+          affine.store %7, %1[0, %arg0, %arg1, %5] : memref<1x4x16x64xf32>
+        }
+      }
+    }
+    return %0 : i32
+  }
+}
+
diff --git a/test/affine2neura/simpleloop/compile.sh b/test/affine2neura/simpleloop/compile.sh
deleted file mode 100755
index f19caf0e..00000000
--- a/test/affine2neura/simpleloop/compile.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/cgeist ./simple.cpp -S --raise-scf-to-affine -o ./simple.mlir
-/home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./simple.mlir --affine-loop-unroll="unroll-factor=2" -o ./simple_unroll.mlir
-# /home/lucas/Project/NeuraCompiler/thirdparty/Polygeist/build/bin/polygeist-opt ./node27_unroll.mlir --affine-loop-tile="tile-size=2" -o ./node27_tile.mlir
\ No newline at end of file
diff --git a/test/affine2neura/simpleloop/simple.cpp b/test/affine2neura/simpleloop/simple.cpp
deleted file mode 100644
index 6078f497..00000000
--- a/test/affine2neura/simpleloop/simple.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-float A[100];
-float C[100];
-
-int main() {
-  const int size = 100;
-  for (int i = 0; i < size; ++i) {
-    float loaded_value = A[i]; // Instruction 1: Load value from A
-    float multiplied_value = loaded_value * 10.0f; // Instruction 2: Multiply the value
-    C[i] = multiplied_value; // Instruction 3: Store result into C
-  }
-  return 0;
-}

From 6594f4a4723eb6c5f70e5df5dd1827984b61fc5a Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Tue, 17 Jun 2025 14:56:41 +0800
Subject: [PATCH 12/13] [fix] solve conflicts with main

---
 include/NeuraDialect/NeuraOps.td | 2 +-
 test/.lit_test_times.txt         | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/NeuraDialect/NeuraOps.td b/include/NeuraDialect/NeuraOps.td
index dc76c021..48dd0b54 100644
--- a/include/NeuraDialect/NeuraOps.td
+++ b/include/NeuraDialect/NeuraOps.td
@@ -11,7 +11,7 @@ def Neura_ConstantOp : Op {
     OptionalAttr:$predicate // Add optional predicate attribute
   );
   let results = (outs AnyType:$result);
-  let assemblyFormat = "attr-dict `:` type($result)";
+  // let assemblyFormat = "attr-dict `:` type($result)";
 }
 
 // Defines an addition operation.
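Note: with the declarative assemblyFormat commented out above, neura.constant is presumably printed and parsed by a custom printer defined elsewhere on main (this hunk only resolves the merge conflict; the .td file does not show the replacement). The textual form the tests in this series rely on is unchanged, e.g. this representative line from the lowered output:

    %3 = neura.constant {value = 0 : index} : index

i.e. the attribute dictionary carrying the constant value, followed by the result type, which is exactly what the old format string produced.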
diff --git a/test/.lit_test_times.txt b/test/.lit_test_times.txt
index 961067b1..9a9da3f0 100644
--- a/test/.lit_test_times.txt
+++ b/test/.lit_test_times.txt
@@ -1,4 +1,7 @@
 7.853746e-03 affine2neura/gpt2-node27/node27.mlir
-1.136017e-02 affine2neura/deep-nested/deep_nested.mlir
-7.997274e-03 affine2neura/gpt2-node11/node11.mlir
+9.844303e-03 affine2neura/deep-nested/deep_nested.mlir
+6.515980e-03 affine2neura/gpt2-node11/node11.mlir
 7.548809e-03 affine2neura/gpt2-node30/node30.mlir
+9.920120e-03 neura/ctrl/branch.mlir
+1.126695e-02 neura/ctrl/branch_no_arg.mlir
+9.951830e-03 neura/ctrl/branch_for.mlir

From 9d4ee2553974131824cc5678b5758e6d797a3635 Mon Sep 17 00:00:00 2001
From: ShangkunLI
Date: Tue, 17 Jun 2025 16:37:12 +0800
Subject: [PATCH 13/13] [test] add test

---
 test/.lit_test_times.txt                      |  33 ++++-
 .../deep-nested/deep_nested_neura.mlir        | 125 ------------------
 .../gpt2-node11/node11_neura.mlir             |  40 ------
 .../gpt2-node27/node27_neura.mlir             |  48 -------
 .../gpt2-node27/node27_unroll.mlir            |  23 ----
 .../gpt2-node30/node30_neura.mlir             |  50 -------
 .../gpt2-node30/node30_unroll.mlir            |  26 ----
 7 files changed, 26 insertions(+), 319 deletions(-)
 delete mode 100644 test/affine2neura/deep-nested/deep_nested_neura.mlir
 delete mode 100644 test/affine2neura/gpt2-node11/node11_neura.mlir
 delete mode 100644 test/affine2neura/gpt2-node27/node27_neura.mlir
 delete mode 100644 test/affine2neura/gpt2-node27/node27_unroll.mlir
 delete mode 100644 test/affine2neura/gpt2-node30/node30_neura.mlir
 delete mode 100644 test/affine2neura/gpt2-node30/node30_unroll.mlir

diff --git a/test/.lit_test_times.txt b/test/.lit_test_times.txt
index 9a9da3f0..c0405ae5 100644
--- a/test/.lit_test_times.txt
+++ b/test/.lit_test_times.txt
@@ -1,7 +1,26 @@
-7.853746e-03 affine2neura/gpt2-node27/node27.mlir
-9.844303e-03 affine2neura/deep-nested/deep_nested.mlir
-6.515980e-03 affine2neura/gpt2-node11/node11.mlir
-7.548809e-03 affine2neura/gpt2-node30/node30.mlir
-9.920120e-03 neura/ctrl/branch.mlir
-1.126695e-02 neura/ctrl/branch_no_arg.mlir
-9.951830e-03 neura/ctrl/branch_for.mlir
+2.734089e-02 affine2neura/gpt2-node27/node27.mlir
+1.068902e-02 affine2neura/deep-nested/deep_nested.mlir
+2.698708e-02 affine2neura/gpt2-node11/node11.mlir
+2.851033e-02 affine2neura/gpt2-node30/node30.mlir
+3.188467e-02 neura/ctrl/branch.mlir
+2.987862e-02 neura/ctrl/branch_no_arg.mlir
+1.032019e-02 neura/ctrl/branch_for.mlir
+-5.869865e-04 affine2neura/deep-nested/deep_nested_neura.mlir
+-5.869865e-04 affine2neura/gpt2-node11/node11_neura.mlir
+-3.650188e-04 affine2neura/gpt2-node27/node27_neura.mlir
+-6.232262e-04 affine2neura/gpt2-node27/node27_unroll.mlir
+-3.950596e-04 affine2neura/gpt2-node30/node30_neura.mlir
+-4.494190e-04 affine2neura/gpt2-node30/node30_unroll.mlir
+6.077766e-03 arith2neura/add.mlir
+2.200377e-01 c2llvm2mlir/test.mlir
+5.845070e-03 neura/arith_add.mlir
+5.631447e-03 neura/fadd_fadd.mlir
+9.507132e-02 neura/for_loop/test.mlir
+2.653909e-02 neura/interpreter/add.mlir
+5.800486e-03 neura/interpreter/interpreter.mlir
+1.472716e-01 neura/interpreter/lower_and_interpret.mlir
+1.522479e-01 neura/interpreter/lower_and_interpret_subf.mlir
+2.643609e-02 neura/interpreter/predicated_data.mlir
+2.739096e-02 neura/llvm_add.mlir
+2.676344e-02 neura/llvm_sub.mlir
+2.563691e-02 test.mlir
diff --git a/test/affine2neura/deep-nested/deep_nested_neura.mlir b/test/affine2neura/deep-nested/deep_nested_neura.mlir
deleted file mode 100644
index 368b6dc5..00000000
--- a/test/affine2neura/deep-nested/deep_nested_neura.mlir
+++ /dev/null
@@ -1,125 +0,0 @@
-module {
-  memref.global @input_data : memref<3x3x3xi32> = uninitialized
-  memref.global @output_data : memref<3x3x3xi32> = uninitialized
-  func.func @_Z11deep_nestedv() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = memref.get_global @output_data : memref<3x3x3xi32>
-    %1 = memref.get_global @input_data : memref<3x3x3xi32>
-    %2 = neura.constant {value = 0 : index} : index
-    %3 = neura.constant {value = 3 : index} : index
-    %4 = neura.constant {value = 1 : index} : index
-    neura.br %2 : index to ^bb2
-  ^bb1:  // pred: ^bb40
-    return %c0_i32 : i32
-  ^bb2(%5: index):  // 2 preds: ^bb0, ^bb4
-    neura.loop_control current_index : %5, step : %4, bound : %3, loop_type : "lt" then ^bb3(%5 : index) else ^bb40
-  ^bb3(%6: index):  // pred: ^bb2
-    %7 = neura.constant {value = 0 : index} : index
-    %8 = neura.constant {value = 3 : index} : index
-    %9 = neura.constant {value = 1 : index} : index
-    neura.br %7 : index to ^bb5
-  ^bb4:  // pred: ^bb39
-    neura.br %6 : index to ^bb2
-  ^bb5(%10: index):  // 2 preds: ^bb3, ^bb7
-    neura.loop_control current_index : %10, step : %9, bound : %8, loop_type : "lt" then ^bb6(%10 : index) else ^bb39
-  ^bb6(%11: index):  // pred: ^bb5
-    %12 = neura.constant {value = 0 : index} : index
-    %13 = neura.constant {value = 3 : index} : index
-    %14 = neura.constant {value = 1 : index} : index
-    neura.br %12 : index to ^bb8
-  ^bb7:  // pred: ^bb38
-    neura.br %11 : index to ^bb5
-  ^bb8(%15: index):  // 2 preds: ^bb6, ^bb10
-    neura.loop_control current_index : %15, step : %14, bound : %13, loop_type : "lt" then ^bb9(%15 : index) else ^bb38
-  ^bb9(%16: index):  // pred: ^bb8
-    %17 = neura.constant {value = 0 : index} : index
-    %18 = neura.constant {value = 3 : index} : index
-    %19 = neura.constant {value = 1 : index} : index
-    neura.br %17 : index to ^bb11
-  ^bb10:  // pred: ^bb37
-    neura.br %16 : index to ^bb8
-  ^bb11(%20: index):  // 2 preds: ^bb9, ^bb13
-    neura.loop_control current_index : %20, step : %19, bound : %18, loop_type : "lt" then ^bb12(%20 : index) else ^bb37
-  ^bb12(%21: index):  // pred: ^bb11
-    %22 = neura.constant {value = 0 : index} : index
-    %23 = neura.constant {value = 3 : index} : index
-    %24 = neura.constant {value = 1 : index} : index
-    neura.br %22 : index to ^bb14
-  ^bb13:  // pred: ^bb36
-    neura.br %21 : index to ^bb11
-  ^bb14(%25: index):  // 2 preds: ^bb12, ^bb16
-    neura.loop_control current_index : %25, step : %24, bound : %23, loop_type : "lt" then ^bb15(%25 : index) else ^bb36
-  ^bb15(%26: index):  // pred: ^bb14
-    %27 = neura.constant {value = 0 : index} : index
-    %28 = neura.constant {value = 3 : index} : index
-    %29 = neura.constant {value = 1 : index} : index
-    neura.br %27 : index to ^bb17
-  ^bb16:  // pred: ^bb35
-    neura.br %26 : index to ^bb14
-  ^bb17(%30: index):  // 2 preds: ^bb15, ^bb19
-    neura.loop_control current_index : %30, step : %29, bound : %28, loop_type : "lt" then ^bb18(%30 : index) else ^bb35
-  ^bb18(%31: index):  // pred: ^bb17
-    %32 = neura.constant {value = 0 : index} : index
-    %33 = neura.constant {value = 3 : index} : index
-    %34 = neura.constant {value = 1 : index} : index
-    neura.br %32 : index to ^bb20
-  ^bb19:  // pred: ^bb34
-    neura.br %31 : index to ^bb17
-  ^bb20(%35: index):  // 2 preds: ^bb18, ^bb22
-    neura.loop_control current_index : %35, step : %34, bound : %33, loop_type : "lt" then ^bb21(%35 : index) else ^bb34
-  ^bb21(%36: index):  // pred: ^bb20
-    %37 = neura.constant {value = 0 : index} : index
-    %38 = neura.constant {value = 3 : index} : index
-    %39 = neura.constant {value = 1 : index} : index
-    neura.br %37 : index to ^bb23
-  ^bb22:  // pred: ^bb33
-    neura.br %36 : index to ^bb20
-  ^bb23(%40: index):  // 2 preds: ^bb21, ^bb25
-    neura.loop_control current_index : %40, step : %39, bound : %38, loop_type : "lt" then ^bb24(%40 : index) else ^bb33
-  ^bb24(%41: index):  // pred: ^bb23
-    %42 = neura.load_indexed memref<3x3x3xi32> %1[%6, %11, %16] : i32
-    %43 = neura.constant {value = 0 : index} : index
-    %44 = neura.constant {value = 3 : index} : index
-    %45 = neura.constant {value = 1 : index} : index
-    neura.br %43 : index to ^bb26
-  ^bb25:  // pred: ^bb32
-    neura.br %41 : index to ^bb23
-  ^bb26(%46: index):  // 2 preds: ^bb24, ^bb28
-    neura.loop_control current_index : %46, step : %45, bound : %44, loop_type : "lt" then ^bb27(%46 : index) else ^bb32
-  ^bb27(%47: index):  // pred: ^bb26
-    %48 = neura.constant {value = 0 : index} : index
-    %49 = neura.constant {value = 3 : index} : index
-    %50 = neura.constant {value = 1 : index} : index
-    neura.br %48 : index to ^bb29
-  ^bb28:  // pred: ^bb31
-    neura.br %47 : index to ^bb26
-  ^bb29(%51: index):  // 2 preds: ^bb27, ^bb30
-    neura.loop_control current_index : %51, step : %50, bound : %49, loop_type : "lt" then ^bb30(%51 : index) else ^bb31
-  ^bb30(%52: index):  // pred: ^bb29
-    %53 = neura.load_indexed memref<3x3x3xi32> %0[%6, %11, %16] : i32
-    %54 = arith.addi %53, %42 : i32
-    neura.store_indexed %54 to memref<3x3x3xi32> %0[%6, %11, %16] : i32
-    neura.br %52 : index to ^bb29
-  ^bb31:  // pred: ^bb29
-    neura.br : to ^bb28
-  ^bb32:  // pred: ^bb26
-    neura.br : to ^bb25
-  ^bb33:  // pred: ^bb23
-    neura.br : to ^bb22
-  ^bb34:  // pred: ^bb20
-    neura.br : to ^bb19
-  ^bb35:  // pred: ^bb17
-    neura.br : to ^bb16
-  ^bb36:  // pred: ^bb14
-    neura.br : to ^bb13
-  ^bb37:  // pred: ^bb11
-    neura.br : to ^bb10
-  ^bb38:  // pred: ^bb8
-    neura.br : to ^bb7
-  ^bb39:  // pred: ^bb5
-    neura.br : to ^bb4
-  ^bb40:  // pred: ^bb2
-    neura.br : to ^bb1
-  }
-}
-
diff --git a/test/affine2neura/gpt2-node11/node11_neura.mlir b/test/affine2neura/gpt2-node11/node11_neura.mlir
deleted file mode 100644
index ccc214c7..00000000
--- a/test/affine2neura/gpt2-node11/node11_neura.mlir
+++ /dev/null
@@ -1,40 +0,0 @@
-module {
-  memref.global @input : memref<1x16x64xf32> = uninitialized
-  memref.global @output : memref<1x16xf32> = uninitialized
-  func.func @_Z6node11v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} {
-    %c0_i32 = arith.constant 0 : i32
-    %0 = memref.get_global @output : memref<1x16xf32>
-    %1 = memref.get_global @input : memref<1x16x64xf32>
-    %2 = neura.constant {value = 0 : index} : index
-    %3 = neura.constant {value = 16 : index} : index
-    %4 = neura.constant {value = 1 : index} : index
-    neura.br %2 : index to ^bb2
-  ^bb1:  // pred: ^bb8
-    return %c0_i32 : i32
-  ^bb2(%5: index):  // 2 preds: ^bb0, ^bb4
-    neura.loop_control current_index : %5, step : %4, bound : %3, loop_type : "lt" then ^bb3(%5 : index) else ^bb8
-  ^bb3(%6: index):  // pred: ^bb2
-    %7 = neura.constant {value = 0 : index} : index
-    %8 = neura.constant {value = 64 : index} : index
-    %9 = neura.constant {value = 1 : index} : index
-    neura.br %7 : index to ^bb5
-  ^bb4:  // pred: ^bb7
-    neura.br %6 : index to ^bb2
-  ^bb5(%10: index):  // 2 preds: ^bb3, ^bb6
-    neura.loop_control current_index : %10, step : %9, bound : %8, loop_type : "lt" then ^bb6(%10 : index) else ^bb7
-  ^bb6(%11: index):  // pred: ^bb5
-    %12 = neura.constant {value = 0 : index} : index
-    %13 = neura.load_indexed memref<1x16x64xf32> %1[%12, %6, %11] : f32
-    %14 = neura.constant {value = 0 : index} : index
-    %15 = neura.load_indexed memref<1x16xf32> %0[%14, %6] : f32
-    %16 = arith.addf %15, %13 : f32
-    %17 = neura.constant {value = 0 : index} : index
-    neura.store_indexed %16 to memref<1x16xf32> %0[%17, %6] : f32
-    neura.br %11 : index to ^bb5
-  ^bb7:  // pred: ^bb5
-    neura.br : to ^bb4
-  ^bb8:  // pred: ^bb2
-    neura.br : to ^bb1
-  }
-}
-
diff --git a/test/affine2neura/gpt2-node27/node27_neura.mlir b/test/affine2neura/gpt2-node27/node27_neura.mlir
deleted file mode 100644
index 8680f78c..00000000
--- a/test/affine2neura/gpt2-node27/node27_neura.mlir
+++ /dev/null
@@ -1,48 +0,0 @@
-module {
-  memref.global @input : memref<1x16x4x16xf32> = uninitialized
-  memref.global @output : memref<1x4x16x16xf32> = uninitialized
-  func.func @_Z6node27v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} {
-    %0 = llvm.mlir.undef : i32
-    %1 = memref.get_global @output : memref<1x4x16x16xf32>
-    %2 = memref.get_global @input : memref<1x16x4x16xf32>
-    %3 = neura.constant {value = 0 : index} : index
-    %4 = neura.constant {value = 16 : index} : index
-    %5 = neura.constant {value = 1 : index} : index
-    neura.br %3 : index to ^bb2
-  ^bb1:  // pred: ^bb12
-    return %0 : i32
-  ^bb2(%6: index):  // 2 preds: ^bb0, ^bb4
-    neura.loop_control current_index : %6, step : %5, bound : %4, loop_type : "lt" then ^bb3(%6 : index) else ^bb12
-  ^bb3(%7: index):  // pred: ^bb2
-    %8 = neura.constant {value = 0 : index} : index
-    %9 = neura.constant {value = 4 : index} : index
-    %10 = neura.constant {value = 1 : index} : index
-    neura.br %8 : index to ^bb5
-  ^bb4:  // pred: ^bb11
-    neura.br %7 : index to ^bb2
-  ^bb5(%11: index):  // 2 preds: ^bb3, ^bb7
-    neura.loop_control current_index : %11, step : %10, bound : %9, loop_type : "lt" then ^bb6(%11 : index) else ^bb11
-  ^bb6(%12: index):  // pred: ^bb5
-    %13 = neura.constant {value = 0 : index} : index
-    %14 = neura.constant {value = 16 : index} : index
-    %15 = neura.constant {value = 1 : index} : index
-    neura.br %13 : index to ^bb8
-  ^bb7:  // pred: ^bb10
-    neura.br %12 : index to ^bb5
-  ^bb8(%16: index):  // 2 preds: ^bb6, ^bb9
-    neura.loop_control current_index : %16, step : %15, bound : %14, loop_type : "lt" then ^bb9(%16 : index) else ^bb10
-  ^bb9(%17: index):  // pred: ^bb8
-    %18 = neura.constant {value = 0 : index} : index
-    %19 = neura.load_indexed memref<1x16x4x16xf32> %2[%18, %12, %7, %17] : f32
-    %20 = neura.constant {value = 0 : index} : index
-    neura.store_indexed %19 to memref<1x4x16x16xf32> %1[%20, %7, %12, %17] : f32
-    neura.br %17 : index to ^bb8
-  ^bb10:  // pred: ^bb8
-    neura.br : to ^bb7
-  ^bb11:  // pred: ^bb5
-    neura.br : to ^bb4
-  ^bb12:  // pred: ^bb2
-    neura.br : to ^bb1
-  }
-}
-
diff --git a/test/affine2neura/gpt2-node27/node27_unroll.mlir b/test/affine2neura/gpt2-node27/node27_unroll.mlir
deleted file mode 100644
index 7708b308..00000000
--- a/test/affine2neura/gpt2-node27/node27_unroll.mlir
+++ /dev/null
@@ -1,23 +0,0 @@
-#map = affine_map<(d0) -> (d0 + 1)>
-module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry, dense<64> : vector<4xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i32>, #dlti.dl_entry<"dlti.endianness", "little">>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} {
-  memref.global @input : memref<1x16x4x16xf32> = uninitialized
-  memref.global @output : memref<1x4x16x16xf32> = uninitialized
-  func.func @_Z6node27v() -> i32 attributes {llvm.linkage = #llvm.linkage} {
-    %0 = llvm.mlir.undef : i32
-    %1 = memref.get_global @output : memref<1x4x16x16xf32>
-    %2 = memref.get_global @input : memref<1x16x4x16xf32>
-    affine.for %arg0 = 0 to 16 {
-      affine.for %arg1 = 0 to 4 {
-        affine.for %arg2 = 0 to 16 step 2 {
-          %3 = affine.load %2[0, %arg1, %arg0, %arg2] : memref<1x16x4x16xf32>
-          affine.store %3, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x16xf32>
-          %4 = affine.apply #map(%arg2)
-          %5 = affine.load %2[0, %arg1, %arg0, %4] : memref<1x16x4x16xf32>
-          affine.store %5, %1[0, %arg0, %arg1, %4] : memref<1x4x16x16xf32>
-        }
-      }
-    }
-    return %0 : i32
-  }
-}
-
diff --git a/test/affine2neura/gpt2-node30/node30_neura.mlir b/test/affine2neura/gpt2-node30/node30_neura.mlir
deleted file mode 100644
index 7a3c641d..00000000
--- a/test/affine2neura/gpt2-node30/node30_neura.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-module {
-  memref.global @A : memref<1x4x16x64xf32> = uninitialized
-  memref.global @C : memref<1x4x16x64xf32> = uninitialized
-  func.func @_Z6node30v() -> i32 attributes {accelerator = "neura", llvm.linkage = #llvm.linkage} {
-    %cst = arith.constant 1.000000e+01 : f32
-    %0 = llvm.mlir.undef : i32
-    %1 = memref.get_global @C : memref<1x4x16x64xf32>
-    %2 = memref.get_global @A : memref<1x4x16x64xf32>
-    %3 = neura.constant {value = 0 : index} : index
-    %4 = neura.constant {value = 4 : index} : index
-    %5 = neura.constant {value = 1 : index} : index
-    neura.br %3 : index to ^bb2
-  ^bb1:  // pred: ^bb12
-    return %0 : i32
-  ^bb2(%6: index):  // 2 preds: ^bb0, ^bb4
-    neura.loop_control current_index : %6, step : %5, bound : %4, loop_type : "lt" then ^bb3(%6 : index) else ^bb12
-  ^bb3(%7: index):  // pred: ^bb2
-    %8 = neura.constant {value = 0 : index} : index
-    %9 = neura.constant {value = 16 : index} : index
-    %10 = neura.constant {value = 1 : index} : index
-    neura.br %8 : index to ^bb5
-  ^bb4:  // pred: ^bb11
-    neura.br %7 : index to ^bb2
-  ^bb5(%11: index):  // 2 preds: ^bb3, ^bb7
-    neura.loop_control current_index : %11, step : %10, bound : %9, loop_type : "lt" then ^bb6(%11 : index) else ^bb11
-  ^bb6(%12: index):  // pred: ^bb5
-    %13 = neura.constant {value = 0 : index} : index
-    %14 = neura.constant {value = 64 : index} : index
-    %15 = neura.constant {value = 1 : index} : index
-    neura.br %13 : index to ^bb8
-  ^bb7:  // pred: ^bb10
-    neura.br %12 : index to ^bb5
-  ^bb8(%16: index):  // 2 preds: ^bb6, ^bb9
-    neura.loop_control current_index : %16, step : %15, bound : %14, loop_type : "lt" then ^bb9(%16 : index) else ^bb10
-  ^bb9(%17: index):  // pred: ^bb8
-    %18 = neura.constant {value = 0 : index} : index
-    %19 = neura.load_indexed memref<1x4x16x64xf32> %2[%18, %7, %12, %17] : f32
-    %20 = arith.mulf %19, %cst : f32
-    %21 = neura.constant {value = 0 : index} : index
-    neura.store_indexed %20 to memref<1x4x16x64xf32> %1[%21, %7, %12, %17] : f32
-    neura.br %17 : index to ^bb8
-  ^bb10:  // pred: ^bb8
-    neura.br : to ^bb7
-  ^bb11:  // pred: ^bb5
-    neura.br : to ^bb4
-  ^bb12:  // pred: ^bb2
-    neura.br : to ^bb1
-  }
-}
-
diff --git a/test/affine2neura/gpt2-node30/node30_unroll.mlir b/test/affine2neura/gpt2-node30/node30_unroll.mlir
deleted file mode 100644
index e55fe54e..00000000
--- a/test/affine2neura/gpt2-node30/node30_unroll.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-#map = affine_map<(d0) -> (d0 + 1)>
-module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<64> : vector<4xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry, dense<32> : vector<4xi32>>, #dlti.dl_entry : vector<4xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i32>, #dlti.dl_entry<"dlti.endianness", "little">>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "polygeist.target-cpu" = "x86-64", "polygeist.target-features" = "+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87", "polygeist.tune-cpu" = "generic"} {
-  memref.global @A : memref<1x4x16x64xf32> = uninitialized
-  memref.global @C : memref<1x4x16x64xf32> = uninitialized
-  func.func @_Z6node30v() -> i32 attributes {llvm.linkage = #llvm.linkage} {
-    %cst = arith.constant 1.000000e+01 : f32
-    %0 = llvm.mlir.undef : i32
-    %1 = memref.get_global @C : memref<1x4x16x64xf32>
-    %2 = memref.get_global @A : memref<1x4x16x64xf32>
-    affine.for %arg0 = 0 to 4 {
-      affine.for %arg1 = 0 to 16 {
-        affine.for %arg2 = 0 to 64 step 2 {
-          %3 = affine.load %2[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
-          %4 = arith.mulf %3, %cst : f32
-          affine.store %4, %1[0, %arg0, %arg1, %arg2] : memref<1x4x16x64xf32>
-          %5 = affine.apply #map(%arg2)
-          %6 = affine.load %2[0, %arg0, %arg1, %5] : memref<1x4x16x64xf32>
-          %7 = arith.mulf %6, %cst : f32
-          affine.store %7, %1[0, %arg0, %arg1, %5] : memref<1x4x16x64xf32>
-        }
-      }
-    }
-    return %0 : i32
-  }
-}
-
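The *_neura.mlir golden files removed above are superseded by the FileCheck assertions kept in node30.mlir: instead of comparing byte-for-byte output, the test now checks that every affine op is eliminated and that three neura.loop_control ops appear, one per loop in the nest. For reference, a minimal 1-D sketch of the lowered shape those files captured, written against the deleted simpleloop kernel (C[i] = A[i] * 10.0f); %A, %C, %cst, the trip count, and the block names are illustrative, assuming the same --lower-affine-to-neura lowering shown in the deleted files:

    %lo   = neura.constant {value = 0 : index} : index    // lower bound
    %n    = neura.constant {value = 100 : index} : index  // loop bound
    %step = neura.constant {value = 1 : index} : index    // step
    neura.br %lo : index to ^header
  ^header(%i: index):  // one loop_control per loop-nest level
    neura.loop_control current_index : %i, step : %step, bound : %n, loop_type : "lt" then ^body(%i : index) else ^exit
  ^body(%iv: index):
    %v = neura.load_indexed memref<100xf32> %A[%iv] : f32   // fused address computation + load
    %m = arith.mulf %v, %cst : f32
    neura.store_indexed %m to memref<100xf32> %C[%iv] : f32 // fused address computation + store
    neura.br %iv : index to ^header
  ^exit:
    // falls through to the code after the loop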