[P0] Unexpectedly high II when mapping compute intensive kernel

Hi, all. I've been testing some compute-intensive kernels recently and found that, even though RecII and ResII are both low **(RecII = 5, ResII = 5)**, the final II is **14**, which is unexpectedly high.

After initial triage, I believe the issue is caused by the mapper. The current mapping logic places certain operations at inappropriate locations, which leaves their upstream and downstream operations without enough room to explore and ultimately forces the II to increase.

So could we enhance the current mapper to handle this case?

The IR (`target.mlir`) for reference:
```
#loop_unroll = #llvm.loop_unroll<disable = true>
#tbaa_root = #llvm.tbaa_root<id = "Simple C++ TBAA">
#loop_annotation = #llvm.loop_annotation<unroll = #loop_unroll, mustProgress = true>
#tbaa_type_desc = #llvm.tbaa_type_desc<id = "omnipotent char", members = {<#tbaa_root, 0>}>
#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "int", members = {<#tbaa_type_desc, 0>}>
#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "float", members = {<#tbaa_type_desc, 0>}>
#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc1, access_type = #tbaa_type_desc1, offset = 0>
#tbaa_tag1 = #llvm.tbaa_tag<base_type = #tbaa_type_desc2, access_type = #tbaa_type_desc2, offset = 0>
module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.ident = "clang version 20.1.7 (git@github.com:llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
  llvm.func local_unnamed_addr @_Z18kernel_phase1_aabbffffffPKfS0_S0_S0_S0_S0_PKiPii(%arg0: f32 {llvm.noundef}, %arg1: f32 {llvm.noundef}, %arg2: f32 {llvm.noundef}, %arg3: f32 {llvm.noundef}, %arg4: f32 {llvm.noundef}, %arg5: f32 {llvm.noundef}, %arg6: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg7: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg8: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg9: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg10: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg11: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg12: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg13: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.writeonly}, %arg14: i32 {llvm.noundef}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_inline, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %0 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %1 = llvm.mlir.constant(0 : i32) : i32
    %2 = llvm.mlir.constant(0 : i64) : i64
    %3 = llvm.mlir.constant(1 : i64) : i64
    %4 = llvm.fdiv %0, %arg3 : f32
    %5 = llvm.fdiv %0, %arg4 : f32
    %6 = llvm.fdiv %0, %arg5 : f32
    %7 = llvm.icmp "sgt" %arg14, %1 : i32
    llvm.cond_br %7, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    %8 = llvm.zext nneg %arg14 : i32 to i64
    llvm.br ^bb3(%2 : i64)
  ^bb2:  // 2 preds: ^bb0, ^bb3
    llvm.return
  ^bb3(%9: i64):  // 2 preds: ^bb1, ^bb3
    %10 = llvm.getelementptr inbounds %arg12[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %11 = llvm.load %10 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> i32
    %12 = llvm.sext %11 : i32 to i64
    %13 = llvm.getelementptr inbounds %arg6[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %14 = llvm.load %13 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %15 = llvm.getelementptr inbounds %arg7[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %16 = llvm.load %15 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %17 = llvm.getelementptr inbounds %arg8[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %18 = llvm.load %17 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %19 = llvm.getelementptr inbounds %arg9[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %20 = llvm.load %19 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %21 = llvm.getelementptr inbounds %arg10[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %22 = llvm.load %21 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %23 = llvm.getelementptr inbounds %arg11[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %24 = llvm.load %23 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %25 = llvm.fsub %14, %arg0 : f32
    %26 = llvm.fmul %4, %25 : f32
    %27 = llvm.fsub %20, %arg0 : f32
    %28 = llvm.fmul %4, %27 : f32
    %29 = llvm.intr.minnum(%26, %28) : (f32, f32) -> f32
    %30 = llvm.intr.maxnum(%26, %28) : (f32, f32) -> f32
    %31 = llvm.fsub %16, %arg1 : f32
    %32 = llvm.fmul %5, %31 : f32
    %33 = llvm.fsub %22, %arg1 : f32
    %34 = llvm.fmul %5, %33 : f32
    %35 = llvm.intr.minnum(%32, %34) : (f32, f32) -> f32
    %36 = llvm.intr.maxnum(%32, %34) : (f32, f32) -> f32
    %37 = llvm.fsub %18, %arg2 : f32
    %38 = llvm.fmul %6, %37 : f32
    %39 = llvm.fsub %24, %arg2 : f32
    %40 = llvm.fmul %6, %39 : f32
    %41 = llvm.intr.minnum(%38, %40) : (f32, f32) -> f32
    %42 = llvm.intr.maxnum(%38, %40) : (f32, f32) -> f32
    %43 = llvm.intr.maxnum(%29, %35) : (f32, f32) -> f32
    %44 = llvm.intr.maxnum(%43, %41) : (f32, f32) -> f32
    %45 = llvm.intr.minnum(%30, %36) : (f32, f32) -> f32
    %46 = llvm.intr.minnum(%45, %42) : (f32, f32) -> f32
    %47 = llvm.fcmp "ole" %44, %46 : f32
    %48 = llvm.zext %47 : i1 to i32
    %49 = llvm.getelementptr inbounds %arg13[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    llvm.store %48, %49 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : i32, !llvm.ptr
    %50 = llvm.add %9, %3 overflow<nsw, nuw> : i64
    %51 = llvm.icmp "eq" %50, %8 : i64
    llvm.cond_br %51, ^bb2, ^bb3(%50 : i64) {loop_annotation = #loop_annotation}
  }
}
```

The command I use:
```
/workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt --assign-accelerator --lower-llvm-to-neura --fuse-pattern --promote-func-arg-to-const --fold-constant --canonicalize-live-in --leverage-predicated-value --transform-ctrl-to-data-flow --fold-constant --view-op-graph --architecture-spec=../../../arch_spec/architecture.yaml --insert-data-mov --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=7,5" target.mlir
```

BTW, maybe we could link #175 to this issue to make development and debugging easier.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[P0] Unexpectedly high II when mapping compute intensive kernel #183

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[P0] Unexpectedly high II when mapping compute intensive kernel #183

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions