Skip to content

[P0] Unexpectedly high II when mapping compute intensive kernel #183

@YanzhouTang

Description

@YanzhouTang

Hi all. I've been testing some compute-intensive kernels recently and found that, even though RecII and ResII are both low (RecII = 5, ResII = 5), the final II is 14, which is unexpectedly high.

After initial triage, I believe the issue is caused by the mapper. The current mapping logic places certain operations at inappropriate locations, which leaves their upstream and downstream operations without enough room to explore placements and ultimately forces the II to increase.

So, could we enhance the current mapper to handle this case better?

The IR (target.mlir), for reference:

// Reproducer IR (LLVM dialect): one module with a single function containing one loop.
// Attribute aliases: a loop annotation that disables unrolling, and TBAA
// descriptors/tags for "int" (#tbaa_tag) and "float" (#tbaa_tag1) accesses.
#loop_unroll = #llvm.loop_unroll<disable = true>
#tbaa_root = #llvm.tbaa_root<id = "Simple C++ TBAA">
#loop_annotation = #llvm.loop_annotation<unroll = #loop_unroll, mustProgress = true>
#tbaa_type_desc = #llvm.tbaa_type_desc<id = "omnipotent char", members = {<#tbaa_root, 0>}>
#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "int", members = {<#tbaa_type_desc, 0>}>
#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "float", members = {<#tbaa_type_desc, 0>}>
#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc1, access_type = #tbaa_type_desc1, offset = 0>
#tbaa_tag1 = #llvm.tbaa_tag<base_type = #tbaa_type_desc2, access_type = #tbaa_type_desc2, offset = 0>
module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.ident = "clang version 20.1.7 (git@github.com:llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
  // Arguments: six f32 scalars (%arg0-%arg5), seven read-only pointers
  // (%arg6-%arg12), one write-only pointer (%arg13), and an i32 count (%arg14).
  // NOTE(review): the dataflow below looks like a slab-style interval overlap
  // test (the mangled name mentions "aabb") — confirm against the C++ source.
  llvm.func local_unnamed_addr @_Z18kernel_phase1_aabbffffffPKfS0_S0_S0_S0_S0_PKiPii(%arg0: f32 {llvm.noundef}, %arg1: f32 {llvm.noundef}, %arg2: f32 {llvm.noundef}, %arg3: f32 {llvm.noundef}, %arg4: f32 {llvm.noundef}, %arg5: f32 {llvm.noundef}, %arg6: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg7: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg8: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg9: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg10: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg11: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg12: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg13: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.writeonly}, %arg14: i32 {llvm.noundef}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_inline, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    // Entry block: constants, then reciprocals of %arg3/%arg4/%arg5 computed
    // once outside the loop.
    %0 = llvm.mlir.constant(1.000000e+00 : f32) : f32
    %1 = llvm.mlir.constant(0 : i32) : i32
    %2 = llvm.mlir.constant(0 : i64) : i64
    %3 = llvm.mlir.constant(1 : i64) : i64
    %4 = llvm.fdiv %0, %arg3 : f32
    %5 = llvm.fdiv %0, %arg4 : f32
    %6 = llvm.fdiv %0, %arg5 : f32
    // Enter the loop only when %arg14 > 0 (signed); otherwise return directly.
    %7 = llvm.icmp "sgt" %arg14, %1 : i32
    llvm.cond_br %7, ^bb1, ^bb2
  ^bb1:  // pred: ^bb0
    // Preheader: zero-extend the count to i64 and start the loop at index 0.
    %8 = llvm.zext nneg %arg14 : i32 to i64
    llvm.br ^bb3(%2 : i64)
  ^bb2:  // 2 preds: ^bb0, ^bb3
    // Common exit.
    llvm.return
  ^bb3(%9: i64):  // 2 preds: ^bb1, ^bb3
    // Loop body; %9 is the loop index.
    // Load an i32 from %arg12[%9], sign-extend it, and use it as the index for
    // one f32 load from each of %arg6..%arg11 (indirect/gather-style access).
    %10 = llvm.getelementptr inbounds %arg12[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    %11 = llvm.load %10 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> i32
    %12 = llvm.sext %11 : i32 to i64
    %13 = llvm.getelementptr inbounds %arg6[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %14 = llvm.load %13 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %15 = llvm.getelementptr inbounds %arg7[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %16 = llvm.load %15 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %17 = llvm.getelementptr inbounds %arg8[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %18 = llvm.load %17 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %19 = llvm.getelementptr inbounds %arg9[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %20 = llvm.load %19 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %21 = llvm.getelementptr inbounds %arg10[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %22 = llvm.load %21 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    %23 = llvm.getelementptr inbounds %arg11[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %24 = llvm.load %23 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
    // Group 1: (%14 - %arg0) * (1/%arg3) and (%20 - %arg0) * (1/%arg3),
    // then the min and max of that pair.
    %25 = llvm.fsub %14, %arg0 : f32
    %26 = llvm.fmul %4, %25 : f32
    %27 = llvm.fsub %20, %arg0 : f32
    %28 = llvm.fmul %4, %27 : f32
    %29 = llvm.intr.minnum(%26, %28) : (f32, f32) -> f32
    %30 = llvm.intr.maxnum(%26, %28) : (f32, f32) -> f32
    // Group 2: same structure using %16/%22, %arg1 and 1/%arg4.
    %31 = llvm.fsub %16, %arg1 : f32
    %32 = llvm.fmul %5, %31 : f32
    %33 = llvm.fsub %22, %arg1 : f32
    %34 = llvm.fmul %5, %33 : f32
    %35 = llvm.intr.minnum(%32, %34) : (f32, f32) -> f32
    %36 = llvm.intr.maxnum(%32, %34) : (f32, f32) -> f32
    // Group 3: same structure using %18/%24, %arg2 and 1/%arg5.
    %37 = llvm.fsub %18, %arg2 : f32
    %38 = llvm.fmul %6, %37 : f32
    %39 = llvm.fsub %24, %arg2 : f32
    %40 = llvm.fmul %6, %39 : f32
    %41 = llvm.intr.minnum(%38, %40) : (f32, f32) -> f32
    %42 = llvm.intr.maxnum(%38, %40) : (f32, f32) -> f32
    // Reduce: %44 = max of the three per-group minima,
    //         %46 = min of the three per-group maxima.
    %43 = llvm.intr.maxnum(%29, %35) : (f32, f32) -> f32
    %44 = llvm.intr.maxnum(%43, %41) : (f32, f32) -> f32
    %45 = llvm.intr.minnum(%30, %36) : (f32, f32) -> f32
    %46 = llvm.intr.minnum(%45, %42) : (f32, f32) -> f32
    // Store (%44 <= %46) as an i32 0/1 into %arg13[%9].
    %47 = llvm.fcmp "ole" %44, %46 : f32
    %48 = llvm.zext %47 : i1 to i32
    %49 = llvm.getelementptr inbounds %arg13[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
    llvm.store %48, %49 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : i32, !llvm.ptr
    // Advance the index by 1; exit when it equals the extended count %8,
    // otherwise take the back edge (carries the unroll-disabled loop annotation).
    %50 = llvm.add %9, %3 overflow<nsw, nuw> : i64
    %51 = llvm.icmp "eq" %50, %8 : i64
    llvm.cond_br %51, ^bb2, ^bb3(%50 : i64) {loop_annotation = #loop_annotation}
  }
}

The command I use:

/workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt --assign-accelerator --lower-llvm-to-neura --fuse-pattern --promote-func-arg-to-const --fold-constant --canonicalize-live-in --leverage-predicated-value --transform-ctrl-to-data-flow --fold-constant --view-op-graph --architecture-spec=../../../arch_spec/architecture.yaml --insert-data-mov --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=7,5" target.mlir

BTW, maybe we could link #175 to this issue to make development and debugging easier.

Metadata

Metadata

Labels

enhancement: New feature or request

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions