-
Notifications
You must be signed in to change notification settings - Fork 14
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Hi all. I've been testing some compute-intensive kernels recently and found that, even though RecII and ResII are both low (RecII = 5, ResII = 5), the final II is 14, which is unexpectedly high.
After initial triage, I believe the issue is caused by the mapper. The current mapping logic places certain operations in inappropriate locations, which leaves their upstream and downstream operations without enough room to explore, and this ultimately increases the II.
So could we enhance the current mapper to handle this case better?
The IR (target.mlir), for reference:
#loop_unroll = #llvm.loop_unroll<disable = true>
#tbaa_root = #llvm.tbaa_root<id = "Simple C++ TBAA">
#loop_annotation = #llvm.loop_annotation<unroll = #loop_unroll, mustProgress = true>
#tbaa_type_desc = #llvm.tbaa_type_desc<id = "omnipotent char", members = {<#tbaa_root, 0>}>
#tbaa_type_desc1 = #llvm.tbaa_type_desc<id = "int", members = {<#tbaa_type_desc, 0>}>
#tbaa_type_desc2 = #llvm.tbaa_type_desc<id = "float", members = {<#tbaa_type_desc, 0>}>
#tbaa_tag = #llvm.tbaa_tag<base_type = #tbaa_type_desc1, access_type = #tbaa_type_desc1, offset = 0>
#tbaa_tag1 = #llvm.tbaa_tag<base_type = #tbaa_type_desc2, access_type = #tbaa_type_desc2, offset = 0>
module attributes {dlti.dl_spec = #dlti.dl_spec<f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.ident = "clang version 20.1.7 (git@github.com:llvm/llvm-project.git 6146a88f60492b520a36f8f8f3231e15f3cc6082)"} {
// Kernel under test: a single counted loop over i in [0, %arg14).
// Per iteration it loads an i32 index from %arg12[i], gathers six f32 values
// from %arg6..%arg11 at that index, evaluates three independent
// (fsub, fmul, min/max) chains, reduces them, and stores a 0/1 result to
// %arg13[i].
// NOTE(review): the mangled name contains "kernel_phase1_aabb", so this is
// presumably a ray/AABB slab test with %arg0..%arg2 as the origin and
// %arg3..%arg5 as the direction -- confirm against the C++ source.
llvm.func local_unnamed_addr @_Z18kernel_phase1_aabbffffffPKfS0_S0_S0_S0_S0_PKiPii(%arg0: f32 {llvm.noundef}, %arg1: f32 {llvm.noundef}, %arg2: f32 {llvm.noundef}, %arg3: f32 {llvm.noundef}, %arg4: f32 {llvm.noundef}, %arg5: f32 {llvm.noundef}, %arg6: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg7: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg8: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg9: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg10: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg11: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg12: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.readonly}, %arg13: !llvm.ptr {llvm.nocapture, llvm.noundef, llvm.writeonly}, %arg14: i32 {llvm.noundef}) attributes {memory_effects = #llvm.memory_effects<other = none, argMem = readwrite, inaccessibleMem = none>, no_inline, no_unwind, passthrough = ["mustprogress", "nofree", "norecurse", "nosync", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
// Entry block (^bb0): constants and loop-invariant reciprocals.
%0 = llvm.mlir.constant(1.000000e+00 : f32) : f32
%1 = llvm.mlir.constant(0 : i32) : i32
%2 = llvm.mlir.constant(0 : i64) : i64
%3 = llvm.mlir.constant(1 : i64) : i64
// %4, %5, %6 = 1.0 / %arg3, 1.0 / %arg4, 1.0 / %arg5; computed once, outside the loop.
%4 = llvm.fdiv %0, %arg3 : f32
%5 = llvm.fdiv %0, %arg4 : f32
%6 = llvm.fdiv %0, %arg5 : f32
// Skip the loop entirely when the trip count %arg14 is not positive.
%7 = llvm.icmp "sgt" %arg14, %1 : i32
llvm.cond_br %7, ^bb1, ^bb2
^bb1: // pred: ^bb0
// Loop preheader: widen the trip count to i64 and enter the loop with i = 0.
%8 = llvm.zext nneg %arg14 : i32 to i64
llvm.br ^bb3(%2 : i64)
^bb2: // 2 preds: ^bb0, ^bb3
llvm.return
^bb3(%9: i64): // 2 preds: ^bb1, ^bb3
// Loop body; %9 is the induction variable i.
// Indirect index: %12 = sign-extended %arg12[i]. All six gathers below depend on it.
%10 = llvm.getelementptr inbounds %arg12[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
%11 = llvm.load %10 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : !llvm.ptr -> i32
%12 = llvm.sext %11 : i32 to i64
// Six f32 gathers at index %12, one each from %arg6..%arg11.
%13 = llvm.getelementptr inbounds %arg6[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%14 = llvm.load %13 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
%15 = llvm.getelementptr inbounds %arg7[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%16 = llvm.load %15 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
%17 = llvm.getelementptr inbounds %arg8[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%18 = llvm.load %17 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
%19 = llvm.getelementptr inbounds %arg9[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%20 = llvm.load %19 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
%21 = llvm.getelementptr inbounds %arg10[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%22 = llvm.load %21 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
%23 = llvm.getelementptr inbounds %arg11[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%24 = llvm.load %23 {alignment = 4 : i64, tbaa = [#tbaa_tag1]} : !llvm.ptr -> f32
// First chain: uses %arg0 and reciprocal %4 on the values from %arg6 and %arg9.
// %29 / %30 are the min / max of the two scaled differences.
%25 = llvm.fsub %14, %arg0 : f32
%26 = llvm.fmul %4, %25 : f32
%27 = llvm.fsub %20, %arg0 : f32
%28 = llvm.fmul %4, %27 : f32
%29 = llvm.intr.minnum(%26, %28) : (f32, f32) -> f32
%30 = llvm.intr.maxnum(%26, %28) : (f32, f32) -> f32
// Second chain: same shape, using %arg1 and reciprocal %5 on the values from %arg7 and %arg10.
%31 = llvm.fsub %16, %arg1 : f32
%32 = llvm.fmul %5, %31 : f32
%33 = llvm.fsub %22, %arg1 : f32
%34 = llvm.fmul %5, %33 : f32
%35 = llvm.intr.minnum(%32, %34) : (f32, f32) -> f32
%36 = llvm.intr.maxnum(%32, %34) : (f32, f32) -> f32
// Third chain: same shape, using %arg2 and reciprocal %6 on the values from %arg8 and %arg11.
%37 = llvm.fsub %18, %arg2 : f32
%38 = llvm.fmul %6, %37 : f32
%39 = llvm.fsub %24, %arg2 : f32
%40 = llvm.fmul %6, %39 : f32
%41 = llvm.intr.minnum(%38, %40) : (f32, f32) -> f32
%42 = llvm.intr.maxnum(%38, %40) : (f32, f32) -> f32
// Reduction: %44 = max of the three per-chain minima, %46 = min of the three per-chain maxima.
%43 = llvm.intr.maxnum(%29, %35) : (f32, f32) -> f32
%44 = llvm.intr.maxnum(%43, %41) : (f32, f32) -> f32
%45 = llvm.intr.minnum(%30, %36) : (f32, f32) -> f32
%46 = llvm.intr.minnum(%45, %42) : (f32, f32) -> f32
// Result: (%44 <= %46) as an ordered compare, widened to i32 and stored to %arg13[i].
%47 = llvm.fcmp "ole" %44, %46 : f32
%48 = llvm.zext %47 : i1 to i32
%49 = llvm.getelementptr inbounds %arg13[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
llvm.store %48, %49 {alignment = 4 : i64, tbaa = [#tbaa_tag]} : i32, !llvm.ptr
// Induction update: i + 1; exit to ^bb2 once it equals the widened trip count %8.
// The only loop-carried value passed along the back edge is %50.
%50 = llvm.add %9, %3 overflow<nsw, nuw> : i64
%51 = llvm.icmp "eq" %50, %8 : i64
llvm.cond_br %51, ^bb2, ^bb3(%50 : i64) {loop_annotation = #loop_annotation}
}
}
The command I used:
/workspace/dataflow/build/tools/mlir-neura-opt/mlir-neura-opt --assign-accelerator --lower-llvm-to-neura --fuse-pattern --promote-func-arg-to-const --fold-constant --canonicalize-live-in --leverage-predicated-value --transform-ctrl-to-data-flow --fold-constant --view-op-graph --architecture-spec=../../../arch_spec/architecture.yaml --insert-data-mov --map-to-accelerator="mapping-strategy=heuristic backtrack-config=customized=7,5" target.mlir
BTW, maybe we could link #175 to this issue to help with development and debugging.
Metadata
Metadata
Labels
enhancementNew feature or requestNew feature or request