From d30ceb02bb3845850c35724c6da46351ee22815e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 13 May 2024 10:43:25 -0400 Subject: [PATCH 01/15] Stub abstract interpretation support --- src/driver.jl | 3 +++ src/jlgen.jl | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/src/driver.jl b/src/driver.jl index 0bfdec31..e65a846d 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -130,6 +130,9 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); error("Unknown compilation output $output") end +@noinline function var"gpuc.deferred"(f, args...) end +@noinline function var"gpuc.lookup"(mi, f, args...) end + # primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism. # this could both be generalized (e.g. supporting actual function calls, instead of # returning a function pointer), and be integrated with the nonrecursive codegen. diff --git a/src/jlgen.jl b/src/jlgen.jl index a34bd42e..cdf965a6 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -435,6 +435,17 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter, return ret end +function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), + arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, + max_methods::Int = CC.get_max_methods(interp, f, sv)) + if f === var"gpuc.deferred" || + f === var"gpuc.lookup" + return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), CC.NoCallInfo()) + end + return @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f, + arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, + max_methods::Int) +end ## world view of the cache using Core.Compiler: WorldView From abb4cdd23a2b77cefe052104a4a720671f8b4842 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 13 May 2024 15:23:02 -0400 Subject: [PATCH 02/15] run abstract interpretation over deferred code --- src/jlgen.jl | 21 ++++++++++++++++++--- test/native_tests.jl | 14 ++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index cdf965a6..7ab20d1e 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -435,12 +435,27 @@ function CC.concrete_eval_eligible(interp::GPUInterpreter, return ret end +struct DeferredCallInfo <: CC.CallInfo + rt::DataType + info::CC.CallInfo +end + function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int = CC.get_max_methods(interp, f, sv)) - if f === var"gpuc.deferred" || - f === var"gpuc.lookup" - return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), CC.NoCallInfo()) + (; fargs, argtypes) = arginfo + if f === var"gpuc.deferred" + argvec = argtypes[2:end] + call = CC.abstract_call(interp, CC.ArgInfo(nothing, argvec), si, sv, max_methods) + callinfo = DeferredCallInfo(call.rt, call.info) + @static if VERSION < v"1.11.0-" + return CC.CallMeta(Ptr{Cvoid}, CC.Effects(), callinfo) + else + return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), callinfo) + end + end + if f === var"gpuc.lookup" + error("Unimplemented") end return @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, diff --git a/test/native_tests.jl b/test/native_tests.jl index 298c1010..9ea6ebd6 100644 --- a/test/native_tests.jl +++ b/test/native_tests.jl @@ -162,6 +162,20 @@ end ir = fetch(t) @test contains(ir, r"add i64 %\d+, 3") end + + @testset "deferred" begin + @gensym child kernel unrelated + @eval @noinline $child(i) = i + @eval $kernel(i) = 
GPUCompiler.var"gpuc.deferred"($child, i) + + # smoke test + job, _ = Native.create_job(eval(kernel), (Int64,)) + + ci, rt = only(GPUCompiler.code_typed(job)) + @test rt === Ptr{Cvoid} + + ir = sprint(io->GPUCompiler.code_llvm(io, job)) + end end ############################################################################################ From a85e06cbb857cf28be8f048b2bf8fa522a0cd357 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Mon, 13 May 2024 15:48:43 -0400 Subject: [PATCH 03/15] customize the opt pass --- src/jlgen.jl | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/jlgen.jl b/src/jlgen.jl index 7ab20d1e..82b9ce7a 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -462,6 +462,50 @@ function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), max_methods::Int) end +# Customize the optimization pipeline +Base.iterate(compact::CC.IncrementalCompact, state=nothing) = CC.iterate(compact, state) +Base.getindex(compact::CC.IncrementalCompact, idx) = CC.getindex(compact, idx) + +function deferred_pass!(ir) + compact = CC.IncrementalCompact(ir) + for ((old_idx, idx), stmt) in compact + if CC.is_known_call(stmt, var"gpuc.deferred", compact) + @safe_show stmt + end + end + CC.non_dce_finish!(compact) + CC.simple_dce!(compact) + ir = CC.complete(compact) + return ir +end + +function CC.optimize(interp::GPUInterpreter, opt::CC.OptimizationState, caller::CC.InferenceResult) + CC.@timeit "optimizer" ir = CC.run_passes(opt.src, opt, caller) + # Customizing the ipo_safe pipeline is a pain + ir = deferred_pass!(ir) + @static if VERSION >= v"1.11.0-" + CC.ipo_dataflow_analysis!(interp, ir, caller) + end + return CC.finish(interp, opt, ir, caller) +end + +function CC.typeinf_ircode(interp::GPUInterpreter, mi::CC.MethodInstance, + optimize_until::Union{Integer,AbstractString,Nothing}) + start_time = ccall(:jl_typeinf_timing_begin, UInt64, ()) + frame = CC.typeinf_frame(interp, mi, false) + if frame === nothing + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return nothing, Any + end + (; result) = frame + opt = CC.OptimizationState(frame, interp) + ir = CC.run_passes(opt.src, opt, result, optimize_until) + ir = deferred_pass!(ir) + rt = CC.widenconst(CC.ignorelimited(result.result)) + ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) + return ir, rt +end + ## world view of the cache using Core.Compiler: WorldView From 01b044418e9a50c3fc730267463babc8011d1ca1 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 May 2024 21:24:10 -0400 Subject: [PATCH 04/15] use handle_case from inlining instead of custom pass --- src/jlgen.jl | 62 ++++++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 41 deletions(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index 82b9ce7a..cce9f21a 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -462,48 +462,28 @@ function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), max_methods::Int) end -# Customize the optimization pipeline -Base.iterate(compact::CC.IncrementalCompact, state=nothing) = CC.iterate(compact, state) -Base.getindex(compact::CC.IncrementalCompact, idx) = CC.getindex(compact, idx) - -function deferred_pass!(ir) - compact = CC.IncrementalCompact(ir) - for ((old_idx, idx), stmt) in compact - if CC.is_known_call(stmt, var"gpuc.deferred", compact) - @safe_show stmt - end - end - CC.non_dce_finish!(compact) - CC.simple_dce!(compact) - ir = CC.complete(compact) - return ir -end - -function CC.optimize(interp::GPUInterpreter, 
opt::CC.OptimizationState, caller::CC.InferenceResult) - CC.@timeit "optimizer" ir = CC.run_passes(opt.src, opt, caller) - # Customizing the ipo_safe pipeline is a pain - ir = deferred_pass!(ir) - @static if VERSION >= v"1.11.0-" - CC.ipo_dataflow_analysis!(interp, ir, caller) - end - return CC.finish(interp, opt, ir, caller) -end - -function CC.typeinf_ircode(interp::GPUInterpreter, mi::CC.MethodInstance, - optimize_until::Union{Integer,AbstractString,Nothing}) - start_time = ccall(:jl_typeinf_timing_begin, UInt64, ()) - frame = CC.typeinf_frame(interp, mi, false) - if frame === nothing - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return nothing, Any +# Use the Inlining infrastructure to perform our refinement +function CC.handle_call!(todo::Vector{Pair{Int,Any}}, + ir::CC.IRCode, idx::CC.Int, stmt::Expr, info::DeferredCallInfo, flag::UInt8, sig::CC.Signature, + state::CC.InliningState) + + minfo = info.info + results = minfo.results + if length(results.matches) != 1 + return nothing end - (; result) = frame - opt = CC.OptimizationState(frame, interp) - ir = CC.run_passes(opt.src, opt, result, optimize_until) - ir = deferred_pass!(ir) - rt = CC.widenconst(CC.ignorelimited(result.result)) - ccall(:jl_typeinf_timing_end, Cvoid, (UInt64,), start_time) - return ir, rt + match = only(results.matches) + + # lookup the target mi with correct edge tracking + case = CC.compileable_specialization(match, CC.Effects(), CC.InliningEdgeTracker(state), info) + @assert case isa CC.InvokeCase + @assert stmt.head === :call + + # rewrite the marker function + stmt.args[1] = var"gpuc.lookup" + # insert the mi + insert!(stmt.args, 2, case.invoke) + return nothing end ## world view of the cache From 440d6be4a50708529042e6dc259ee2dc8efd0f41 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 14 May 2024 21:54:03 -0400 Subject: [PATCH 05/15] refine to llvmcall instead --- src/driver.jl | 4 ++-- src/jlgen.jl | 18 +++++++++++------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index e65a846d..4962f1fa 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -130,8 +130,8 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); error("Unknown compilation output $output") end -@noinline function var"gpuc.deferred"(f, args...) end -@noinline function var"gpuc.lookup"(mi, f, args...) end +# GPUCompiler intrinsic that marks deferred compilation +function var"gpuc.deferred" end # primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism. # this could both be generalized (e.g. 
supporting actual function calls, instead of diff --git a/src/jlgen.jl b/src/jlgen.jl index cce9f21a..ad13a1ad 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -454,9 +454,6 @@ function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), return CC.CallMeta(Ptr{Cvoid}, Union{}, CC.Effects(), callinfo) end end - if f === var"gpuc.lookup" - error("Unimplemented") - end return @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) @@ -479,10 +476,17 @@ function CC.handle_call!(todo::Vector{Pair{Int,Any}}, @assert case isa CC.InvokeCase @assert stmt.head === :call - # rewrite the marker function - stmt.args[1] = var"gpuc.lookup" - # insert the mi - insert!(stmt.args, 2, case.invoke) + args = Any[ + "extern gpuc.lookup", + Ptr{Cvoid}, + Core.svec(Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype + 0, + QuoteNode(:llvmcall), + case.invoke, + stmt.args[2:end]... + ] + stmt.head = :foreigncall + stmt.args = args return nothing end From b04f2f22e1f1299ba641cd5bacd6435cdfccdc2c Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Fri, 28 Jun 2024 14:12:10 -0400 Subject: [PATCH 06/15] add compiler support for gpuc.lookup --- src/driver.jl | 81 ++++++++++++++++++++++++++++++++++++++------------- src/jlgen.jl | 5 ++-- 2 files changed, 63 insertions(+), 23 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index 4962f1fa..e0928d88 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -131,6 +131,8 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); end # GPUCompiler intrinsic that marks deferred compilation +# In contrast to `deferred_codegen` this doesn't support arbitrary +# jobs as call targets. function var"gpuc.deferred" end # primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism. @@ -188,12 +190,28 @@ const __llvm_initialized = Ref(false) # since those modules have been finalized themselves, and we don't want to re-finalize. entry = finish_module!(job, ir, entry) + function unwrap_constant(val) + while val isa ConstantExpr + if opcode(val) == LLVM.API.LLVMIntToPtr || + opcode(val) == LLVM.API.LLVMBitCast || + opcode(val) == LLVM.API.LLVMAddrSpaceCast + val = first(operands(val)) + else + break + end + end + return val + end + # deferred code generation has_deferred_jobs = !only_entry && toplevel && - haskey(functions(ir), "deferred_codegen") + (haskey(functions(ir), "deferred_codegen") || + haskey(functions(ir), "gpuc.lookup")) + jobs = Dict{CompilerJob, String}(job => entry_fn) if has_deferred_jobs - dyn_marker = functions(ir)["deferred_codegen"] + dyn_marker = haskey(functions(ir), "deferred_codegen") ? functions(ir)["deferred_codegen"] : nothing + dyn_marker_v2 = haskey(functions(ir), "gpuc.lookup") ? 
functions(ir)["gpuc.lookup"] : nothing # iterative compilation (non-recursive) changed = true @@ -202,26 +220,40 @@ const __llvm_initialized = Ref(false) # find deferred compiler # TODO: recover this information earlier, from the Julia IR + # We can do this now with gpuc.lookup worklist = Dict{CompilerJob, Vector{LLVM.CallInst}}() - for use in uses(dyn_marker) - # decode the call - call = user(use)::LLVM.CallInst - id = convert(Int, first(operands(call))) - - global deferred_codegen_jobs - dyn_val = deferred_codegen_jobs[id] - - # get a job in the appopriate world - dyn_job = if dyn_val isa CompilerJob - # trust that the user knows what they're doing - dyn_val - else - ft, tt = dyn_val - dyn_src = methodinstance(ft, tt, tls_world_age()) - CompilerJob(dyn_src, job.config) + if dyn_marker !== nothing + for use in uses(dyn_marker) + # decode the call + call = user(use)::LLVM.CallInst + id = convert(Int, first(operands(call))) + + global deferred_codegen_jobs + dyn_val = deferred_codegen_jobs[id] + + # get a job in the appopriate world + dyn_job = if dyn_val isa CompilerJob + # trust that the user knows what they're doing + dyn_val + else + ft, tt = dyn_val + dyn_src = methodinstance(ft, tt, tls_world_age()) + CompilerJob(dyn_src, job.config) + end + + push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) end + end - push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) + if dyn_marker_v2 !== nothing + for use in uses(dyn_marker_v2) + # decode the call + call = user(use)::LLVM.CallInst + dyn_mi = Base.unsafe_pointer_to_objref( + convert(Ptr{Cvoid}, convert(Int, unwrap_constant(operands(call)[1])))) + dyn_job = CompilerJob(dyn_mi, job.config) + push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) + end end # compile and link @@ -263,8 +295,15 @@ const __llvm_initialized = Ref(false) end # all deferred compilations should have been resolved - @compiler_assert isempty(uses(dyn_marker)) job - unsafe_delete!(ir, dyn_marker) + if dyn_marker !== nothing + @compiler_assert isempty(uses(dyn_marker)) job + unsafe_delete!(ir, dyn_marker) + end + + if dyn_marker_v2 !== nothing + @compiler_assert isempty(uses(dyn_marker_v2)) job + unsafe_delete!(ir, dyn_marker_v2) + end end if toplevel diff --git a/src/jlgen.jl b/src/jlgen.jl index ad13a1ad..f723815c 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -318,7 +318,8 @@ else get_method_table_view(world::UInt, mt::MTType) = OverlayMethodTable(world, mt) end -struct GPUInterpreter <: CC.AbstractInterpreter +abstract type AbstractGPUInterpreter <: CC.AbstractInterpreter end +struct GPUInterpreter <: AbstractGPUInterpreter world::UInt method_table::GPUMethodTableView @@ -440,7 +441,7 @@ struct DeferredCallInfo <: CC.CallInfo info::CC.CallInfo end -function CC.abstract_call_known(interp::GPUInterpreter, @nospecialize(f), +function CC.abstract_call_known(interp::AbstractGPUInterpreter, @nospecialize(f), arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int = CC.get_max_methods(interp, f, sv)) (; fargs, argtypes) = arginfo From 448a8e1f4b8913cc10aff1cb81910bb35ee326fd Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 4 Jul 2024 15:27:32 -0400 Subject: [PATCH 07/15] WIP: Try to process gpuc.lookup early --- src/jlgen.jl | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/jlgen.jl b/src/jlgen.jl index f723815c..23bf2a79 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -751,6 +751,26 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) compiled[mi] = (; ci, func=llvm_func, specfunc=llvm_specfunc) end + 
# We don't control the interp that codegen constructs for us above. + # So we have to scan the IR manually. + for (mi, (ci::CodeInstance, _, _)) in compiled + src = @atomic :monotonic ci.inferred + if src isa String + src = Core.Compiler._uncompressed_ir(mi.def, src) + end + for expr in src.code + expr isa Expr || continue + if expr.head === :foreigncall && + expr.args[1] == "extern gpuc.lookup" + deferred_mi = expr.args[6] + # Now push to a worklist and process... + # TODO: How do we deal with call duplication? + # Can we codegen into the same module, or do we merge? + # we can check against "compiled" to avoid recursion? + end + end + end + # ensure that the requested method instance was compiled @assert haskey(compiled, job.source) From 36abc73c34c2bb6dd749e5cce6e26c97b4a5d4a3 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 4 Jul 2024 16:20:52 -0400 Subject: [PATCH 08/15] add worklist --- src/driver.jl | 95 ++++++++++++++++++++++++++++++--------------------- src/jlgen.jl | 34 +++++++++++++----- 2 files changed, 83 insertions(+), 46 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index e0928d88..06fe2607 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -204,14 +204,11 @@ const __llvm_initialized = Ref(false) end # deferred code generation - has_deferred_jobs = !only_entry && toplevel && - (haskey(functions(ir), "deferred_codegen") || - haskey(functions(ir), "gpuc.lookup")) + has_deferred_jobs = !only_entry && toplevel && haskey(functions(ir), "deferred_codegen") jobs = Dict{CompilerJob, String}(job => entry_fn) if has_deferred_jobs - dyn_marker = haskey(functions(ir), "deferred_codegen") ? functions(ir)["deferred_codegen"] : nothing - dyn_marker_v2 = haskey(functions(ir), "gpuc.lookup") ? functions(ir)["gpuc.lookup"] : nothing + dyn_marker = functions(ir)["deferred_codegen"] # iterative compilation (non-recursive) changed = true @@ -222,38 +219,25 @@ const __llvm_initialized = Ref(false) # TODO: recover this information earlier, from the Julia IR # We can do this now with gpuc.lookup worklist = Dict{CompilerJob, Vector{LLVM.CallInst}}() - if dyn_marker !== nothing - for use in uses(dyn_marker) - # decode the call - call = user(use)::LLVM.CallInst - id = convert(Int, first(operands(call))) - - global deferred_codegen_jobs - dyn_val = deferred_codegen_jobs[id] - - # get a job in the appopriate world - dyn_job = if dyn_val isa CompilerJob - # trust that the user knows what they're doing - dyn_val - else - ft, tt = dyn_val - dyn_src = methodinstance(ft, tt, tls_world_age()) - CompilerJob(dyn_src, job.config) - end - - push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) + for use in uses(dyn_marker) + # decode the call + call = user(use)::LLVM.CallInst + id = convert(Int, first(operands(call))) + + global deferred_codegen_jobs + dyn_val = deferred_codegen_jobs[id] + + # get a job in the appopriate world + dyn_job = if dyn_val isa CompilerJob + # trust that the user knows what they're doing + dyn_val + else + ft, tt = dyn_val + dyn_src = methodinstance(ft, tt, tls_world_age()) + CompilerJob(dyn_src, job.config) end - end - if dyn_marker_v2 !== nothing - for use in uses(dyn_marker_v2) - # decode the call - call = user(use)::LLVM.CallInst - dyn_mi = Base.unsafe_pointer_to_objref( - convert(Ptr{Cvoid}, convert(Int, unwrap_constant(operands(call)[1])))) - dyn_job = CompilerJob(dyn_mi, job.config) - push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) - end + push!(get!(worklist, dyn_job, LLVM.CallInst[]), call) end # compile and link @@ -299,11 +283,46 @@ const 
__llvm_initialized = Ref(false) @compiler_assert isempty(uses(dyn_marker)) job unsafe_delete!(ir, dyn_marker) end + end + + if haskey(functions(ir), "gpuc.lookup") + dyn_marker = functions(ir)["gpuc.lookup"] - if dyn_marker_v2 !== nothing - @compiler_assert isempty(uses(dyn_marker_v2)) job - unsafe_delete!(ir, dyn_marker_v2) + worklist = Dict{Any, Vector{LLVM.CallInst}}() + for use in uses(dyn_marker) + # decode the call + call = user(use)::LLVM.CallInst + dyn_mi = Base.unsafe_pointer_to_objref( + convert(Ptr{Cvoid}, convert(Int, unwrap_constant(operands(call)[1])))) + push!(get!(worklist, dyn_mi, LLVM.CallInst[]), call) end + + for dyn_mi in keys(worklist) + dyn_fn_name = compiled[dyn_mi].specfunc + dyn_fn = functions(ir)[dyn_fn_name] + + # insert a pointer to the function everywhere the entry is used + T_ptr = convert(LLVMType, Ptr{Cvoid}) + for call in worklist[dyn_mi] + @dispose builder=IRBuilder() begin + position!(builder, call) + fptr = if LLVM.version() >= v"17" + T_ptr = LLVM.PointerType() + bitcast!(builder, dyn_fn, T_ptr) + elseif VERSION >= v"1.12.0-DEV.225" + T_ptr = LLVM.PointerType(LLVM.Int8Type()) + bitcast!(builder, dyn_fn, T_ptr) + else + ptrtoint!(builder, dyn_fn, T_ptr) + end + replace_uses!(call, fptr) + end + unsafe_delete!(LLVM.parent(call), call) + end + end + + @compiler_assert isempty(uses(dyn_marker)) job + unsafe_delete!(ir, dyn_marker) end if toplevel diff --git a/src/jlgen.jl b/src/jlgen.jl index 23bf2a79..5f97ddd1 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -639,6 +639,24 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)") end + compiled = IdDict() + llvm_mod, outstanding = compile_method_instance(job, compiled) + worklist = outstanding + while !isempty(worklist) + source = pop!(worklist) + haskey(compiled, source) && continue + job2 = CompilerJob(source, job.config) + @debug "Processing..." job2 + llvm_mod2, outstanding = compile_method_instance(job2, compiled) + append!(worklist, outstanding) + @assert context(llvm_mod) == context(llvm_mod2) + link!(llvm_mod, llvm_mod2) + end + + return llvm_mod, compiled +end + +function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDict{Any, Any}) # populate the cache interp = get_interpreter(job) cache = CC.code_cache(interp) @@ -649,7 +667,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) # create a callback to look-up function in our cache, # and keep track of the method instances we needed. - method_instances = [] + method_instances = Any[] if Sys.ARCH == :x86 || Sys.ARCH == :x86_64 function lookup_fun(mi, min_world, max_world) push!(method_instances, mi) @@ -714,7 +732,6 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) end # process all compiled method instances - compiled = Dict() for mi in method_instances ci = ci_cache_lookup(cache, mi, job.world, job.world) ci === nothing && continue @@ -753,7 +770,9 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) # We don't control the interp that codegen constructs for us above. # So we have to scan the IR manually. 
- for (mi, (ci::CodeInstance, _, _)) in compiled + outstanding = Any[] + for mi in method_instances + ci = compiled[mi].ci src = @atomic :monotonic ci.inferred if src isa String src = Core.Compiler._uncompressed_ir(mi.def, src) @@ -763,10 +782,9 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) if expr.head === :foreigncall && expr.args[1] == "extern gpuc.lookup" deferred_mi = expr.args[6] - # Now push to a worklist and process... - # TODO: How do we deal with call duplication? - # Can we codegen into the same module, or do we merge? - # we can check against "compiled" to avoid recursion? + if !haskey(compiled, deferred_mi) + push!(outstanding, deferred_mi) + end end end end @@ -774,7 +792,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob)) # ensure that the requested method instance was compiled @assert haskey(compiled, job.source) - return llvm_mod, compiled + return llvm_mod, outstanding end # partially revert JuliaLangjulia#49391 From 5c2fbf31f5125bf45cb2c165d690f86ab2f3fc04 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sat, 13 Jul 2024 08:42:32 -0400 Subject: [PATCH 09/15] safe juia hackathon work --- src/jlgen.jl | 41 +++++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index 5f97ddd1..50d94e6b 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -491,6 +491,31 @@ function CC.handle_call!(todo::Vector{Pair{Int,Any}}, return nothing end +struct DeferredEdges + edges::Vector{MethodInstance} +end + +function CC.ipo_dataflow_analysis!(interp::AbstractGPUInterpreter, ir::CC.IRCode, caller::CC.InferenceResult) + edges = MethodInstance[] + # @aviateks: Can we add this instead in handle_call + for stmt in ir.stmts + inst = stmt[:inst] + @show inst + inst isa Expr || continue + expr = inst::Expr + if expr.head === :foreigncall && + expr.args[1] == "extern gpuc.lookup" + deferred_mi = expr.args[6] + push!(edges, deferred_mi) + end + end + unique!(edges) + if !isempty(edges) + CC.stack_analysis_result!(caller, DeferredEdges(edges)) + end + @invoke CC.ipo_dataflow_analysis!(interp::CC.AbstractInterpreter, ir::CC.IRCode, caller::CC.InferenceResult) +end + ## world view of the cache using Core.Compiler: WorldView @@ -768,20 +793,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi compiled[mi] = (; ci, func=llvm_func, specfunc=llvm_specfunc) end - # We don't control the interp that codegen constructs for us above. - # So we have to scan the IR manually. + # Collect the deferred edges outstanding = Any[] for mi in method_instances ci = compiled[mi].ci - src = @atomic :monotonic ci.inferred - if src isa String - src = Core.Compiler._uncompressed_ir(mi.def, src) + edges = CC.traverse_analysis_results(ci) do @nospecialize result + return result isa DeferredEdges ? 
result : return end - for expr in src.code - expr isa Expr || continue - if expr.head === :foreigncall && - expr.args[1] == "extern gpuc.lookup" - deferred_mi = expr.args[6] + @show edges + if edges !== nothing + for deferred_mi in (edges::DeferredEdges).edges if !haskey(compiled, deferred_mi) push!(outstanding, deferred_mi) end From cca8cfbee0a646806f8da3835a7fc70e79d0b181 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 14 Jul 2024 19:07:55 -0400 Subject: [PATCH 10/15] make things work on 1.10 --- src/jlgen.jl | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index 50d94e6b..acde0d3e 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -461,6 +461,7 @@ function CC.abstract_call_known(interp::AbstractGPUInterpreter, @nospecialize(f) end # Use the Inlining infrastructure to perform our refinement +# TODO: @aviatesk This is not reached on 1.11 function CC.handle_call!(todo::Vector{Pair{Int,Any}}, ir::CC.IRCode, idx::CC.Int, stmt::Expr, info::DeferredCallInfo, flag::UInt8, sig::CC.Signature, state::CC.InliningState) @@ -495,12 +496,11 @@ struct DeferredEdges edges::Vector{MethodInstance} end -function CC.ipo_dataflow_analysis!(interp::AbstractGPUInterpreter, ir::CC.IRCode, caller::CC.InferenceResult) +function find_deferred_edges(ir::CC.IRCode) edges = MethodInstance[] - # @aviateks: Can we add this instead in handle_call + # @aviatesk: Can we add this instead in handle_call for stmt in ir.stmts inst = stmt[:inst] - @show inst inst isa Expr || continue expr = inst::Expr if expr.head === :foreigncall && @@ -510,11 +510,29 @@ function CC.ipo_dataflow_analysis!(interp::AbstractGPUInterpreter, ir::CC.IRCode end end unique!(edges) + return edges +end + +if VERSION >= v"1.11.0-" +# stack_analysis_result and ipo_dataflow_analysis is 1.11 only +function CC.ipo_dataflow_analysis!(interp::AbstractGPUInterpreter, ir::CC.IRCode, caller::CC.InferenceResult) + edges = find_deferred_edges(ir) if !isempty(edges) CC.stack_analysis_result!(caller, DeferredEdges(edges)) end @invoke CC.ipo_dataflow_analysis!(interp::CC.AbstractInterpreter, ir::CC.IRCode, caller::CC.InferenceResult) end +else +# v1.10.0 +function CC.finish(interp::AbstractGPUInterpreter, opt::CC.OptimizationState, ir::CC.IRCode, caller::CC.InferenceResult) + edges = find_deferred_edges(ir) + if !isempty(edges) + # This is a tad bit risky, but nobody should be running EA on our results. + caller.argescapes = DeferredEdges(edges) + end + @invoke CC.finish(interp::CC.AbstractInterpreter, opt::CC.OptimizationState, ir::CC.IRCode, caller::CC.InferenceResult) +end +end ## world view of the cache using Core.Compiler: WorldView @@ -797,10 +815,16 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi outstanding = Any[] for mi in method_instances ci = compiled[mi].ci - edges = CC.traverse_analysis_results(ci) do @nospecialize result - return result isa DeferredEdges ? result : return + @static if VERSION >= v"1.11.0-" + edges = CC.traverse_analysis_results(ci) do @nospecialize result + return result isa DeferredEdges ? 
result : return + end + else + edges = ci.argescapes + if !(edges isa Union{Nothing, DeferredEdges}) + edges = nothing + end end - @show edges if edges !== nothing for deferred_mi in (edges::DeferredEdges).edges if !haskey(compiled, deferred_mi) From 958bec729f701b472c1a858ce28aff6a2cdfa379 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Sun, 14 Jul 2024 19:14:52 -0400 Subject: [PATCH 11/15] make inlining work on 1.11 --- src/jlgen.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index acde0d3e..a1ffdde1 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -461,9 +461,9 @@ function CC.abstract_call_known(interp::AbstractGPUInterpreter, @nospecialize(f) end # Use the Inlining infrastructure to perform our refinement -# TODO: @aviatesk This is not reached on 1.11 +const FlagType = VERSION >= v"1.11.0-" ? UInt32 : UInt8 function CC.handle_call!(todo::Vector{Pair{Int,Any}}, - ir::CC.IRCode, idx::CC.Int, stmt::Expr, info::DeferredCallInfo, flag::UInt8, sig::CC.Signature, + ir::CC.IRCode, idx::CC.Int, stmt::Expr, info::DeferredCallInfo, flag::FlagType, sig::CC.Signature, state::CC.InliningState) minfo = info.info From c5b8c71679ddb002c2c76ed76be20daf26f61cf6 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 16 Jul 2024 12:43:09 -0400 Subject: [PATCH 12/15] Handle 1.11 GVs --- src/driver.jl | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index 06fe2607..675b86d3 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -162,6 +162,29 @@ end end end +function find_base_object(val) + while true + if val isa ConstantExpr && (opcode(val) == LLVM.API.LLVMIntToPtr || + opcode(val) == LLVM.API.LLVMBitCast || + opcode(val) == LLVM.API.LLVMAddrSpaceCast) + val = first(operands(val)) + elseif val isa LLVM.IntToPtrInst || val isa LLVM.BitCastInst || val isa LLVM.AddrSpaceCastInst + val = first(operands(val)) + elseif val isa LLVM.LoadInst + # In 1.11+ we no longer embed integer constants directly. + gv = first(operands(val)) + if gv isa LLVM.GlobalValue + val = LLVM.initializer(gv) + continue + end + break + else + break + end + end + return val +end + const __llvm_initialized = Ref(false) @locked function emit_llvm(@nospecialize(job::CompilerJob); @@ -190,19 +213,6 @@ const __llvm_initialized = Ref(false) # since those modules have been finalized themselves, and we don't want to re-finalize. 
entry = finish_module!(job, ir, entry) - function unwrap_constant(val) - while val isa ConstantExpr - if opcode(val) == LLVM.API.LLVMIntToPtr || - opcode(val) == LLVM.API.LLVMBitCast || - opcode(val) == LLVM.API.LLVMAddrSpaceCast - val = first(operands(val)) - else - break - end - end - return val - end - # deferred code generation has_deferred_jobs = !only_entry && toplevel && haskey(functions(ir), "deferred_codegen") @@ -216,8 +226,6 @@ const __llvm_initialized = Ref(false) changed = false # find deferred compiler - # TODO: recover this information earlier, from the Julia IR - # We can do this now with gpuc.lookup worklist = Dict{CompilerJob, Vector{LLVM.CallInst}}() for use in uses(dyn_marker) # decode the call @@ -292,8 +300,10 @@ const __llvm_initialized = Ref(false) for use in uses(dyn_marker) # decode the call call = user(use)::LLVM.CallInst + dyn_mi_inst = find_base_object(operands(call)[1]) + @compiler_assert isa(dyn_mi_inst, LLVM.ConstantInt) job dyn_mi = Base.unsafe_pointer_to_objref( - convert(Ptr{Cvoid}, convert(Int, unwrap_constant(operands(call)[1])))) + convert(Ptr{Cvoid}, convert(Int, dyn_mi_inst))) push!(get!(worklist, dyn_mi, LLVM.CallInst[]), call) end From 569f7a18c694b95033bc6506bff0653f02dc9d14 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 16 Jul 2024 12:56:03 -0400 Subject: [PATCH 13/15] only process inferred mi --- src/jlgen.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/jlgen.jl b/src/jlgen.jl index a1ffdde1..79f2abdd 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -814,6 +814,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi # Collect the deferred edges outstanding = Any[] for mi in method_instances + !haskey(mi, compiled) && continue # Equivalent to ci_cache_lookup == nothing ci = compiled[mi].ci @static if VERSION >= v"1.11.0-" edges = CC.traverse_analysis_results(ci) do @nospecialize result From 3c80a5d58131cea618a24a5c63b7e1f86b129297 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 16 Jul 2024 14:36:18 -0400 Subject: [PATCH 14/15] fixup! only process inferred mi --- src/jlgen.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jlgen.jl b/src/jlgen.jl index 79f2abdd..a2b2bcb4 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -814,7 +814,7 @@ function compile_method_instance(@nospecialize(job::CompilerJob), compiled::IdDi # Collect the deferred edges outstanding = Any[] for mi in method_instances - !haskey(mi, compiled) && continue # Equivalent to ci_cache_lookup == nothing + !haskey(compiled, mi) && continue # Equivalent to ci_cache_lookup == nothing ci = compiled[mi].ci @static if VERSION >= v"1.11.0-" edges = CC.traverse_analysis_results(ci) do @nospecialize result From 0d5bcaf656403dbcb342061edc7606bd546f297a Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 16 Jul 2024 14:55:58 -0400 Subject: [PATCH 15/15] Add deferred_with --- src/driver.jl | 5 +++-- src/jlgen.jl | 18 ++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/driver.jl b/src/driver.jl index 675b86d3..2ed08061 100644 --- a/src/driver.jl +++ b/src/driver.jl @@ -131,10 +131,11 @@ function codegen(output::Symbol, @nospecialize(job::CompilerJob); end # GPUCompiler intrinsic that marks deferred compilation -# In contrast to `deferred_codegen` this doesn't support arbitrary -# jobs as call targets. 
function var"gpuc.deferred" end +# GPUCompiler intrinsic that marks deferred compilation, across backends +function var"gpuc.deferred.with" end + # primitive mechanism for deferred compilation, for implementing CUDA dynamic parallelism. # this could both be generalized (e.g. supporting actual function calls, instead of # returning a function pointer), and be integrated with the nonrecursive codegen. diff --git a/src/jlgen.jl b/src/jlgen.jl index a2b2bcb4..04aa8bdb 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -445,8 +445,9 @@ function CC.abstract_call_known(interp::AbstractGPUInterpreter, @nospecialize(f) arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int = CC.get_max_methods(interp, f, sv)) (; fargs, argtypes) = arginfo - if f === var"gpuc.deferred" - argvec = argtypes[2:end] + if f === var"gpuc.deferred" || f === var"gpuc.deferred.with" + first_arg = f === var"gpuc.deferred" ? 2 : 3 + argvec = argtypes[first_arg:end] call = CC.abstract_call(interp, CC.ArgInfo(nothing, argvec), si, sv, max_methods) callinfo = DeferredCallInfo(call.rt, call.info) @static if VERSION < v"1.11.0-" @@ -478,10 +479,14 @@ function CC.handle_call!(todo::Vector{Pair{Int,Any}}, @assert case isa CC.InvokeCase @assert stmt.head === :call + f = stmt.args[1] + name = f === var"gpuc.deferred" ? "extern gpuc.lookup" : "extern gpuc.lookup.with" + with_arg_T = f === var"gpuc.deferred" ? () : (Any,) + args = Any[ - "extern gpuc.lookup", + name, Ptr{Cvoid}, - Core.svec(Any, Any, match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype + Core.svec(Any, Any, with_arg_T..., match.spec_types.parameters[2:end]...), # Must use Any for MethodInstance or ftype 0, QuoteNode(:llvmcall), case.invoke, @@ -507,6 +512,11 @@ function find_deferred_edges(ir::CC.IRCode) expr.args[1] == "extern gpuc.lookup" deferred_mi = expr.args[6] push!(edges, deferred_mi) + elseif expr.head === :foreigncall && + expr.args[1] == "extern gpuc.lookup.with" + deferred_mi = expr.args[6] + with = expr.args[7] + @show (deferred_mi, with) end end unique!(edges)