-
Notifications
You must be signed in to change notification settings - Fork 54
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add GPUCompiler precompilation caching #425
base: master
Are you sure you want to change the base?
Changes from 13 commits
772bd94
a4bad27
4de3f62
11007f2
3dbe9d5
db12163
9bfdcee
44c5a7e
f86feeb
09d05df
c0d25a3
845be17
a6bd41a
cc34d21
456d4cb
1951087
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
# Unique symbol generated once when this file is loaded and bound as a constant
# (a `const` binding rather than a plain variable).
const CACHE_NAME = gensym(:CACHE)

"""
    is_precompiling() -> Bool

Return `true` when the runtime's `jl_generating_output` query reports a non-zero
value, and `false` otherwise.
"""
function is_precompiling()
    generating = ccall(:jl_generating_output, Cint, ())
    return generating != 0
end
|
||
export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler | ||
|
||
"""
    ci_cache_snapshot()

Return a new `IdDict` holding, for every key of `GPUCompiler.GLOBAL_CI_CACHES`, a
`GPUCompiler.CodeCache` built from the corresponding global cache.
"""
function ci_cache_snapshot()
    snapshot = IdDict()
    for (cfg, live_cache) in GPUCompiler.GLOBAL_CI_CACHES
        # Review note: constructing a CodeCache from an existing one is reported to
        # drop CodeInstances with finite world ranges, so copying and filtering
        # happen in one pass. Splitting this into "copy, then filter" was discussed
        # in review but not done.
        snapshot[cfg] = GPUCompiler.CodeCache(live_cache)
    end
    return snapshot
end
|
||
"""
    ci_cache_delta(previous_snapshot)

Return the part of the current cache snapshot that is absent from `previous_snapshot`.

The result maps cache keys to `GPUCompiler.CodeCache` objects and contains:
- every cache whose key is missing from `previous_snapshot` (the current cache
  object is reused as-is),
- every method instance missing from the previous cache for that key (the current
  vector of code instances is reused as-is),
- for method instances present in both, only the code instances not found in the
  previous vector.

Keys and method instances without new code instances do not appear in the result.
"""
function ci_cache_delta(previous_snapshot)
    current_snapshot = ci_cache_snapshot()
    delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
    for (cachekey, codecache) in current_snapshot
        # The whole cache is new: take it unchanged.
        if !haskey(previous_snapshot, cachekey)
            delta_snapshot[cachekey] = codecache
            continue
        end

        old_entries = previous_snapshot[cachekey].dict
        for (mi, civ) in codecache.dict
            # This method instance is new within an already known cache: take all
            # of its code instances.
            if !haskey(old_entries, mi)
                fresh = get!(() -> GPUCompiler.CodeCache(), delta_snapshot, cachekey)
                fresh.dict[mi] = civ
                continue
            end

            # Known method instance: keep only code instances not seen before. The
            # delta containers are created lazily so unchanged entries leave no trace.
            old_civ = old_entries[mi]
            for ci in civ
                ci in old_civ && continue
                fresh = get!(() -> GPUCompiler.CodeCache(), delta_snapshot, cachekey)
                added = get!(() -> Vector{CodeInstance}(), fresh.dict, mi)
                push!(added, ci)
            end
        end
    end
    return delta_snapshot
end
|
||
#=function ci_cache_insert(caches) | ||
empty!(GPUCompiler.GLOBAL_CI_CACHES) | ||
for (key, cache) in caches | ||
GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache) | ||
end | ||
end=# | ||
|
||
"""
    ci_cache_insert(cache)

Merge the code caches in `cache` (a mapping from cache key to `GPUCompiler.CodeCache`)
into `GPUCompiler.GLOBAL_CI_CACHES`. Does nothing while `is_precompiling()` is true.

Code instances whose `min_world` exceeds their `max_world` are dropped first, as are
method instances and caches left empty by that filtering. For a key that already
exists globally, each remaining method instance must carry exactly one code instance.
If the global cache already has code instances for that method instance, the incoming
one must have a strictly larger `min_world` than all of them; the existing ones are
then invalidated up to `min_world - 1` before the new one is stored.

Always returns `nothing`. Throws `AssertionError` if an incoming code instance is not
newer than the existing global ones.
"""
function ci_cache_insert(cache)
    is_precompiling() && return nothing

    # Phase 1: build a cleaned copy that only holds code instances with a non-empty
    # world range.
    cleaned_cache = IdDict()
    for (key, c) in cache
        kept = GPUCompiler.CodeCache()
        for (mi, civ) in c.dict
            valid = filter(ci -> ci.min_world <= ci.max_world, civ)
            isempty(valid) || (kept.dict[mi] = valid)
        end
        isempty(kept.dict) || (cleaned_cache[key] = kept)
    end

    # Phase 2: merge into the global caches at the code-instance level.
    for (key, local_cache) in cleaned_cache
        if !haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
            # No conflict at cache level. Insert the *cleaned* cache; the original
            # code inserted the unfiltered `cache[key]` here.
            GPUCompiler.GLOBAL_CI_CACHES[key] = local_cache
            continue
        end

        global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
        for (mi, civ) in local_cache.dict
            # Exactly one entry is expected, since only one range can be infinite.
            @assert length(civ) == 1 "expected exactly one code instance per method instance"
            ci = civ[1]
            if haskey(global_cache.dict, mi)
                gciv = global_cache.dict[mi]
                # Compare against the newest existing entry. The original called the
                # non-mutating `sort` and discarded its result, so it looked at
                # whichever element happened to be stored last.
                newest_min_world = maximum(x -> x.min_world, gciv)
                if ci.min_world > newest_min_world
                    invalidate_code_cache(global_cache, mi, ci.min_world - 1)
                    Core.Compiler.setindex!(global_cache, ci, mi)
                else
                    # Thrown explicitly so the check cannot be compiled out the way
                    # `@assert` may be.
                    throw(AssertionError(
                        "Should not get here: incoming code instance is not newer " *
                        "than the ones already in the global cache"))
                end
            else
                # Occurs if everything was invalidated in the parent and the entry
                # then has to be stored in the child.
                Core.Compiler.setindex!(global_cache, ci, mi)
            end
        end
    end
    return nothing
end
|
||
""" | ||
Given a compiler job, populates the job's code cache for `job.source` unless a matching entry is already cached | ||
""" | ||
function precompile_gpucompiler(job)
    # Gather everything the lookup and populate calls need from the job.
    cache = GPUCompiler.ci_cache(job)
    mt = GPUCompiler.method_table(job)
    interp = GPUCompiler.get_interpreter(job)

    # Already cached for `job.source`: nothing to do.
    found = GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint))
    found === nothing || return nothing

    # Cache miss: populate the cache and hand back whatever the populate call returns.
    return GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
end
|
||
""" | ||
Merges the entries of LOCAL_CACHE, which stores previously cached results, | ||
into the global cache (skipped while precompiling) | ||
""" | ||
function reinit_cache(LOCAL_CACHE)
    # NOTE(review): this function was flagged in review as leftover (dead) code and
    # is reported as removed in a later revision. Its merge logic mirrors the second
    # half of `ci_cache_insert`.

    # Nothing is merged while precompiling.
    is_precompiling() && return nothing

    # Merge caches at the code-instance level.
    for key in keys(LOCAL_CACHE)
        local_cache = LOCAL_CACHE[key]
        if !haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
            # No conflict at cache level: adopt the local cache as-is.
            GPUCompiler.GLOBAL_CI_CACHES[key] = local_cache
            continue
        end

        global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
        for (mi, civ) in local_cache.dict
            # Exactly one entry is expected, since only one range can be infinite.
            @assert length(civ) == 1 "expected exactly one code instance per method instance"
            ci = civ[1]
            if haskey(global_cache.dict, mi)
                gciv = global_cache.dict[mi]
                # Compare against the newest existing entry. The original called the
                # non-mutating `sort` and discarded its result, so it looked at
                # whichever element happened to be stored last.
                newest_min_world = maximum(x -> x.min_world, gciv)
                if ci.min_world > newest_min_world
                    invalidate_code_cache(global_cache, mi, ci.min_world - 1)
                    Core.Compiler.setindex!(global_cache, ci, mi)
                else
                    # Thrown explicitly so the check cannot be compiled out the way
                    # `@assert` may be.
                    throw(AssertionError(
                        "Should not get here: incoming code instance is not newer " *
                        "than the ones already in the global cache"))
                end
            else
                # Occurs if everything was invalidated in the parent and the entry
                # then has to be stored in the child.
                Core.Compiler.setindex!(global_cache, ci, mi)
            end
        end
    end
    return nothing
end
|
||
""" | ||
Takes a snapshot of the current status of the cache | ||
|
||
The cache returned is a deep copy with finite world age endings removed | ||
""" | ||
function snapshot_cache(LOCAL_CACHE)
    # Build a fresh CodeCache per global cache key; the CodeCache constructor is
    # relied on to keep only the elements with infinite ranges.
    cleaned_cache_to_save = IdDict()
    for (key, global_cache) in GPUCompiler.GLOBAL_CI_CACHES
        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(global_cache)
    end

    # Replace the contents of LOCAL_CACHE in place, so the caller's container is
    # reused. The unused `global MY_CACHE` declaration was dropped: the function
    # only touches its argument. Returns LOCAL_CACHE, as `merge!` does.
    empty!(LOCAL_CACHE)
    return merge!(LOCAL_CACHE, cleaned_cache_to_save)
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
module GPUKernel
using GPUCompiler
using TestRuntime

# Baseline of the global code caches, taken before anything is compiled below.
snapshot = GPUCompiler.ci_cache_snapshot()

# Compiler parameters for this test module; jobs using them get TestRuntime as
# their runtime module.
struct TestCompilerParams <: AbstractCompilerParams end
GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime

# Trivial kernel to compile.
kernel() = nothing

# Compile `kernel` to assembly for the native target and print the first element
# of the compile result.
function main()
    mi = methodinstance(typeof(kernel), Tuple{})
    cfg = CompilerConfig(NativeCompilerTarget(), TestCompilerParams())
    job = CompilerJob(mi, cfg)

    compiled = GPUCompiler.compile(:asm, job)
    println(compiled[1])
end

# Run the compilation at module load time, then keep whatever it added to the
# global caches relative to the baseline.
main()
const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)

# Put the saved entries back into the global caches when the module is initialized.
__init__() = GPUCompiler.ci_cache_insert(persistent_cache)

end # module GPUKernel
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
Persistent cache API: | ||
|
||
GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES, used | ||
as the baseline against which later additions are measured for persistent caching. | ||
|
||
GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes an earlier snapshot and returns | ||
a cache holding only the entries that are in the current GLOBAL_CI_CACHES but not in that snapshot. | ||
|
||
GPUCompiler.ci_cache_insert(snapshot::cache): merges the given cache (typically the result of ci_cache_delta) into GLOBAL_CI_CACHES; does nothing while precompiling | ||
|
||
|
||
Usage: | ||
snapshot = GPUCompiler.ci_cache_snapshot() | ||
... precompile work ... | ||
const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) | ||
|
||
function __init__() | ||
GPUCompiler.ci_cache_insert(persistent_cache) | ||
... rest of init logic ... | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
module TestRuntime

# Stub functions that do no work: `malloc` always yields a null pointer and every
# other function returns `nothing`.
signal_exception() = nothing
malloc(sz) = C_NULL
report_oom(sz) = nothing
report_exception(ex) = nothing
report_exception_name(ex) = nothing
report_exception_frame(idx, func, file, line) = nothing

end # module TestRuntime
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What is this needed for?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is used in https://github.com/collinwarner/GPUCompiler.jl/blob/3dbe9d5b7c7c5f56f18553f0e4d4bd9c2bdcaca5/src/precompile_native.jl#L102
It creates a CodeCache that contains unbounded entries only. Used when snapshotting.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we just write this as
filter(validate_codecache, cache.dict)
where valid is: … But that seems overeager: are we guaranteed just one entry? Or do we want to remove all CIs that don't have max_world?