JuliaGPU · collinwarner · Apr 9, 2023 · Apr 9, 2023 · Apr 9, 2023 · Apr 11, 2023
diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl
@@ -40,7 +40,10 @@ include("cache.jl")
 include("execution.jl")
 include("reflection.jl")
 
+
 include("precompile.jl")
+include("precompilation_cache.jl")
+
 _precompile_()
 
 function __init__()

diff --git a/src/jlgen.jl b/src/jlgen.jl
@@ -257,8 +257,28 @@ struct CodeCache
     dict::IdDict{MethodInstance,Vector{CodeInstance}}
 
     CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}())
+    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict))
+end
+
+function copyAndFilter(dict::IdDict)
+    # Collect surviving entries in a fresh, untyped IdDict.
+    kept = IdDict()
+    for (mi, instances) in dict
+        # Keep an entry only if none of its code instances has a max_world below
+        # the largest value of its integer type, i.e. none has been capped.
+        uncapped = all(instances) do ci
+            ci.max_world >= typemax(typeof(ci.max_world))
+        end
+        if uncapped
+            # The vector is stored as-is (shared with `dict`), not copied.
+            kept[mi] = instances
+        end
+    end
+
+    return kept
 end
 
+
 function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache)
     print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries")
     if !isempty(cc.dict)

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
@@ -0,0 +1,186 @@
+const CACHE_NAME = gensym(:CACHE) # gensym-generated symbol bound as a constant; not referenced elsewhere in this file
+is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 # true when the runtime's jl_generating_output returns nonzero
+# Public entry points of the persistent-cache API defined in this file.
+export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler
+
+function ci_cache_snapshot()
+    # One CodeCache copy per global cache; the copying constructor filters entries
+    # through copyAndFilter, so only uncapped world ranges survive.
+    snapshot = IdDict()
+    for (cachekey, codecache) in GPUCompiler.GLOBAL_CI_CACHES
+        snapshot[cachekey] = GPUCompiler.CodeCache(codecache)
+    end
+    return snapshot
+end
+
+function ci_cache_delta(previous_snapshot)
+    # Compare a fresh snapshot against `previous_snapshot` and collect what is new.
+    current_snapshot = ci_cache_snapshot()
+    delta = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
+
+    # Lazily create the per-key CodeCache / per-method vector in the delta.
+    delta_cache(cachekey) = get!(() -> GPUCompiler.CodeCache(), delta, cachekey)
+    delta_instances(cachekey, mi) =
+        get!(() -> Vector{CodeInstance}(), delta_cache(cachekey).dict, mi)
+
+    for (cachekey, codecache) in current_snapshot
+        if !haskey(previous_snapshot, cachekey)
+            # The whole cache is new: take it over unchanged.
+            delta[cachekey] = codecache
+            continue
+        end
+
+        known = previous_snapshot[cachekey].dict
+        for (mi, civ) in codecache.dict
+            if !haskey(known, mi)
+                # Method instance unseen before: all of its code instances are new.
+                delta_cache(cachekey).dict[mi] = civ
+                continue
+            end
+
+            # Method instance already known: keep only code instances not seen before.
+            for ci in civ
+                ci in known[mi] || push!(delta_instances(cachekey, mi), ci)
+            end
+        end
+    end
+    return delta
+end
+
+#=function ci_cache_insert(caches)
+    empty!(GPUCompiler.GLOBAL_CI_CACHES)
+    for (key, cache) in caches
+        GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache)
+    end
+end=#
+
+"""
+    ci_cache_insert(cache)
+
+Merge `cache` (e.g. the result of `ci_cache_delta`) into `GPUCompiler.GLOBAL_CI_CACHES`.
+
+Does nothing while Julia is generating output (precompiling). Throws if an inserted
+code instance is not newer than what the global cache already holds for its method.
+"""
+function ci_cache_insert(cache)
+    is_precompiling() && return nothing
+
+    # Step 1: drop code instances whose world range is empty (min_world > max_world),
+    # and with them any method instance or cache that is left with nothing.
+    cleaned_cache = IdDict()
+    for (key, c) in cache
+        kept = GPUCompiler.CodeCache()
+        for (mi, civ) in c.dict
+            # Typed vector, so storing it into `kept.dict` needs no conversion.
+            live = CodeInstance[ci for ci in civ if ci.min_world <= ci.max_world]
+            if !isempty(live)
+                kept.dict[mi] = live
+            end
+        end
+        isempty(kept.dict) || (cleaned_cache[key] = kept)
+    end
+
+    # Step 2: merge into the global caches at the code-instance level.
+    for (key, local_cache) in cleaned_cache
+        if !haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
+            # No cache for this key yet: install the cleaned cache (not `cache[key]`),
+            # so the instances filtered out in step 1 stay out.
+            GPUCompiler.GLOBAL_CI_CACHES[key] = local_cache
+            continue
+        end
+
+        global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
+        for (mi, civ) in local_cache.dict
+            # exactly one instance per method instance is expected here
+            @assert length(civ) == 1
+            ci = civ[1]
+            if haskey(global_cache.dict, mi)
+                # Compare against the largest min_world already stored; unlike reading
+                # the last element, this does not depend on the vector's order.
+                newest = maximum(x -> x.min_world, global_cache.dict[mi])
+                if ci.min_world <= newest
+                    error("ci_cache_insert: cached CodeInstance (min_world = ",
+                          ci.min_world, ") is not newer than the existing entry (",
+                          newest, ")")
+                end
+                # Presumably truncates the existing instances' world range so it ends
+                # right before the inserted one starts (helper defined elsewhere).
+                invalidate_code_cache(global_cache, mi, ci.min_world - 1)
+            end
+            # Also reached when the global cache has no entry for `mi` at all.
+            Core.Compiler.setindex!(global_cache, ci, mi)
+        end
+    end
+    return nothing
+end
+
+"""
+Given a compiler job, populates the job's code cache for `job.source` unless a matching entry already exists
+"""
+function precompile_gpucompiler(job)
+    codecache = GPUCompiler.ci_cache(job)
+    mtable = GPUCompiler.method_table(job)
+    interpreter = GPUCompiler.get_interpreter(job)
+    upper = typemax(Cint)
+    hit = GPUCompiler.ci_cache_lookup(codecache, job.source, job.world, upper)
+    hit === nothing || return nothing  # already cached: nothing to populate
+    return GPUCompiler.ci_cache_populate(interpreter, codecache, mtable, job.source, job.world, upper)
+end
+
+"""
+Merges the previously saved caches passed in as `LOCAL_CACHE` into the global
+cache (`GPUCompiler.GLOBAL_CI_CACHES`); does nothing while precompiling
+"""
+function reinit_cache(LOCAL_CACHE)
+    # Nothing is merged while Julia is generating output (precompiling).
+    is_precompiling() && return nothing
+
+    # Merge every saved cache into the global caches at the code-instance level.
+    for (key, local_cache) in LOCAL_CACHE
+        if !haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
+            # no conflict at cache level: install the saved cache as-is
+            GPUCompiler.GLOBAL_CI_CACHES[key] = local_cache
+            continue
+        end
+
+        global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
+        for (mi, civ) in local_cache.dict
+            # exactly one instance per method instance is expected here
+            @assert length(civ) == 1
+            ci = civ[1]
+
+            if haskey(global_cache.dict, mi)
+                # Compare against the largest min_world already stored; unlike reading
+                # the last element, this does not depend on the vector's order.
+                newest = maximum(x -> x.min_world, global_cache.dict[mi])
+                if ci.min_world <= newest
+                    error("reinit_cache: cached CodeInstance (min_world = ",
+                          ci.min_world, ") is not newer than the existing entry (",
+                          newest, ")")
+                end
+                # Presumably truncates the existing instances' world range so it ends
+                # right before the inserted one starts (helper defined elsewhere).
+                invalidate_code_cache(global_cache, mi, ci.min_world - 1)
+            end
+            # Also reached when the global cache has no entry for `mi` at all.
+            Core.Compiler.setindex!(global_cache, ci, mi)
+        end
+    end
+    return nothing
+end
+
+"""
+Takes a snapshot of the current status of the cache
+
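+`LOCAL_CACHE` is overwritten in place with a filtered copy in which entries with finite world age endings are removed (the code-instance vectors are shared, not deep-copied)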
+"""
+function snapshot_cache(LOCAL_CACHE)
+    # Take the same filtered copy of GLOBAL_CI_CACHES that ci_cache_snapshot()
+    # builds, rather than repeating its loop here.
+    cleaned_cache_to_save = ci_cache_snapshot()
+
+    # Overwrite LOCAL_CACHE in place: clear it, then copy the snapshot in. Only the
+    # argument is mutated, so no `global` declaration is needed.
+    empty!(LOCAL_CACHE)
+    # merge! hands back its first argument, so LOCAL_CACHE is the return value.
+    return merge!(LOCAL_CACHE, cleaned_cache_to_save)
+end
diff --git a/test/ExamplePersistentCache/GPUKernel.jl b/test/ExamplePersistentCache/GPUKernel.jl
@@ -0,0 +1,26 @@
+module GPUKernel
+using GPUCompiler
+using TestRuntime
+snapshot = GPUCompiler.ci_cache_snapshot() # baseline of the global caches, taken before the compilation below
+# Compiler parameters and runtime module used by this example.
+struct TestCompilerParams <: AbstractCompilerParams end
+GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime
+# Trivial kernel, plus a driver that builds a compiler job for it.
+kernel() = nothing
+function main()
+    source = methodinstance(typeof(kernel), Tuple{})
+    target = NativeCompilerTarget()
+    params = TestCompilerParams()
+    config = CompilerConfig(target, params)
+    job = CompilerJob(source, config)
+    # Compile the job to :asm and print the first element of the result.
+    println(GPUCompiler.compile(:asm, job)[1])
+end
+# Run the compilation at module top level, then record what it added to the caches.
+main()
+const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)
+# Merge the recorded entries back into the global caches in the module initializer.
+function __init__()
+    GPUCompiler.ci_cache_insert(persistent_cache)
+end
+end # module GPUKernel
diff --git a/test/ExamplePersistentCache/README.txt b/test/ExamplePersistentCache/README.txt
@@ -0,0 +1,20 @@
+Persistent cache API:
+
+GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES used 
+as a base point for what will be persistently cached.
+
+GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes a snapshot and returns
+the cache that represents the difference between (current GLOBAL_CI_CACHES - snapshot)
+
+GPUCompiler.ci_cache_insert(delta::cache): merges the cache returned by ci_cache_delta into GLOBAL_CI_CACHES (does nothing while precompiling)
+
+
+Usage:
+snapshot = GPUCompiler.ci_cache_snapshot()
+... precompile work ...
+const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)
+
+function __init__()
+    GPUCompiler.ci_cache_insert(persistent_cache)
+    ... rest of init logic ...
+end
diff --git a/test/ExamplePersistentCache/TestRuntime.jl b/test/ExamplePersistentCache/TestRuntime.jl
@@ -0,0 +1,8 @@
+module TestRuntime
+    signal_exception() = nothing  # stub: does nothing
+    malloc(sz) = C_NULL  # stub: always yields a null pointer
+    report_oom(sz) = nothing  # stub: ignores the requested size
+    report_exception(ex) = nothing  # stub: ignores the exception
+    report_exception_name(ex) = nothing  # stub: ignores the exception
+    report_exception_frame(idx, func, file, line) = nothing  # stub: ignores the frame info
+end # module TestRuntime
diff --git a/test/Project.toml b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc"