merge promotion/amd-mainline-open/2022.11.11 into amd-mainline-open

Bulk promotion for ROCm 5.5 from November 11, 2022. Merge remote-tracking branch 'gerritgit/promotion/amd-mainline-open/2022.11.11' into HEAD. Change-Id: I735c2f495d396ed2647b54ff342135f12ed210f6
ROCm · Dec 12, 2022 · 8192b03 · 8192b03
2 parents 05637f6 + 5db53a7
commit 8192b03
Show file tree

Hide file tree

Showing 9 changed files with 219 additions and 125 deletions.
diff --git a/asanrtl/src/stubs.cl b/asanrtl/src/stubs.cl
@@ -7,23 +7,23 @@
 
 #include "asan_util.h"
 
-void __asan_handle_no_return(void) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_handle_no_return(void) {}
 
-void __sanitizer_ptr_cmp(uptr a, uptr b) {}
+OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_cmp(uptr a, uptr b) {}
 
-void __sanitizer_ptr_sub(uptr a, uptr b) {}
+OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_sub(uptr a, uptr b) {}
 
-void __asan_before_dynamic_init(uptr addr) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_before_dynamic_init(uptr addr) {}
 
-void __asan_after_dynamic_init(void) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_after_dynamic_init(void) {}
 
-void __asan_register_image_globals(uptr flag) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_register_image_globals(uptr flag) {}
 
-void __asan_unregister_image_globals(uptr flag) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_image_globals(uptr flag) {}
 
-void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {}
 
-void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {}
 
 OPT_NONE NO_SANITIZE_ADDR void __asan_init(void) {}
 

diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h
@@ -456,6 +456,8 @@ extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *)
 extern void OCKL_MANGLE_T(rtcwait,u32)(uint);
 extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, ulong, ulong, ulong);
 
+extern uint OCKL_MANGLE_U32(alisa)(uint);
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 
 #endif // OCKL_H

diff --git a/ockl/src/alrs.cl b/ockl/src/alrs.cl
@@ -0,0 +1,178 @@
+
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "oclc.h"
+#include "ockl.h"
+// Helper: passes lane index l, shifted left by 2, together with value v to the ds_bpermute builtin and returns its result.
+static uint
+bpermute_u32(uint l, uint v)
+{
+    return __builtin_amdgcn_ds_bpermute(l << 2, v); // NOTE(review): "<< 2" presumably turns a lane index into the byte offset the builtin expects - confirm against the ISA
+}
+// Returns n summed over this lane and all exec-enabled lanes with a higher index (assumes bpermute_u32(i, v) yields lane i's v).
+uint
+OCKL_MANGLE_U32(alisa)(uint n)
+{
+    uint l = __ockl_lane_u32();
+    uint ret = n; // running total, seeded with this lane's own n
+
+    if (__oclc_wavefrontsize64) { // 64-bit exec mask, six doubling steps
+        const ulong nolsb = ~1UL; // every bit except bit 0
+        // NOTE(review): if slid == 64, "0x1UL << slid" presumably wraps to bit 0 (OpenCL C takes shift counts modulo the width); "& nolsb" then leaves smask untouched - confirm.
+        // Step 1: add n from the 1st active lane above l.
+        ulong smask = __builtin_amdgcn_read_exec() & ~((0x2UL << l) - 0x1UL); // exec bits strictly above lane l
+        int slid = (int)__ockl_ctz_u64(smask); // lowest lane left in smask; the "< 64" test below covers an empty smask
+        uint t = bpermute_u32(slid, n);
+        ret += slid < 64 ? t : 0; // discard t when there was no source lane
+        smask ^= (0x1UL << slid) & nolsb; // remove the lane just read
+
+        // Step 2: add the running total (ret, not n) from the 2nd active lane above l.
+        slid = (int)__ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+        // Remove 2 lanes from smask before step 3.
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = (int)__ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 3: add ret from the 4th active lane above l.
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+        // Remove 4 lanes from smask before step 4.
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 4: add ret from the 8th active lane above l.
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+        // Remove 8 lanes from smask before step 5.
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 5: add ret from the 16th active lane above l.
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+        // Remove 16 lanes from smask before step 6.
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 6: add ret from the 32nd active lane above l.
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+    } else { // same scheme with a 32-bit exec mask and five steps
+        const uint nolsb = ~1U; // every bit except bit 0; same guard as the 64-bit path, here for slid == 32
+
+        // Step 1: add n from the 1st active lane above l.
+        uint smask = __builtin_amdgcn_read_exec_lo() & ~((0x2U << l) - 0x1U); // exec bits strictly above lane l
+        int slid = (int)__ockl_ctz_u32(smask); // lowest lane left in smask; the "< 32" test below covers an empty smask
+        uint t = bpermute_u32(slid, n);
+        ret += slid < 32 ? t : 0; // discard t when there was no source lane
+        smask ^= (0x1U << slid) & nolsb; // remove the lane just read
+
+        // Step 2: add the running total (ret, not n) from the 2nd active lane above l.
+        slid = (int)__ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+        // Remove 2 lanes from smask before step 3.
+        smask ^= (0x1U << slid) & nolsb;
+        slid = (int)__ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 3: add ret from the 4th active lane above l.
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+        // Remove 4 lanes from smask before step 4.
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 4: add ret from the 8th active lane above l.
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+        // Remove 8 lanes from smask before step 5.
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 5: add ret from the 16th active lane above l.
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+    }
+
+    return ret;
+}
diff --git a/opencl/src/misc/printf.cl b/ockl/src/cprintf.cl
@@ -7,6 +7,9 @@
 
 #include "oclc.h"
 
+#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) // atomic load through P with memory order O, device scope
+#define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) // strong compare-exchange, device scope; O is passed as both order arguments
+
 #ifndef NULL
 #define NULL 0
 #endif
@@ -25,13 +28,13 @@ __printf_alloc(uint bytes)
     }
 
     uint size = ((__global uint *)ptr)[1];
-    uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);
+    uint offset = AL((__global atomic_uint *)ptr, memory_order_relaxed);
 
     for (;;) {
         if (OFFSET + offset + bytes > size)
             return NULL;
 
-        if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+        if (ACE((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed))
             break;
     }
 

diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl
@@ -177,9 +177,9 @@ typedef struct heap_s {
 
 // Inhibit control flow optimizations
 #define O0(X) X = o0(X)
-__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
-__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
-__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
+__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }
+__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }
+__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }
 
 // Atomics wrappers
 #define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device)

diff --git a/ockl/src/gaaf.cl b/ockl/src/gaaf.cl
@@ -15,7 +15,7 @@
 #define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S)
 #define AL(P, O, S) __opencl_atomic_load(P, O, S)
 
-extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32");
+extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1.f32");
 
 static void
 global_atomic_fadd(__global float *p, float v)

diff --git a/ockl/src/hostcall_impl.cl b/ockl/src/hostcall_impl.cl
@@ -100,9 +100,7 @@ static uint
 optimizationBarrierHack(uint in_val)
 {
     uint out_val;
-    __asm__ volatile("; ockl readfirstlane hoisting hack %0"
-                     : "=v"(out_val)
-                     : "0"(in_val));
+    __asm__ volatile("" : "=v"(out_val) : "0"(in_val)); // readfirstlane hoisting hack: empty volatile asm whose output is tied ("0") to in_val
     return out_val;
 }
 

diff --git a/ockl/src/wfaas.cl b/ockl/src/wfaas.cl
@@ -21,8 +21,7 @@ ATTR
 static int optimizationBarrierHack(int in_val)
 {
     int out_val;
-    __asm__ volatile ("; ockl ballot hoisting hack %0" :
-                      "=v"(out_val) : "0"(in_val));
+    __asm__ volatile ("" : "=v"(out_val) : "0"(in_val)); // ballot hoisting hack: empty volatile asm whose output is tied ("0") to in_val
     return out_val;
 }