diff --git a/asanrtl/src/stubs.cl b/asanrtl/src/stubs.cl index 7140557..364a050 100644 --- a/asanrtl/src/stubs.cl +++ b/asanrtl/src/stubs.cl @@ -7,23 +7,23 @@ #include "asan_util.h" -void __asan_handle_no_return(void) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_handle_no_return(void) {} -void __sanitizer_ptr_cmp(uptr a, uptr b) {} +OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_cmp(uptr a, uptr b) {} -void __sanitizer_ptr_sub(uptr a, uptr b) {} +OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_sub(uptr a, uptr b) {} -void __asan_before_dynamic_init(uptr addr) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_before_dynamic_init(uptr addr) {} -void __asan_after_dynamic_init(void) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_after_dynamic_init(void) {} -void __asan_register_image_globals(uptr flag) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_register_image_globals(uptr flag) {} -void __asan_unregister_image_globals(uptr flag) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_image_globals(uptr flag) {} -void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} -void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} OPT_NONE NO_SANITIZE_ADDR void __asan_init(void) {} diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h index d0b98d4..0477d7d 100644 --- a/ockl/inc/ockl.h +++ b/ockl/inc/ockl.h @@ -456,6 +456,8 @@ extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *) extern void OCKL_MANGLE_T(rtcwait,u32)(uint); extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, ulong, ulong, ulong); +extern uint OCKL_MANGLE_U32(alisa)(uint); + #pragma OPENCL EXTENSION cl_khr_fp16 : disable #endif // OCKL_H diff --git a/ockl/src/alrs.cl b/ockl/src/alrs.cl new file mode 100644 index 0000000..dc5d11f --- /dev/null +++ b/ockl/src/alrs.cl @@ -0,0 +1,178 @@ + +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "oclc.h" +#include "ockl.h" + +static uint +bpermute_u32(uint l, uint v) +{ + return __builtin_amdgcn_ds_bpermute(l << 2, v); +} + +uint +OCKL_MANGLE_U32(alisa)(uint n) +{ + uint l = __ockl_lane_u32(); + uint ret = n; + + if (__oclc_wavefrontsize64) { + const ulong nolsb = ~1UL; + + // Step 1 + ulong smask = __builtin_amdgcn_read_exec() & ~((0x2UL << l) - 0x1UL); + int slid = (int)__ockl_ctz_u64(smask); + uint t = bpermute_u32(slid, n); + ret += slid < 64 ? t : 0; + smask ^= (0x1UL << slid) & nolsb; + + // Step 2 + slid = (int)__ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = (int)__ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 3 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 4 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 5 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 6 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + } else { + const uint nolsb = ~1U; + + // Step 1 + uint smask = __builtin_amdgcn_read_exec_lo() & ~((0x2U << l) - 0x1U); + int slid = (int)__ockl_ctz_u32(smask); + uint t = bpermute_u32(slid, n); + ret += slid < 32 ? t : 0; + smask ^= (0x1U << slid) & nolsb; + + // Step 2 + slid = (int)__ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = (int)__ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 3 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 4 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 5 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + } + + return ret; +} diff --git a/opencl/src/misc/printf.cl b/ockl/src/cprintf.cl similarity index 78% rename from opencl/src/misc/printf.cl rename to ockl/src/cprintf.cl index 0686906..6975bae 100644 --- a/opencl/src/misc/printf.cl +++ b/ockl/src/cprintf.cl @@ -7,6 +7,9 @@ #include "oclc.h" +#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) +#define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) + #ifndef NULL #define NULL 0 #endif @@ -25,13 +28,13 @@ __printf_alloc(uint bytes) } uint size = ((__global uint *)ptr)[1]; - uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device); + uint offset = AL((__global atomic_uint *)ptr, memory_order_relaxed); for (;;) { if (OFFSET + offset + bytes > size) return NULL; - if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device)) + if (ACE((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed)) break; } diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl index 245b4a1..a947b69 100644 --- a/ockl/src/dm.cl +++ b/ockl/src/dm.cl @@ -177,9 +177,9 @@ typedef struct heap_s { // Inhibit control flow optimizations #define O0(X) X = o0(X) -__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } // Atomics wrappers #define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) diff --git a/ockl/src/gaaf.cl b/ockl/src/gaaf.cl index a11dfe0..89787de 100644 --- a/ockl/src/gaaf.cl +++ b/ockl/src/gaaf.cl @@ -15,7 +15,7 @@ #define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S) #define AL(P, O, S) __opencl_atomic_load(P, O, S) -extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32"); +extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1.f32"); static void global_atomic_fadd(__global float *p, float v) diff --git a/ockl/src/hostcall_impl.cl b/ockl/src/hostcall_impl.cl index ecf7b08..20af208 100644 --- a/ockl/src/hostcall_impl.cl +++ b/ockl/src/hostcall_impl.cl @@ -100,9 +100,7 @@ static uint optimizationBarrierHack(uint in_val) { uint out_val; - __asm__ volatile("; ockl readfirstlane hoisting hack %0" - : "=v"(out_val) - : "0"(in_val)); + __asm__ volatile("" : "=v"(out_val) : "0"(in_val)); return out_val; } diff --git a/ockl/src/wfaas.cl b/ockl/src/wfaas.cl index ac36d3b..9bac8dd 100644 --- a/ockl/src/wfaas.cl +++ b/ockl/src/wfaas.cl @@ -21,8 +21,7 @@ ATTR static int optimizationBarrierHack(int in_val) { int out_val; - __asm__ volatile ("; ockl ballot hoisting hack %0" : - "=v"(out_val) : "0"(in_val)); + __asm__ volatile ("" : "=v"(out_val) : "0"(in_val)); return out_val; } diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl index e1a9b2c..77e9d30 100644 --- a/opencl/src/pipes/wresvnp.cl +++ b/opencl/src/pipes/wresvnp.cl @@ -5,14 +5,25 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ +#include "oclc.h" +#include "ockl.h" #include "pipes.h" +static uint +active_lane_count(void) +{ + if (__oclc_wavefrontsize64) { + return __builtin_popcountl(__builtin_amdgcn_read_exec()); + } else { + return __builtin_popcount(__builtin_amdgcn_read_exec_lo()); + } +} + size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) { - uint alc = (size_t)(__builtin_popcount(__builtin_amdgcn_read_exec_lo()) + - __builtin_popcount(__builtin_amdgcn_read_exec_hi())); - uint l = __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0u)); + uint alc = active_lane_count(); + uint l = __ockl_lane_u32(); size_t rid; if (__builtin_amdgcn_read_exec() == (1UL << alc) - 1UL) { @@ -26,113 +37,16 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) rid = idx + (size_t)(sum - (uint)n); rid = idx != ~(size_t)0 ? rid : idx; } else { - // Inclusive add scan with not all lanes active - const ulong nomsb = 0x7fffffffffffffffUL; - - // Step 1 - ulong smask = __builtin_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL); - int slid = 63 - (int)clz(smask); - uint t = __builtin_amdgcn_ds_bpermute(slid << 2, n); - uint sum = n + (slid < 0 ? 0 : t); - smask ^= (0x1UL << slid) & nomsb; - - // Step 2 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 3 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 4 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 5 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 6 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - __builtin_amdgcn_wave_barrier(); + uint sum = __ockl_alisa_u32((uint)n); + uint al = __ockl_activelane_u32(); size_t idx = 0; - if (l == 63 - (int)clz(__builtin_amdgcn_read_exec())) { + if (al == 0) { idx = reserve(pidx, lim, (size_t)sum); } __builtin_amdgcn_wave_barrier(); - - // Broadcast - uint k = 63u - (uint)clz(__builtin_amdgcn_read_exec()); - idx = ((size_t)__builtin_amdgcn_readlane((uint)(idx >> 32), k) << 32) | - (size_t)__builtin_amdgcn_readlane((uint)idx, k); - __builtin_amdgcn_wave_barrier(); + idx = ((size_t)__builtin_amdgcn_readfirstlane((uint)(idx >> 32)) << 32) | + (size_t)__builtin_amdgcn_readfirstlane((uint)idx); rid = idx + (size_t)(sum - (uint)n); rid = idx != ~(size_t)0 ? rid : idx;