From 66a1c5bdca86ef8108263f0a3e2e21a6a343b643 Mon Sep 17 00:00:00 2001 From: Ron Lieberman Date: Tue, 12 Apr 2022 06:43:54 -0500 Subject: [PATCH 1/7] fix atomic.fadd.f32.p1.f32 related to opaque pointers changes in llvm Change-Id: I51a9ad703865218bee357b32e6be4fcd689b9071 --- ockl/src/gaaf.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ockl/src/gaaf.cl b/ockl/src/gaaf.cl index a11dfe0..89787de 100644 --- a/ockl/src/gaaf.cl +++ b/ockl/src/gaaf.cl @@ -15,7 +15,7 @@ #define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S) #define AL(P, O, S) __opencl_atomic_load(P, O, S) -extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32"); +extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1.f32"); static void global_atomic_fadd(__global float *p, float v) From 99a39cdf06c323c78d8a2ef9319195ba2f727c07 Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Wed, 14 Sep 2022 07:05:42 -0700 Subject: [PATCH 2/7] Fix dot2 check Change-Id: I5be830b92ca9869e161402d0cae024e9682f7ea1 --- ockl/src/dots.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ockl/src/dots.cl b/ockl/src/dots.cl index 12a1f4b..b88f694 100644 --- a/ockl/src/dots.cl +++ b/ockl/src/dots.cl @@ -65,7 +65,7 @@ __attribute__((target("dot8-insts"), const)) static uint amdgcn_sudot8(bool as, } #define SWDOT __oclc_ISA_version < 9006 || __oclc_ISA_version == 9009 || __oclc_ISA_version == 10100 -#define SWIDOT2 __oclc_ISA_version < 9006 || __oclc_ISA_version == 9009 || __oclc_ISA_version == 10100 || __oclc_ISA_version == 11000 +#define SWIDOT2 __oclc_ISA_version < 9006 || __oclc_ISA_version == 9009 || __oclc_ISA_version == 10100 || __oclc_ISA_version >= 11000 #define SUDOT __oclc_ISA_version >= 11000 #define AS_INT(X) __builtin_astype(X, int) From 2a1ce3e32ea186ca52c0e4d31b32b5cce3801ccf Mon Sep 17 
00:00:00 2001 From: Brian Sumner Date: Wed, 28 Sep 2022 14:00:50 -0700 Subject: [PATCH 3/7] Make scan more available and handle wave32 Change-Id: Ief7c68eb29c9ab190f7a1e33513061fbf02bbd27 --- ockl/inc/ockl.h | 2 + ockl/src/alrs.cl | 178 ++++++++++++++++++++++++++++++++++++ opencl/src/pipes/wresvnp.cl | 124 ++++--------------------- 3 files changed, 199 insertions(+), 105 deletions(-) create mode 100644 ockl/src/alrs.cl diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h index d0b98d4..0477d7d 100644 --- a/ockl/inc/ockl.h +++ b/ockl/inc/ockl.h @@ -456,6 +456,8 @@ extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *) extern void OCKL_MANGLE_T(rtcwait,u32)(uint); extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, ulong, ulong, ulong); +extern uint OCKL_MANGLE_U32(alisa)(uint); + #pragma OPENCL EXTENSION cl_khr_fp16 : disable #endif // OCKL_H diff --git a/ockl/src/alrs.cl b/ockl/src/alrs.cl new file mode 100644 index 0000000..dc5d11f --- /dev/null +++ b/ockl/src/alrs.cl @@ -0,0 +1,178 @@ + +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "oclc.h" +#include "ockl.h" + +static uint +bpermute_u32(uint l, uint v) +{ + return __builtin_amdgcn_ds_bpermute(l << 2, v); +} + +uint +OCKL_MANGLE_U32(alisa)(uint n) +{ + uint l = __ockl_lane_u32(); + uint ret = n; + + if (__oclc_wavefrontsize64) { + const ulong nolsb = ~1UL; + + // Step 1 + ulong smask = __builtin_amdgcn_read_exec() & ~((0x2UL << l) - 0x1UL); + int slid = (int)__ockl_ctz_u64(smask); + uint t = bpermute_u32(slid, n); + ret += slid < 64 ? t : 0; + smask ^= (0x1UL << slid) & nolsb; + + // Step 2 + slid = (int)__ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? 
t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = (int)__ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 3 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 4 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 5 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? 
t : 0; + + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + slid = __ockl_ctz_u64(smask); + smask ^= (0x1UL << slid) & nolsb; + + // Step 6 + slid = __ockl_ctz_u64(smask); + t = bpermute_u32(slid, ret); + ret += slid < 64 ? t : 0; + } else { + const uint nolsb = ~1U; + + // Step 1 + uint smask = __builtin_amdgcn_read_exec_lo() & ~((0x2U << l) - 0x1U); + int slid = (int)__ockl_ctz_u32(smask); + uint t = bpermute_u32(slid, n); + ret += slid < 32 ? t : 0; + smask ^= (0x1U << slid) & nolsb; + + // Step 2 + slid = (int)__ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = (int)__ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 3 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? 
t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 4 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + slid = __ockl_ctz_u32(smask); + smask ^= (0x1U << slid) & nolsb; + + // Step 5 + slid = __ockl_ctz_u32(smask); + t = bpermute_u32(slid, ret); + ret += slid < 32 ? t : 0; + } + + return ret; +} diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl index e1a9b2c..77e9d30 100644 --- a/opencl/src/pipes/wresvnp.cl +++ b/opencl/src/pipes/wresvnp.cl @@ -5,14 +5,25 @@ * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ +#include "oclc.h" +#include "ockl.h" #include "pipes.h" +static uint +active_lane_count(void) +{ + if (__oclc_wavefrontsize64) { + return __builtin_popcountl(__builtin_amdgcn_read_exec()); + } else { + return __builtin_popcount(__builtin_amdgcn_read_exec_lo()); + } +} + size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) { - uint alc = (size_t)(__builtin_popcount(__builtin_amdgcn_read_exec_lo()) + - __builtin_popcount(__builtin_amdgcn_read_exec_hi())); - uint l = __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0u)); + uint alc = active_lane_count(); + uint l = __ockl_lane_u32(); size_t rid; if (__builtin_amdgcn_read_exec() == (1UL << alc) - 1UL) { @@ -26,113 +37,16 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) rid = idx + (size_t)(sum - (uint)n); rid = idx != ~(size_t)0 ? rid : idx; } else { - // Inclusive add scan with not all lanes active - const ulong nomsb = 0x7fffffffffffffffUL; - - // Step 1 - ulong smask = __builtin_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL); - int slid = 63 - (int)clz(smask); - uint t = __builtin_amdgcn_ds_bpermute(slid << 2, n); - uint sum = n + (slid < 0 ? 0 : t); - smask ^= (0x1UL << slid) & nomsb; - - // Step 2 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 3 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 
0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 4 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 5 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - (int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - slid = 63 - 
(int)clz(smask); - smask ^= (0x1UL << slid) & nomsb; - - // Step 6 - slid = 63 - (int)clz(smask); - t = __builtin_amdgcn_ds_bpermute(slid << 2, sum); - sum += slid < 0 ? 0 : t; - __builtin_amdgcn_wave_barrier(); + uint sum = __ockl_alisa_u32((uint)n); + uint al = __ockl_activelane_u32(); size_t idx = 0; - if (l == 63 - (int)clz(__builtin_amdgcn_read_exec())) { + if (al == 0) { idx = reserve(pidx, lim, (size_t)sum); } __builtin_amdgcn_wave_barrier(); - - // Broadcast - uint k = 63u - (uint)clz(__builtin_amdgcn_read_exec()); - idx = ((size_t)__builtin_amdgcn_readlane((uint)(idx >> 32), k) << 32) | - (size_t)__builtin_amdgcn_readlane((uint)idx, k); - __builtin_amdgcn_wave_barrier(); + idx = ((size_t)__builtin_amdgcn_readfirstlane((uint)(idx >> 32)) << 32) | + (size_t)__builtin_amdgcn_readfirstlane((uint)idx); rid = idx + (size_t)(sum - (uint)n); rid = idx != ~(size_t)0 ? rid : idx; From 822711a83c47431cc56b717af0c88cc336ffcecc Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Mon, 3 Oct 2022 10:27:11 -0700 Subject: [PATCH 4/7] Move non-hostcall printf support to OCKL Change-Id: Iff5d28647b731d6e351413a2bfe3ca048112f35c --- opencl/src/misc/printf.cl => ockl/src/cprintf.cl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) rename opencl/src/misc/printf.cl => ockl/src/cprintf.cl (78%) diff --git a/opencl/src/misc/printf.cl b/ockl/src/cprintf.cl similarity index 78% rename from opencl/src/misc/printf.cl rename to ockl/src/cprintf.cl index 0686906..6975bae 100644 --- a/opencl/src/misc/printf.cl +++ b/ockl/src/cprintf.cl @@ -7,6 +7,9 @@ #include "oclc.h" +#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) +#define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) + #ifndef NULL #define NULL 0 #endif @@ -25,13 +28,13 @@ __printf_alloc(uint bytes) } uint size = ((__global uint *)ptr)[1]; - uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device); + uint offset = 
AL((__global atomic_uint *)ptr, memory_order_relaxed); for (;;) { if (OFFSET + offset + bytes > size) return NULL; - if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device)) + if (ACE((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed)) break; } From 4f9f2c24103a895d3e59791098804e4fd28197f5 Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Thu, 13 Oct 2022 14:33:34 -0700 Subject: [PATCH 5/7] Add needed attributes to other stubs. Change-Id: Ib06288fb89afa8c34bb26f0e38bf5adb1aaab1fd --- asanrtl/src/stubs.cl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/asanrtl/src/stubs.cl b/asanrtl/src/stubs.cl index 7140557..364a050 100644 --- a/asanrtl/src/stubs.cl +++ b/asanrtl/src/stubs.cl @@ -7,23 +7,23 @@ #include "asan_util.h" -void __asan_handle_no_return(void) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_handle_no_return(void) {} -void __sanitizer_ptr_cmp(uptr a, uptr b) {} +OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_cmp(uptr a, uptr b) {} -void __sanitizer_ptr_sub(uptr a, uptr b) {} +OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_sub(uptr a, uptr b) {} -void __asan_before_dynamic_init(uptr addr) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_before_dynamic_init(uptr addr) {} -void __asan_after_dynamic_init(void) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_after_dynamic_init(void) {} -void __asan_register_image_globals(uptr flag) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_register_image_globals(uptr flag) {} -void __asan_unregister_image_globals(uptr flag) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_image_globals(uptr flag) {} -void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} +OPT_NONE NO_SANITIZE_ADDR void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} -void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} +OPT_NONE NO_SANITIZE_ADDR void 
__asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} OPT_NONE NO_SANITIZE_ADDR void __asan_init(void) {} From 968b44a31bf399e42eabce1e6dc1e8b750bfbac0 Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Tue, 18 Oct 2022 10:45:18 -0700 Subject: [PATCH 6/7] Update workarounds to avoid hazard code. Change-Id: I6943dd794b3387a603249c06cb7adda7375252ca --- ockl/src/dm.cl | 6 +++--- ockl/src/hostcall_impl.cl | 4 +--- ockl/src/wfaas.cl | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl index 245b4a1..a947b69 100644 --- a/ockl/src/dm.cl +++ b/ockl/src/dm.cl @@ -177,9 +177,9 @@ typedef struct heap_s { // Inhibit control flow optimizations #define O0(X) X = o0(X) -__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } -__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } +__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } // Atomics wrappers #define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) diff --git a/ockl/src/hostcall_impl.cl b/ockl/src/hostcall_impl.cl index ecf7b08..20af208 100644 --- a/ockl/src/hostcall_impl.cl +++ b/ockl/src/hostcall_impl.cl @@ -100,9 +100,7 @@ static uint optimizationBarrierHack(uint in_val) { uint out_val; - __asm__ volatile("; ockl readfirstlane hoisting hack %0" - : "=v"(out_val) - : "0"(in_val)); + __asm__ volatile("" : "=v"(out_val) : "0"(in_val)); return out_val; } diff --git a/ockl/src/wfaas.cl b/ockl/src/wfaas.cl 
index ac36d3b..9bac8dd 100644 --- a/ockl/src/wfaas.cl +++ b/ockl/src/wfaas.cl @@ -21,8 +21,7 @@ ATTR static int optimizationBarrierHack(int in_val) { int out_val; - __asm__ volatile ("; ockl ballot hoisting hack %0" : - "=v"(out_val) : "0"(in_val)); + __asm__ volatile ("" : "=v"(out_val) : "0"(in_val)); return out_val; } From 7ed89777c1e14ac382a823e9d6163341699f5ef8 Mon Sep 17 00:00:00 2001 From: Brendon Cahoon Date: Thu, 27 Oct 2022 11:30:20 -0500 Subject: [PATCH 7/7] Fix sinH to return the valid result instead of nan A previous change swapped the return values so that the function returns nan when BUILTIN_ISFINITE_F16 is true. Change-Id: Ie37ced76c5b53787016bf1437a00cec3c2e34191 --- ocml/src/sinH.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocml/src/sinH.cl b/ocml/src/sinH.cl index 83ba1ca..527a8e7 100644 --- a/ocml/src/sinH.cl +++ b/ocml/src/sinH.cl @@ -21,7 +21,7 @@ MATH_MANGLE(sin)(half x) s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); if (!FINITE_ONLY_OPT()) { - s = BUILTIN_ISFINITE_F16(ax) ?(short)QNANBITPATT_HP16 : s; + s = BUILTIN_ISFINITE_F16(ax) ? s : (short)QNANBITPATT_HP16; } return AS_HALF(s);