Skip to content
This repository has been archived by the owner on May 14, 2024. It is now read-only.

Commit

Permalink
merge promotion/amd-mainline-open/2022.11.11 into amd-mainline-open
Browse files Browse the repository at this point in the history
  Bulk Promotion for ROCm 5.5
  from November 11, 2022

  Merge remote-tracking branch 'gerritgit/promotion/amd-mainline-open/2022.11.11' into HEAD

Change-Id: I735c2f495d396ed2647b54ff342135f12ed210f6
  • Loading branch information
David Salinas committed Dec 12, 2022
2 parents 05637f6 + 5db53a7 commit 8192b03
Show file tree
Hide file tree
Showing 9 changed files with 219 additions and 125 deletions.
18 changes: 9 additions & 9 deletions asanrtl/src/stubs.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,23 @@

#include "asan_util.h"

/*
 * No-op stubs: every function below has an empty body, so calling any of
 * them does nothing. The names follow the AddressSanitizer runtime
 * interface (__asan_* / __sanitizer_*).
 *
 * This is a diff rendering, so each changed stub appears twice: first the
 * previous definition, then its replacement, which differs only by the
 * added OPT_NONE NO_SANITIZE_ADDR prefix. __asan_init appears once because
 * this commit leaves it unchanged.
 *
 * NOTE(review): OPT_NONE and NO_SANITIZE_ADDR are not defined in this view;
 * presumably they come from asan_util.h -- confirm what they expand to.
 */
void __asan_handle_no_return(void) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_handle_no_return(void) {}

void __sanitizer_ptr_cmp(uptr a, uptr b) {}
OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_cmp(uptr a, uptr b) {}

void __sanitizer_ptr_sub(uptr a, uptr b) {}
OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_sub(uptr a, uptr b) {}

void __asan_before_dynamic_init(uptr addr) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_before_dynamic_init(uptr addr) {}

void __asan_after_dynamic_init(void) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_after_dynamic_init(void) {}

void __asan_register_image_globals(uptr flag) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_register_image_globals(uptr flag) {}

void __asan_unregister_image_globals(uptr flag) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_image_globals(uptr flag) {}

void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {}

void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {}
OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {}

OPT_NONE NO_SANITIZE_ADDR void __asan_init(void) {}

Expand Down
2 changes: 2 additions & 0 deletions ockl/inc/ockl.h
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,8 @@ extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *)
extern void OCKL_MANGLE_T(rtcwait,u32)(uint);
extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, ulong, ulong, ulong);

extern uint OCKL_MANGLE_U32(alisa)(uint);

#pragma OPENCL EXTENSION cl_khr_fp16 : disable

#endif // OCKL_H
Expand Down
178 changes: 178 additions & 0 deletions ockl/src/alrs.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@

/*===--------------------------------------------------------------------------
* ROCm Device Libraries
*
* This file is distributed under the University of Illinois Open Source
* License. See LICENSE.TXT for details.
*===------------------------------------------------------------------------*/

#include "oclc.h"
#include "ockl.h"

// Read the value v held by lane l of the wavefront.
//   l: index of the lane to read from
//   v: this lane's own value, which other lanes may read in the same call
// The lane index is scaled by 4 (l << 2) before it is handed to
// ds_bpermute, whose address operand is expressed in bytes.
static uint
bpermute_u32(uint l, uint v)
{
    const uint byte_index = l << 2;

    return __builtin_amdgcn_ds_bpermute(byte_index, v);
}

// For each active lane, return the sum of n over that lane and every active
// lane with a higher lane ID. Inactive lanes are skipped, so distances are
// counted in active lanes rather than in raw lane IDs.
// NOTE(review): the name "alisa" is not expanded anywhere in this file;
// presumably it abbreviates this operation -- confirm.
//
// Method: a log-step scan. `above` starts as the exec bits of the lanes
// strictly above this one (bits 0..lane are cleared), so its lowest set bit
// is the nearest active lane above. The first step adds that lane's value.
// Before each later step, `skip` further low bits are cleared so that the
// lowest remaining bit is the active lane 2 * skip places above, whose
// running sum is then added. Wave64 performs 6 steps (distances 1..32) and
// wave32 performs 5 (distances 1..16). All loop bounds are compile-time
// constants.
//
// Empty mask: the `src < 64` / `src < 32` tests imply that ctz yields the
// mask width once no lanes remain. The shift `1 << src` then relies on
// OpenCL C reducing the shift count modulo the operand width, which gives
// bit 0; `keep` masks bit 0 away so the XOR leaves the empty mask empty.
// The value fetched by bpermute_u32 in that case is discarded.
uint
OCKL_MANGLE_U32(alisa)(uint n)
{
    const uint lane = __ockl_lane_u32();
    uint sum = n;

    if (__oclc_wavefrontsize64) {
        const ulong keep = ~1UL;
        ulong above = __builtin_amdgcn_read_exec() & ~((0x2UL << lane) - 0x1UL);
        int src = (int)__ockl_ctz_u64(above);

        // Distance 1: nearest active lane above this one.
        uint got = bpermute_u32(src, sum);
        sum += src < 64 ? got : 0;

        // Distances 2, 4, 8, 16 and 32.
        for (uint skip = 1; skip < 32; skip <<= 1) {
            // Drop the `skip` lowest remaining lanes from the mask.
            for (uint i = 0; i < skip; ++i) {
                above ^= (0x1UL << src) & keep;
                src = (int)__ockl_ctz_u64(above);
            }

            got = bpermute_u32(src, sum);
            sum += src < 64 ? got : 0;
        }
    } else {
        const uint keep = ~1U;
        uint above = __builtin_amdgcn_read_exec_lo() & ~((0x2U << lane) - 0x1U);
        int src = (int)__ockl_ctz_u32(above);

        // Distance 1: nearest active lane above this one.
        uint got = bpermute_u32(src, sum);
        sum += src < 32 ? got : 0;

        // Distances 2, 4, 8 and 16.
        for (uint skip = 1; skip < 16; skip <<= 1) {
            // Drop the `skip` lowest remaining lanes from the mask.
            for (uint i = 0; i < skip; ++i) {
                above ^= (0x1U << src) & keep;
                src = (int)__ockl_ctz_u32(above);
            }

            got = bpermute_u32(src, sum);
            sum += src < 32 ? got : 0;
        }
    }

    return sum;
}
7 changes: 5 additions & 2 deletions opencl/src/misc/printf.cl → ockl/src/cprintf.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@

#include "oclc.h"

// Atomics wrappers, both fixed to device memory scope.
//   AL(P, O):        atomic load from P with memory order O.
//   ACE(P, E, V, O): strong compare-exchange on P, with the expected value
//                    read from and written back through E and V as the
//                    desired value; O is used for both the success and the
//                    failure memory order.
#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device)
#define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device)

#ifndef NULL
#define NULL 0
#endif
Expand All @@ -25,13 +28,13 @@ __printf_alloc(uint bytes)
}

uint size = ((__global uint *)ptr)[1];
uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);
uint offset = AL((__global atomic_uint *)ptr, memory_order_relaxed);

for (;;) {
if (OFFSET + offset + bytes > size)
return NULL;

if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
if (ACE((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed))
break;
}

Expand Down
6 changes: 3 additions & 3 deletions ockl/src/dm.cl
Original file line number Diff line number Diff line change
Expand Up @@ -177,9 +177,9 @@ typedef struct heap_s {

// Inhibit control flow optimizations
//
// O0(X) overwrites X with o0(X). Each o0 overload hands its argument to a
// volatile asm statement whose output y is tied to the input x by the "0"
// constraint, then returns y. The asm emits no instructions, so the value
// is unchanged, but the compiler can no longer see where it came from.
//
// Diff rendering: the first three overloads are the previous versions,
// whose asm template was the assembly comment "; O0 %0"; the last three are
// their replacements, which use an empty template.
// NOTE(review): "=v" is presumably the AMDGPU vector-register constraint --
// confirm against the LLVM AMDGPU inline-asm documentation.
#define O0(X) X = o0(X)
__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }
__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }
__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; }

// Atomics wrappers
#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device)
Expand Down
2 changes: 1 addition & 1 deletion ockl/src/gaaf.cl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S)
#define AL(P, O, S) __opencl_atomic_load(P, O, S)

extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32");
extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1.f32");

static void
global_atomic_fadd(__global float *p, float v)
Expand Down
4 changes: 1 addition & 3 deletions ockl/src/hostcall_impl.cl
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,7 @@ static uint
optimizationBarrierHack(uint in_val)
{
// Returns in_val unchanged. The value is routed through a volatile asm
// statement whose output out_val is tied to in_val by the "0" constraint,
// so the compiler cannot see that the result equals the input. The text in
// the older template names the purpose: a workaround for readfirstlane
// hoisting.
//
// Diff rendering: the three-line asm statement is the previous version and
// the single-line asm with an empty template is its replacement.
uint out_val;
__asm__ volatile("; ockl readfirstlane hoisting hack %0"
: "=v"(out_val)
: "0"(in_val));
__asm__ volatile("" : "=v"(out_val) : "0"(in_val));
return out_val;
}

Expand Down
3 changes: 1 addition & 2 deletions ockl/src/wfaas.cl
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ ATTR
// Returns in_val unchanged. The value is routed through a volatile asm
// statement whose output out_val is tied to in_val by the "0" constraint,
// so the compiler cannot see that the result equals the input. The text in
// the older template names the purpose: a workaround for ballot hoisting.
//
// Diff rendering: the two-line asm statement is the previous version and
// the single-line asm with an empty template is its replacement.
static int optimizationBarrierHack(int in_val)
{
int out_val;
__asm__ volatile ("; ockl ballot hoisting hack %0" :
"=v"(out_val) : "0"(in_val));
__asm__ volatile ("" : "=v"(out_val) : "0"(in_val));
return out_val;
}

Expand Down
Loading

0 comments on commit 8192b03

Please sign in to comment.