Skip to content
This repository has been archived by the owner on May 14, 2024. It is now read-only.

Commit

Permalink
Merge pull request #49 from RadeonOpenCompute/roc-1.7.x-rc4
Browse files (browse the repository at this point in the history)
roc-1.7.x rc4 updates
  • Loading branch information
kzhuravl authored Dec 11, 2017
2 parents 197e51e + 2f3851d commit 738fe62
Show file tree
Hide file tree
Showing 7 changed files with 24 additions and 28 deletions.
4 changes: 0 additions & 4 deletions irif/inc/irif.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,6 @@ extern void __llvm_amdcgn_buffer_wbinvl1_vol(void) __asm("llvm.amdgcn.buffer.wbi
extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_lo(uint, uint) __asm("llvm.amdgcn.mbcnt.lo");
extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_hi(uint, uint) __asm("llvm.amdgcn.mbcnt.hi");

extern __attribute__((convergent)) ulong __llvm_amdgcn_read_exec(void);
extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_lo(void);
extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_hi(void);

extern uint __llvm_amdgcn_s_getreg(uint) __asm("llvm.amdgcn.s.getreg");

extern uint __llvm_amdgcn_readfirstlane(uint) __asm("llvm.amdgcn.readfirstlane");
Expand Down
4 changes: 2 additions & 2 deletions ockl/src/activelane.cl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ __attribute__((always_inline)) uint
OCKL_MANGLE_U32(activelane)(void)
{
// TODO - check that this compiles to the desired 2 ISA instructions
return __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(),
__llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u));
return __llvm_amdgcn_mbcnt_hi(__builtin_amdgcn_read_exec_hi(),
__llvm_amdgcn_mbcnt_lo(__builtin_amdgcn_read_exec_lo(), 0u));
}

10 changes: 5 additions & 5 deletions ockl/src/hsaqs.cl
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ update_mbox(const __global amd_signal_t *sig)
__global atomic_ulong *mb = (__global atomic_ulong *)sig->event_mailbox_ptr;
if (mb) {
uint id = sig->event_id;
atomic_store_explicit(mb, id, memory_order_release, memory_scope_all_svm_devices);
AS(mb, id, memory_order_release, memory_scope_all_svm_devices);
__builtin_amdgcn_s_sendmsg(1 | (0 << 4), __llvm_amdgcn_readfirstlane(id) & 0xff);
}
}
Expand Down Expand Up @@ -145,22 +145,22 @@ OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_orde
update_mbox(s);
} else if (__oclc_ISA_version() >= 900) {
// Hardware doorbell supports AQL semantics.
atomic_store_explicit((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, memory_order_release, memory_scope_all_svm_devices);
AS((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, memory_order_release, memory_scope_all_svm_devices);
} else {

{
__global amd_queue_t * q = s->queue_ptr;
__global atomic_uint *lp = (__global atomic_uint *)&q->legacy_doorbell_lock;
uint e = 0;
while (!atomic_compare_exchange_strong_explicit(lp, &e, (uint)1, memory_order_acquire, memory_order_relaxed, memory_scope_all_svm_devices)) {
while (!AC(lp, &e, (uint)1, memory_order_acquire, memory_order_relaxed, memory_scope_all_svm_devices)) {
__llvm_amdgcn_s_sleep(1);
e = 0;
}

ulong legacy_dispatch_id = value + 1;

if (legacy_dispatch_id > q->max_legacy_doorbell_dispatch_id_plus_1) {
atomic_store_explicit((__global atomic_ulong *)&q->max_legacy_doorbell_dispatch_id_plus_1, legacy_dispatch_id, memory_order_relaxed, memory_scope_all_svm_devices);
AS((__global atomic_ulong *)&q->max_legacy_doorbell_dispatch_id_plus_1, legacy_dispatch_id, memory_order_relaxed, memory_scope_all_svm_devices);

if (__oclc_ISA_version() < 800) {
legacy_dispatch_id = (ulong)(((uint)legacy_dispatch_id & ((q->hsa_queue.size << 1) - 1)) * 16);
Expand All @@ -169,7 +169,7 @@ OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_orde
*s->legacy_hardware_doorbell_ptr = (uint)legacy_dispatch_id;
}

atomic_store_explicit(lp, 0, memory_order_release, memory_scope_all_svm_devices);
AS(lp, 0, memory_order_release, memory_scope_all_svm_devices);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions ockl/src/wfaas.cl
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,14 @@ OCKL_MANGLE_I32(wfany)(int e)
ATTR bool
OCKL_MANGLE_I32(wfall)(int e)
{
return __llvm_amdgcn_icmp_i32(e, 0, ICMP_NE) == __llvm_amdgcn_read_exec();
return __llvm_amdgcn_icmp_i32(e, 0, ICMP_NE) == __builtin_amdgcn_read_exec();
}


ATTR bool
OCKL_MANGLE_I32(wfsame)(int e)
{
ulong u = __llvm_amdgcn_icmp_i32(e, 0, ICMP_NE) != 0;
return (u == 0UL) | (u == __llvm_amdgcn_read_exec());
return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
}

8 changes: 4 additions & 4 deletions ockl/src/wfredscan.cl
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ GENMAX(ulong)
r = T##_##OP(r, v); \
\
t = T##_readlane(r, 32); \
v = (__llvm_amdgcn_read_exec_hi() & 1) ? t : ID; \
v = (__builtin_amdgcn_read_exec_hi() & 1) ? t : ID; \
r = T##_##OP(T##_readlane(r, 0), v)


Expand Down Expand Up @@ -301,7 +301,7 @@ GENMAX(ulong)
} \
\
T t32 = T##_readlane(r, 32); \
T v32 = (__llvm_amdgcn_read_exec_hi() & 1) ? t32 : ID; \
T v32 = (__builtin_amdgcn_read_exec_hi() & 1) ? t32 : ID; \
r = T##_##OP(T##_readlane(r, 0), v32)

// Inclusive scan with operation OP using swizzle
Expand Down Expand Up @@ -420,8 +420,8 @@ GENMAX(ulong)
IATTR static bool
fullwave(void)
{
return __llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()) == 64;
return __llvm_ctpop_i32(__builtin_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__builtin_amdgcn_read_exec_hi()) == 64;
}

#define GENRED(T,OP,ID) \
Expand Down
10 changes: 5 additions & 5 deletions opencl/src/pipes/pipes.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,10 @@ reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n)
static inline size_t
wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim)
{
size_t n = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
uint l = __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(),
__llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u));
size_t n = (size_t)(__llvm_ctpop_i32(__builtin_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__builtin_amdgcn_read_exec_hi()));
uint l = __llvm_amdgcn_mbcnt_hi(__builtin_amdgcn_read_exec_hi(),
__llvm_amdgcn_mbcnt_lo(__builtin_amdgcn_read_exec_lo(), 0u));
size_t i = 0;

if (l == 0) {
Expand All @@ -74,7 +74,7 @@ wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim)
__builtin_amdgcn_wave_barrier();

// Broadcast the result; the ctz tells us which lane has active lane id 0
uint k = (uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec());
uint k = (uint)__llvm_cttz_i64(__builtin_amdgcn_read_exec());
i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) |
(size_t)__llvm_amdgcn_readlane((uint)i, k);

Expand Down
12 changes: 6 additions & 6 deletions opencl/src/pipes/wresvnp.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
size_t
__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
{
uint alc = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
uint alc = (size_t)(__llvm_ctpop_i32(__builtin_amdgcn_read_exec_lo()) +
__llvm_ctpop_i32(__builtin_amdgcn_read_exec_hi()));
uint l = __llvm_amdgcn_mbcnt_hi(-1, __llvm_amdgcn_mbcnt_lo(-1, 0u));
size_t rid;

if (__llvm_amdgcn_read_exec() == (1UL << alc) - 1UL) {
if (__builtin_amdgcn_read_exec() == (1UL << alc) - 1UL) {
// Handle fully active subgroup
uint sum = sub_group_scan_inclusive_add((uint)n);
size_t idx = 0;
Expand All @@ -30,7 +30,7 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
const ulong nomsb = 0x7fffffffffffffffUL;

// Step 1
ulong smask = __llvm_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
ulong smask = __builtin_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
int slid = 63 - (int)clz(smask);
uint t = __llvm_amdgcn_ds_bpermute(slid << 2, n);
uint sum = n + (slid < 0 ? 0 : t);
Expand Down Expand Up @@ -123,13 +123,13 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
__builtin_amdgcn_wave_barrier();

size_t idx = 0;
if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) {
if (l == 63 - (int)clz(__builtin_amdgcn_read_exec())) {
idx = reserve(pidx, lim, (size_t)sum);
}
__builtin_amdgcn_wave_barrier();

// Broadcast
uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec());
uint k = 63u - (uint)clz(__builtin_amdgcn_read_exec());
idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
(size_t)__llvm_amdgcn_readlane((uint)idx, k);
__builtin_amdgcn_wave_barrier();
Expand Down

0 comments on commit 738fe62

Please sign in to comment.