From c9ce02b87880a96db46df6ecbf203c4b54a5fe7c Mon Sep 17 00:00:00 2001 From: Aakanksha Patil Date: Tue, 12 Oct 2021 13:26:01 -0400 Subject: [PATCH 1/6] Add gfx1036 Change-Id: I6639daef6714e7aab93ad0061d147e92e105fdc0 --- oclc/src/isa_version_1036.cl | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 oclc/src/isa_version_1036.cl diff --git a/oclc/src/isa_version_1036.cl b/oclc/src/isa_version_1036.cl new file mode 100644 index 0000000..3559b9a --- /dev/null +++ b/oclc/src/isa_version_1036.cl @@ -0,0 +1,10 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "oclc.h" + +const __constant int __oclc_ISA_version = 10306; From 1de32ef43a44cf578ea5fa351df6b6da12ba84c4 Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Wed, 23 Mar 2022 14:34:50 -0700 Subject: [PATCH 2/6] Improve broadcast Change-Id: I136afdb06ce77e545265d29b711c703706ca965f --- ockl/src/wfbc.cl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ockl/src/wfbc.cl b/ockl/src/wfbc.cl index 322df4d..d3bbe2d 100644 --- a/ockl/src/wfbc.cl +++ b/ockl/src/wfbc.cl @@ -13,7 +13,9 @@ ulong OCKL_MANGLE_U64(wfbcast)(ulong a, uint i) { uint j = __builtin_amdgcn_readfirstlane(i); - return ((ulong)__builtin_amdgcn_readlane((uint)(a >> 32), j) << 32) | - (ulong)__builtin_amdgcn_readlane((uint)a, j); + uint2 aa = __builtin_astype(a, uint2); + aa.x = __builtin_amdgcn_readlane(aa.x, j); + aa.y = __builtin_amdgcn_readlane(aa.y, j); + return __builtin_astype(aa, ulong); } From 35f4a18cf05e6ef8b312e5e5ec3043a99701abb5 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 29 Mar 2022 13:17:27 -0700 Subject: [PATCH 3/6] Add gfx940 isa version Change-Id: Ie28bb30b1187020702f86c1d5b69c6db48f0a3ac --- oclc/src/isa_version_940.cl | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 oclc/src/isa_version_940.cl diff --git a/oclc/src/isa_version_940.cl b/oclc/src/isa_version_940.cl new file mode 100644 index 0000000..e8acdaa --- /dev/null +++ b/oclc/src/isa_version_940.cl @@ -0,0 +1,10 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "oclc.h" + +const __constant int __oclc_ISA_version = 9400; From 43322a26f85d71ae4fdf8f898197022a25a6fb50 Mon Sep 17 00:00:00 2001 From: Sarbojit Sarkar Date: Tue, 29 Mar 2022 18:42:28 +0000 Subject: [PATCH 4/6] SWDEV-329086 - Fix for hipStreamWaitValue (NOR) Change-Id: Ic7bf4c4fbb80f7a949362f2c399ce5d6950bb20a --- opencl/src/misc/amdblit.cl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencl/src/misc/amdblit.cl b/opencl/src/misc/amdblit.cl index 98ab55e..bf10617 100644 --- a/opencl/src/misc/amdblit.cl +++ b/opencl/src/misc/amdblit.cl @@ -642,13 +642,13 @@ __amd_streamOpsWait( case 3: //NOR if (ptrUint) { while (((atomic_load_explicit(ptrUint, memory_order_relaxed, - memory_scope_all_svm_devices) & (uint)mask) | (uint)value) == ~0U) { + memory_scope_all_svm_devices) | (uint)value) & (uint)mask) == (uint)mask) { __builtin_amdgcn_s_sleep(1); } } else { while (((atomic_load_explicit(ptrUlong, memory_order_relaxed, - memory_scope_all_svm_devices) & mask) | value) == ~0UL) { + memory_scope_all_svm_devices) | value) & mask) == mask) { __builtin_amdgcn_s_sleep(1); } } From 2f7150ac3650e9b49cea6ea803374cacbbd38fba Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Tue, 5 Apr 2022 14:28:17 -0700 Subject: [PATCH 5/6] Eliminate one operation Change-Id: I6d8b06787938498425b8790bfa3237295f6b799a --- ocml/src/tanhF.cl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocml/src/tanhF.cl b/ocml/src/tanhF.cl index ec3d996..e49e6d3 100644 --- a/ocml/src/tanhF.cl +++ b/ocml/src/tanhF.cl @@ -37,7 +37,7 @@ MATH_MANGLE(tanh)(float x) z = MATH_MAD(y2, y*p, y); } else { float t = MATH_MANGLE(exp)(2.0f * y); - z = 1.0f - MATH_FAST_DIV(2.0f, t + 1.0f); + z = MATH_MAD(-2.0f, MATH_FAST_RCP(t + 1.0f), 1.0f); } #endif From 4b1191ad456a5e436081245e52a2229a960ae74e Mon Sep 17 00:00:00 2001 From: Brian Sumner Date: Thu, 7 Apr 2022 12:26:11 -0700 Subject: [PATCH 6/6] Device libs updates for code object version 5 Change-Id: I249db245297190b4cea17f336b90e9d2ff4b1001 --- ockl/src/cg.cl | 7 +- ockl/src/dm.cl | 28 ++- ockl/src/hostcall.cl | 14 +- ockl/src/workitem.cl | 471 ++++++++++++++++++++++++++--------- opencl/src/devenq/devenq.h | 36 ++- opencl/src/devenq/enqueue.cl | 130 ++++++++-- opencl/src/misc/printf.cl | 9 +- 7 files changed, 528 insertions(+), 167 deletions(-) diff --git a/ockl/src/cg.cl b/ockl/src/cg.cl index 1afc236..8ff1df4 100644 --- a/ockl/src/cg.cl +++ b/ockl/src/cg.cl @@ -6,6 +6,7 @@ *===------------------------------------------------------------------------*/ #include "irif.h" +#include "oclc.h" #include "ockl.h" #define AL(P) __opencl_atomic_load((__global atomic_uint *)P, memory_order_relaxed, memory_scope_all_svm_devices) @@ -28,7 +29,11 @@ struct mg_info { static inline size_t get_mg_info_arg(void) { - return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[6]; + if (__oclc_ABI_version < 500) { + return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[6]; + } else { + return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[11]; + } } static inline bool diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl index 5b9ed25..3a6c035 100644 --- a/ockl/src/dm.cl +++ b/ockl/src/dm.cl @@ -170,11 +170,6 @@ typedef struct heap_s { #endif } heap_t; -// TODO: get the heap pointer from the language runtime -static __global heap_t heap; -#define HEAP_POINTER &heap - - // Inhibit control flow optimizations #define O0(X) X = o0(X) __attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; } @@ -190,8 +185,19 @@ __attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volati #define AFO(P, V, O) __opencl_atomic_fetch_or (P, V, O, memory_scope_device) #define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) +// get the heap pointer +static __global heap_t * +get_heap_ptr(void) { + if (__oclc_ABI_version < 500) { + static __global heap_t heap; + return &heap; + } else { + return (__global heap_t *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[12]; + } +} + // realtime -__attribute__((target("s-memrealtime")))static ulong +__attribute__((target("s-memrealtime"))) static ulong realtime(void) { return __builtin_amdgcn_s_memrealtime(); @@ -382,7 +388,7 @@ non_slab_free(ulong addr) uint nactive = active_lane_count(); if (aid == 0) { - __global heap_t *hp = HEAP_POINTER; + __global heap_t *hp = get_heap_ptr(); AFS(&hp->num_nonslab_allocations, nactive, memory_order_relaxed); } #endif @@ -404,7 +410,7 @@ __ockl_dm_dealloc(ulong addr) kind_t my_k = sptr->k; sid_t my_i = sptr->i; - __global heap_t *hp = HEAP_POINTER; + __global heap_t *hp = get_heap_ptr(); int go = 1; do { o0(go); @@ -446,7 +452,7 @@ non_slab_malloc(size_t sz) uint nactive = active_lane_count(); if (aid == 0) { - __global heap_t *hp = HEAP_POINTER; + __global heap_t *hp = get_heap_ptr(); AFA(&hp->num_nonslab_allocations, nactive, memory_order_relaxed); } } @@ -898,7 +904,7 @@ slab_malloc(int sz) { kind_t my_k = size_to_kind(sz); __global void *ret = (__global void *)0; - __global heap_t *hp = HEAP_POINTER; + __global heap_t *hp = get_heap_ptr(); int k_go = 1; do { @@ -949,7 +955,7 @@ __ockl_dm_alloc(ulong sz) ulong __ockl_dm_nna(void) { - __global heap_t *hp = HEAP_POINTER; + __global heap_t *hp = get_heap_ptr(); return AL(&hp->num_nonslab_allocations, memory_order_relaxed); } #endif diff --git a/ockl/src/hostcall.cl b/ockl/src/hostcall.cl index 2d7e3c9..4084451 100644 --- a/ockl/src/hostcall.cl +++ b/ockl/src/hostcall.cl @@ -5,6 +5,8 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ +#include "oclc.h" + /** \brief Internal implementation of hostcall. * * *** INTERNAL USE ONLY *** @@ -42,12 +44,12 @@ __ockl_hostcall_preview(uint service_id, ulong arg0, ulong arg1, ulong arg2, ulong arg3, ulong arg4, ulong arg5, ulong arg6, ulong arg7) { - // Retrieve the buffer pointer passed as an implicit kernel - // argument. This is at offset 3, which is the same as the OpenCL - // printf buffer. - __constant size_t *argptr = - (__constant size_t *)__builtin_amdgcn_implicitarg_ptr(); - void *buffer = (void *)argptr[3]; + void *buffer; + if (__oclc_ABI_version < 500) { + buffer = (__global void *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]; + } else { + buffer = (__global void *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[10]; + } return __ockl_hostcall_internal(buffer, service_id, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7); diff --git a/ockl/src/workitem.cl b/ockl/src/workitem.cl index 5d1d58a..2f92f3f 100644 --- a/ockl/src/workitem.cl +++ b/ockl/src/workitem.cl @@ -6,21 +6,344 @@ *===------------------------------------------------------------------------*/ #include "irif.h" +#include "oclc.h" #include "device_amd_hsa.h" #define ATTR __attribute__((const)) +#define OLD_ABI __oclc_ABI_version < 500 + +#define IMPLICITARG(T) ((__constant T *)__builtin_amdgcn_implicitarg_ptr()) + +ATTR static size_t +get_global_offset_x(void) +{ + if (OLD_ABI) { + return IMPLICITARG(ulong)[0]; + } else { + return IMPLICITARG(ulong)[5]; + } +} + +ATTR static size_t +get_global_offset_y(void) +{ + if (OLD_ABI) { + return IMPLICITARG(ulong)[1]; + } else { + return IMPLICITARG(ulong)[6]; + } +} + +ATTR static size_t +get_global_offset_z(void) +{ + if (OLD_ABI) { + return IMPLICITARG(ulong)[2]; + } else { + return IMPLICITARG(ulong)[7]; + } +} + +ATTR static size_t +get_global_size_x(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + return p->grid_size_x; + } else { + return IMPLICITARG(uint)[0]*IMPLICITARG(ushort)[6] + IMPLICITARG(ushort)[9]; + return 0; + } +} + +ATTR static size_t +get_global_size_y(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + return p->grid_size_y; + } else { + return IMPLICITARG(uint)[1]*IMPLICITARG(ushort)[7] + IMPLICITARG(ushort)[10]; + } +} + +ATTR static size_t +get_global_size_z(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + return p->grid_size_z; + } else { + return IMPLICITARG(uint)[2]*IMPLICITARG(ushort)[8] + IMPLICITARG(ushort)[11]; + return 0; + } +} + +ATTR static size_t +get_global_id_x(void) +{ + uint l = __builtin_amdgcn_workitem_id_x(); + uint g = __builtin_amdgcn_workgroup_id_x(); + uint s; + if (OLD_ABI) { + s = __builtin_amdgcn_workgroup_size_x(); + } else { + s = IMPLICITARG(ushort)[6]; + } + return (g*s + l) + get_global_offset_x(); +} + +ATTR static size_t +get_global_id_y(void) +{ + uint l = __builtin_amdgcn_workitem_id_y(); + uint g = __builtin_amdgcn_workgroup_id_y(); + uint s; + if (OLD_ABI) { + s = __builtin_amdgcn_workgroup_size_y(); + } else { + s = IMPLICITARG(ushort)[7]; + } + return (g*s + l) + get_global_offset_y(); +} + +ATTR static size_t +get_global_id_z(void) +{ + uint l = __builtin_amdgcn_workitem_id_z(); + uint g = __builtin_amdgcn_workgroup_id_z(); + uint s; + if (OLD_ABI) { + s = __builtin_amdgcn_workgroup_size_z(); + } else { + s = IMPLICITARG(ushort)[8]; + } + return (g*s + l) + get_global_offset_z(); +} + +ATTR static size_t +get_local_size_x(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint group_id = __builtin_amdgcn_workgroup_id_x(); + uint group_size = __builtin_amdgcn_workgroup_size_x(); + uint grid_size = p->grid_size_x; + uint r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; + } else { + return __builtin_amdgcn_workgroup_id_x() < IMPLICITARG(uint)[0] ? IMPLICITARG(ushort)[6] : IMPLICITARG(ushort)[9]; + } +} + +ATTR static size_t +get_local_size_y(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint group_id = __builtin_amdgcn_workgroup_id_y(); + uint group_size = __builtin_amdgcn_workgroup_size_y(); + uint grid_size = p->grid_size_y; + uint r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; + } else { + return __builtin_amdgcn_workgroup_id_y() < IMPLICITARG(uint)[1] ? IMPLICITARG(ushort)[7] : IMPLICITARG(ushort)[10]; + } +} + +ATTR static size_t +get_local_size_z(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint group_id = __builtin_amdgcn_workgroup_id_z(); + uint group_size = __builtin_amdgcn_workgroup_size_z(); + uint grid_size = p->grid_size_z; + uint r = grid_size - group_id * group_size; + return (r < group_size) ? r : group_size; + } else { + return __builtin_amdgcn_workgroup_id_z() < IMPLICITARG(uint)[2] ? IMPLICITARG(ushort)[8] : IMPLICITARG(ushort)[11]; + } +} + +ATTR static size_t +get_enqueued_local_size_x(void) +{ + if (OLD_ABI) { + return __builtin_amdgcn_workgroup_size_x(); + } else { + return IMPLICITARG(ushort)[6]; + } +} + +ATTR static size_t +get_enqueued_local_size_y(void) +{ + if (OLD_ABI) { + return __builtin_amdgcn_workgroup_size_y(); + } else { + return IMPLICITARG(ushort)[7]; + } +} + +ATTR static size_t +get_enqueued_local_size_z(void) +{ + if (OLD_ABI) { + return __builtin_amdgcn_workgroup_size_z(); + } else { + return IMPLICITARG(ushort)[8]; + } +} + +ATTR static size_t +get_num_groups_x(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint n = p->grid_size_x; + uint d = __builtin_amdgcn_workgroup_size_x(); + uint q = n / d; + return q + (n > q*d); + } else { + return IMPLICITARG(uint)[0] + (IMPLICITARG(ushort)[9] > 0); + } +} + +ATTR static size_t +get_num_groups_y(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint n = p->grid_size_y; + uint d = __builtin_amdgcn_workgroup_size_y(); + uint q = n / d; + return q + (n > q*d); + } else { + return IMPLICITARG(uint)[1] + (IMPLICITARG(ushort)[10] > 0); + } +} + +ATTR static size_t +get_num_groups_z(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + uint n = p->grid_size_z; + uint d = __builtin_amdgcn_workgroup_size_z(); + uint q = n / d; + return q + (n > q*d); + } else { + return IMPLICITARG(uint)[2] + (IMPLICITARG(ushort)[11] > 0); + } +} + +ATTR static uint +get_work_dim_(void) +{ + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + return p->setup; + } else { + return IMPLICITARG(ushort)[32]; + } +} + +ATTR static size_t +get_global_linear_id_x(void) +{ + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); + uint s0; + if (OLD_ABI) { + s0 = __builtin_amdgcn_workgroup_size_x(); + } else { + s0 = IMPLICITARG(ushort)[6]; + } + return g0*s0 + l0; +} + +ATTR static size_t +get_global_linear_id_y(void) +{ + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint l1 = __builtin_amdgcn_workitem_id_y(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); + uint g1 = __builtin_amdgcn_workgroup_id_y(); + uint s0, s1; + uint n0; + + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + s0 = __builtin_amdgcn_workgroup_size_x(); + s1 = __builtin_amdgcn_workgroup_size_y(); + n0 = p->grid_size_x; + } else { + s0 = IMPLICITARG(ushort)[6]; + s1 = IMPLICITARG(ushort)[7]; + n0 = IMPLICITARG(uint)[0]*s0 + IMPLICITARG(ushort)[9]; + } + uint i0 = g0*s0 + l0; + uint i1 = g1*s1 + l1; + return (size_t)i1 * (size_t)n0 + i0; +} + +ATTR static size_t +get_global_linear_id_z(void) +{ + uint l0 = __builtin_amdgcn_workitem_id_x(); + uint l1 = __builtin_amdgcn_workitem_id_y(); + uint l2 = __builtin_amdgcn_workitem_id_z(); + uint g0 = __builtin_amdgcn_workgroup_id_x(); + uint g1 = __builtin_amdgcn_workgroup_id_y(); + uint g2 = __builtin_amdgcn_workgroup_id_z(); + uint s0, s1, s2; + uint n0, n1; + + if (OLD_ABI) { + __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); + s0 = __builtin_amdgcn_workgroup_size_x(); + s1 = __builtin_amdgcn_workgroup_size_y(); + s2 = __builtin_amdgcn_workgroup_size_z(); + n0 = p->grid_size_x; + n1 = p->grid_size_y; + } else { + s0 = IMPLICITARG(ushort)[6]; + s1 = IMPLICITARG(ushort)[7]; + s2 = IMPLICITARG(ushort)[8]; + n0 = IMPLICITARG(uint)[0]*s0 + IMPLICITARG(ushort)[9]; + n1 = IMPLICITARG(uint)[1]*s1 + IMPLICITARG(ushort)[10]; + } + uint i0 = g0*s0 + l0; + uint i1 = g1*s1 + l1; + uint i2 = g2*s2 + l2; + return ((size_t)i2 * (size_t)n1 + (size_t)i1) * (size_t)n0 + i0; +} + +ATTR static size_t +get_local_linear_id_(void) +{ + if (OLD_ABI) { + return (__builtin_amdgcn_workitem_id_z() * __builtin_amdgcn_workgroup_size_y() + + __builtin_amdgcn_workitem_id_y()) * __builtin_amdgcn_workgroup_size_x() + + __builtin_amdgcn_workitem_id_x(); + } else { + return (__builtin_amdgcn_workitem_id_z() * IMPLICITARG(ushort)[7] + + __builtin_amdgcn_workitem_id_y()) * IMPLICITARG(ushort)[6] + + __builtin_amdgcn_workitem_id_x(); + } +} ATTR size_t __ockl_get_global_offset(uint dim) { - // TODO find out if implicit arg pointer is aligned properly switch(dim) { case 0: - return *(__constant size_t *)__builtin_amdgcn_implicitarg_ptr(); + return get_global_offset_x(); case 1: - return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[1]; + return get_global_offset_y(); case 2: - return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[2]; + return get_global_offset_z(); default: return 0; } @@ -29,32 +352,16 @@ __ockl_get_global_offset(uint dim) ATTR size_t __ockl_get_global_id(uint dim) { - uint l, g, s; - switch(dim) { case 0: - l = __builtin_amdgcn_workitem_id_x(); - g = __builtin_amdgcn_workgroup_id_x(); - s = __builtin_amdgcn_workgroup_size_x(); - break; + return get_global_id_x(); case 1: - l = __builtin_amdgcn_workitem_id_y(); - g = __builtin_amdgcn_workgroup_id_y(); - s = __builtin_amdgcn_workgroup_size_y(); - break; + return get_global_id_y(); case 2: - l = __builtin_amdgcn_workitem_id_z(); - g = __builtin_amdgcn_workgroup_id_z(); - s = __builtin_amdgcn_workgroup_size_z(); - break; + return get_global_id_z(); default: - l = 0; - g = 0; - s = 1; - break; + return 0; } - - return (g*s + l) + __ockl_get_global_offset(dim); } ATTR size_t @@ -90,15 +397,13 @@ __ockl_get_group_id(uint dim) ATTR size_t __ockl_get_global_size(uint dim) { - __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); - switch(dim) { case 0: - return p->grid_size_x; + return get_global_size_x(); case 1: - return p->grid_size_y; + return get_global_size_y(); case 2: - return p->grid_size_z; + return get_global_size_z(); default: return 1; } @@ -107,70 +412,37 @@ __ockl_get_global_size(uint dim) ATTR size_t __ockl_get_local_size(uint dim) { - __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); - uint group_id, grid_size, group_size; - switch(dim) { case 0: - group_id = __builtin_amdgcn_workgroup_id_x(); - group_size = __builtin_amdgcn_workgroup_size_x(); - grid_size = p->grid_size_x; - break; + return get_local_size_x(); case 1: - group_id = __builtin_amdgcn_workgroup_id_y(); - group_size = __builtin_amdgcn_workgroup_size_y(); - grid_size = p->grid_size_y; - break; + return get_local_size_y(); case 2: - group_id = __builtin_amdgcn_workgroup_id_z(); - group_size = __builtin_amdgcn_workgroup_size_z(); - grid_size = p->grid_size_z; - break; + return get_local_size_z(); default: - group_id = 0; - grid_size = 0; - group_size = 1; - break; + return 1; } - uint r = grid_size - group_id * group_size; - return (r < group_size) ? r : group_size; } ATTR size_t __ockl_get_num_groups(uint dim) { - __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); - - uint n, d; switch(dim) { case 0: - n = p->grid_size_x; - d = __builtin_amdgcn_workgroup_size_x(); - break; + return get_num_groups_x(); case 1: - n = p->grid_size_y; - d = __builtin_amdgcn_workgroup_size_y(); - break; + return get_num_groups_y(); case 2: - n = p->grid_size_z; - d = __builtin_amdgcn_workgroup_size_z(); - break; + return get_num_groups_z(); default: - n = 1; - d = 1; - break; + return 1; } - - uint q = n / d; - - return q + (n > q*d); } ATTR uint -__ockl_get_work_dim(void) { - __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); - // XXX revist this if setup field ever changes - return p->setup; +__ockl_get_work_dim(void) +{ + return get_work_dim_(); } ATTR size_t @@ -178,11 +450,11 @@ __ockl_get_enqueued_local_size(uint dim) { switch(dim) { case 0: - return __builtin_amdgcn_workgroup_size_x(); + return get_enqueued_local_size_x(); case 1: - return __builtin_amdgcn_workgroup_size_y(); + return get_enqueued_local_size_y(); case 2: - return __builtin_amdgcn_workgroup_size_z(); + return get_enqueued_local_size_z(); default: return 1; } @@ -191,48 +463,13 @@ __ockl_get_enqueued_local_size(uint dim) ATTR size_t __ockl_get_global_linear_id(void) { - __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr(); - - // XXX revisit this if setup field ever changes - switch (p->setup) { + switch (get_work_dim_()) { case 1: - { - uint l0 = __builtin_amdgcn_workitem_id_x(); - uint g0 = __builtin_amdgcn_workgroup_id_x(); - uint s0 = __builtin_amdgcn_workgroup_size_x(); - return g0*s0 + l0; - } + return get_global_linear_id_x(); case 2: - { - uint l0 = __builtin_amdgcn_workitem_id_x(); - uint l1 = __builtin_amdgcn_workitem_id_y(); - uint g0 = __builtin_amdgcn_workgroup_id_x(); - uint g1 = __builtin_amdgcn_workgroup_id_y(); - uint s0 = __builtin_amdgcn_workgroup_size_x(); - uint s1 = __builtin_amdgcn_workgroup_size_y(); - uint n0 = p->grid_size_x; - uint i0 = g0*s0 + l0; - uint i1 = g1*s1 + l1; - return (size_t)i1 * (size_t)n0 + i0; - } + return get_global_linear_id_y(); case 3: - { - uint l0 = __builtin_amdgcn_workitem_id_x(); - uint l1 = __builtin_amdgcn_workitem_id_y(); - uint l2 = __builtin_amdgcn_workitem_id_z(); - uint g0 = __builtin_amdgcn_workgroup_id_x(); - uint g1 = __builtin_amdgcn_workgroup_id_y(); - uint g2 = __builtin_amdgcn_workgroup_id_z(); - uint s0 = __builtin_amdgcn_workgroup_size_x(); - uint s1 = __builtin_amdgcn_workgroup_size_y(); - uint s2 = __builtin_amdgcn_workgroup_size_z(); - uint n0 = p->grid_size_x; - uint n1 = p->grid_size_y; - uint i0 = g0*s0 + l0; - uint i1 = g1*s1 + l1; - uint i2 = g2*s2 + l2; - return ((size_t)i2 * (size_t)n1 + (size_t)i1) * (size_t)n0 + i0; - } + return get_global_linear_id_z(); default: return 0; } @@ -241,8 +478,6 @@ __ockl_get_global_linear_id(void) ATTR size_t __ockl_get_local_linear_id(void) { - return (__builtin_amdgcn_workitem_id_z() * __builtin_amdgcn_workgroup_size_y() + - __builtin_amdgcn_workitem_id_y()) * __builtin_amdgcn_workgroup_size_x() + - __builtin_amdgcn_workitem_id_x(); + return get_local_linear_id_(); } diff --git a/opencl/src/devenq/devenq.h b/opencl/src/devenq/devenq.h index 7dbe0e8..4299a77 100644 --- a/opencl/src/devenq/devenq.h +++ b/opencl/src/devenq/devenq.h @@ -1,4 +1,5 @@ +#include "oclc.h" #include "device_amd_hsa.h" #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable @@ -85,26 +86,49 @@ typedef struct _AmdEvent { // XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD #define CL_DEVICE_MAX_WORK_GROUP_SIZE 256 -// ABI has 6 implicit trailing arguments: -// global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer -#define NUM_IMPLICIT_ARGS 6 +// ABI has implicit trailing arguments +#define NUM_IMPLICIT_ARGS (__oclc_ABI_version < 500 ? 7 : 32) static inline __global void * get_printf_ptr(void) { - return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]); + if (__oclc_ABI_version < 500) { + return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]); + } else { + return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[9]); + } } static inline __global AmdVQueueHeader * get_vqueue(void) { - return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[4]); + if (__oclc_ABI_version < 500) { + return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[4]); + } else { + return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[13]); + } } static inline __global AmdAqlWrap * get_aql_wrap(void) { - return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[5]); + if (__oclc_ABI_version < 500) { + return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[5]); + } else { + return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[14]); + } +} + +static inline size_t +get_bases(void) +{ + return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[24]; +} + +static inline size_t +get_hsa_queue(void) +{ + return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[25]; } // reserve a slot in a bitmask controlled resource diff --git a/opencl/src/devenq/enqueue.cl b/opencl/src/devenq/enqueue.cl index 8527189..a5098fb 100644 --- a/opencl/src/devenq/enqueue.cl +++ b/opencl/src/devenq/enqueue.cl @@ -190,12 +190,33 @@ __enqueue_kernel_basic(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, v // Set up kernarg copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t))); - implicit[0] = r.globalWorkOffset[0]; - implicit[1] = r.globalWorkOffset[1]; - implicit[2] = r.globalWorkOffset[2]; - implicit[3] = (size_t)get_printf_ptr(); - implicit[4] = (size_t)get_vqueue(); - implicit[5] = (size_t)aw; + if (__oclc_ABI_version < 500) { + implicit[0] = r.globalWorkOffset[0]; + implicit[1] = r.globalWorkOffset[1]; + implicit[2] = r.globalWorkOffset[2]; + implicit[3] = (size_t)get_printf_ptr(); + implicit[4] = (size_t)get_vqueue(); + implicit[5] = (size_t)aw; + } else { + implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) | + ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32); + implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) | + ((size_t)(ushort)r.localWorkSize[0] << 32) | + ((size_t)(ushort)r.localWorkSize[1] << 48); + implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) | + ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) | + ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) | + ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48); + implicit[5] = r.globalWorkOffset[0]; + implicit[6] = r.globalWorkOffset[1]; + implicit[7] = r.globalWorkOffset[2]; + implicit[8] = (size_t)(ushort)r.workDimension; + implicit[9] = (size_t)get_printf_ptr(); + implicit[13] = (size_t)get_vqueue(); + implicit[14] = (size_t)aw; + implicit[24] = get_bases(); + implicit[25] = get_hsa_queue(); + } const __global struct rtinfo *rti = (const __global struct rtinfo *)block; @@ -264,12 +285,33 @@ __enqueue_kernel_basic_events(queue_t q, kernel_enqueue_flags_t f, const ndrange // Set up kernarg copy_captured_context(aw->aql.kernarg_address, capture, csize, calign); __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t))); - implicit[0] = r.globalWorkOffset[0]; - implicit[1] = r.globalWorkOffset[1]; - implicit[2] = r.globalWorkOffset[2]; - implicit[3] = (size_t)get_printf_ptr(); - implicit[4] = (size_t)get_vqueue(); - implicit[5] = (size_t)aw; + if (__oclc_ABI_version < 500) { + implicit[0] = r.globalWorkOffset[0]; + implicit[1] = r.globalWorkOffset[1]; + implicit[2] = r.globalWorkOffset[2]; + implicit[3] = (size_t)get_printf_ptr(); + implicit[4] = (size_t)get_vqueue(); + implicit[5] = (size_t)aw; + } else { + implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) | + ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32); + implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) | + ((size_t)(ushort)r.localWorkSize[0] << 32) | + ((size_t)(ushort)r.localWorkSize[1] << 48); + implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) | + ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) | + ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) | + ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48); + implicit[5] = r.globalWorkOffset[0]; + implicit[6] = r.globalWorkOffset[1]; + implicit[7] = r.globalWorkOffset[2]; + implicit[8] = (size_t)(ushort)r.workDimension; + implicit[9] = (size_t)get_printf_ptr(); + implicit[13] = (size_t)get_vqueue(); + implicit[14] = (size_t)aw; + implicit[24] = get_bases(); + implicit[25] = get_hsa_queue(); + } const __global struct rtinfo *rti = (const __global struct rtinfo *)block; @@ -336,12 +378,33 @@ __enqueue_kernel_varargs(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t))); - implicit[0] = r.globalWorkOffset[0]; - implicit[1] = r.globalWorkOffset[1]; - implicit[2] = r.globalWorkOffset[2]; - implicit[3] = (size_t)get_printf_ptr(); - implicit[4] = (size_t)get_vqueue(); - implicit[5] = (size_t)aw; + if (__oclc_ABI_version < 500) { + implicit[0] = r.globalWorkOffset[0]; + implicit[1] = r.globalWorkOffset[1]; + implicit[2] = r.globalWorkOffset[2]; + implicit[3] = (size_t)get_printf_ptr(); + implicit[4] = (size_t)get_vqueue(); + implicit[5] = (size_t)aw; + } else { + implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) | + ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32); + implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) | + ((size_t)(ushort)r.localWorkSize[0] << 32) | + ((size_t)(ushort)r.localWorkSize[1] << 48); + implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) | + ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) | + ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) | + ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48); + implicit[5] = r.globalWorkOffset[0]; + implicit[6] = r.globalWorkOffset[1]; + implicit[7] = r.globalWorkOffset[2]; + implicit[8] = (size_t)(ushort)r.workDimension; + implicit[9] = (size_t)get_printf_ptr(); + implicit[13] = (size_t)get_vqueue(); + implicit[14] = (size_t)aw; + implicit[24] = get_bases(); + implicit[25] = get_hsa_queue(); + } __global AmdAqlWrap *me = get_aql_wrap(); @@ -424,12 +487,31 @@ __enqueue_kernel_events_varargs(queue_t q, kernel_enqueue_flags_t f, const ndran __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t))); - implicit[0] = r.globalWorkOffset[0]; - implicit[1] = r.globalWorkOffset[1]; - implicit[2] = r.globalWorkOffset[2]; - implicit[3] = (size_t)get_printf_ptr(); - implicit[4] = (size_t)get_vqueue(); - implicit[5] = (size_t)aw; + if (__oclc_ABI_version < 500) { + implicit[0] = r.globalWorkOffset[0]; + implicit[1] = r.globalWorkOffset[1]; + implicit[2] = r.globalWorkOffset[2]; + implicit[3] = (size_t)get_printf_ptr(); + implicit[4] = (size_t)get_vqueue(); + implicit[5] = (size_t)aw; + } else { + implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) | + ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32); + implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) | + ((size_t)(ushort)r.localWorkSize[0] << 32) | + ((size_t)(ushort)r.localWorkSize[1] << 48); + implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) | + ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) | + ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) | + ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48); + implicit[5] = r.globalWorkOffset[0]; + implicit[6] = r.globalWorkOffset[1]; + implicit[7] = r.globalWorkOffset[2]; + implicit[8] = (size_t)(ushort)r.workDimension; + implicit[9] = (size_t)get_printf_ptr(); + implicit[13] = (size_t)get_vqueue(); + implicit[14] = (size_t)aw; + } __global AmdAqlWrap *me = get_aql_wrap(); diff --git a/opencl/src/misc/printf.cl b/opencl/src/misc/printf.cl index a9aade5..0686906 100644 --- a/opencl/src/misc/printf.cl +++ b/opencl/src/misc/printf.cl @@ -5,6 +5,8 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ +#include "oclc.h" + #ifndef NULL #define NULL 0 #endif @@ -15,7 +17,12 @@ __global char * __printf_alloc(uint bytes) { - __global char *ptr = (__global char *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]); + __global char *ptr; + if (__oclc_ABI_version < 500) { + ptr = (__global char *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]; + } else { + ptr = (__global char *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[9]; + } uint size = ((__global uint *)ptr)[1]; uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);