From c9ce02b87880a96db46df6ecbf203c4b54a5fe7c Mon Sep 17 00:00:00 2001
From: Aakanksha Patil <aakanksha555@gmail.com>
Date: Tue, 12 Oct 2021 13:26:01 -0400
Subject: [PATCH 1/6] Add gfx1036

Change-Id: I6639daef6714e7aab93ad0061d147e92e105fdc0
---
 oclc/src/isa_version_1036.cl | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 oclc/src/isa_version_1036.cl

diff --git a/oclc/src/isa_version_1036.cl b/oclc/src/isa_version_1036.cl
new file mode 100644
index 0000000..3559b9a
--- /dev/null
+++ b/oclc/src/isa_version_1036.cl
@@ -0,0 +1,10 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "oclc.h"
+
+const __constant int __oclc_ISA_version = 10306;

From 1de32ef43a44cf578ea5fa351df6b6da12ba84c4 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Wed, 23 Mar 2022 14:34:50 -0700
Subject: [PATCH 2/6] Improve broadcast

Change-Id: I136afdb06ce77e545265d29b711c703706ca965f
---
 ockl/src/wfbc.cl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ockl/src/wfbc.cl b/ockl/src/wfbc.cl
index 322df4d..d3bbe2d 100644
--- a/ockl/src/wfbc.cl
+++ b/ockl/src/wfbc.cl
@@ -13,7 +13,9 @@ ulong
 OCKL_MANGLE_U64(wfbcast)(ulong a, uint i)
 {
     uint j = __builtin_amdgcn_readfirstlane(i);
-    return ((ulong)__builtin_amdgcn_readlane((uint)(a >> 32), j) << 32) |
-            (ulong)__builtin_amdgcn_readlane((uint)a, j);
+    uint2 aa = __builtin_astype(a, uint2);
+    aa.x = __builtin_amdgcn_readlane(aa.x, j);
+    aa.y = __builtin_amdgcn_readlane(aa.y, j);
+    return __builtin_astype(aa, ulong);
 }
 

From 35f4a18cf05e6ef8b312e5e5ec3043a99701abb5 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Tue, 29 Mar 2022 13:17:27 -0700
Subject: [PATCH 3/6] Add gfx940 isa version

Change-Id: Ie28bb30b1187020702f86c1d5b69c6db48f0a3ac
---
 oclc/src/isa_version_940.cl | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 oclc/src/isa_version_940.cl

diff --git a/oclc/src/isa_version_940.cl b/oclc/src/isa_version_940.cl
new file mode 100644
index 0000000..e8acdaa
--- /dev/null
+++ b/oclc/src/isa_version_940.cl
@@ -0,0 +1,10 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "oclc.h"
+
+const __constant int __oclc_ISA_version = 9400;

From 43322a26f85d71ae4fdf8f898197022a25a6fb50 Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Tue, 29 Mar 2022 18:42:28 +0000
Subject: [PATCH 4/6] SWDEV-329086 - Fix for hipStreamWaitValue (NOR)

Change-Id: Ic7bf4c4fbb80f7a949362f2c399ce5d6950bb20a
---
 opencl/src/misc/amdblit.cl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/opencl/src/misc/amdblit.cl b/opencl/src/misc/amdblit.cl
index 98ab55e..bf10617 100644
--- a/opencl/src/misc/amdblit.cl
+++ b/opencl/src/misc/amdblit.cl
@@ -642,13 +642,13 @@ __amd_streamOpsWait(
     case 3: //NOR
       if (ptrUint) {
         while (((atomic_load_explicit(ptrUint, memory_order_relaxed,
-                     memory_scope_all_svm_devices) & (uint)mask) | (uint)value) == ~0U) {
+                 memory_scope_all_svm_devices) | (uint)value) & (uint)mask) == (uint)mask) {
           __builtin_amdgcn_s_sleep(1);
         }
       }
       else {
         while (((atomic_load_explicit(ptrUlong, memory_order_relaxed,
-                     memory_scope_all_svm_devices) & mask) | value) == ~0UL) {
+                     memory_scope_all_svm_devices) | value) & mask) == mask) {
           __builtin_amdgcn_s_sleep(1);
         }
       }

From 2f7150ac3650e9b49cea6ea803374cacbbd38fba Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Tue, 5 Apr 2022 14:28:17 -0700
Subject: [PATCH 5/6] Eliminate one operation

Change-Id: I6d8b06787938498425b8790bfa3237295f6b799a
---
 ocml/src/tanhF.cl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocml/src/tanhF.cl b/ocml/src/tanhF.cl
index ec3d996..e49e6d3 100644
--- a/ocml/src/tanhF.cl
+++ b/ocml/src/tanhF.cl
@@ -37,7 +37,7 @@ MATH_MANGLE(tanh)(float x)
         z = MATH_MAD(y2, y*p, y);
     } else {
         float t = MATH_MANGLE(exp)(2.0f * y);
-        z = 1.0f - MATH_FAST_DIV(2.0f, t + 1.0f);
+        z = MATH_MAD(-2.0f, MATH_FAST_RCP(t + 1.0f), 1.0f);
     }
 #endif
 

From 4b1191ad456a5e436081245e52a2229a960ae74e Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Thu, 7 Apr 2022 12:26:11 -0700
Subject: [PATCH 6/6] Device libs updates for code object version 5

Change-Id: I249db245297190b4cea17f336b90e9d2ff4b1001
---
 ockl/src/cg.cl               |   7 +-
 ockl/src/dm.cl               |  28 ++-
 ockl/src/hostcall.cl         |  14 +-
 ockl/src/workitem.cl         | 471 ++++++++++++++++++++++++++---------
 opencl/src/devenq/devenq.h   |  36 ++-
 opencl/src/devenq/enqueue.cl | 130 ++++++++--
 opencl/src/misc/printf.cl    |   9 +-
 7 files changed, 528 insertions(+), 167 deletions(-)

diff --git a/ockl/src/cg.cl b/ockl/src/cg.cl
index 1afc236..8ff1df4 100644
--- a/ockl/src/cg.cl
+++ b/ockl/src/cg.cl
@@ -6,6 +6,7 @@
  *===------------------------------------------------------------------------*/
 
 #include "irif.h"
+#include "oclc.h"
 #include "ockl.h"
 
 #define AL(P) __opencl_atomic_load((__global atomic_uint *)P, memory_order_relaxed, memory_scope_all_svm_devices)
@@ -28,7 +29,11 @@ struct mg_info {
 static inline size_t
 get_mg_info_arg(void)
 {
-    return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[6];
+    if (__oclc_ABI_version < 500) {
+        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[6];
+    } else {
+        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[11];
+    }
 }
 
 static inline bool
diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl
index 5b9ed25..3a6c035 100644
--- a/ockl/src/dm.cl
+++ b/ockl/src/dm.cl
@@ -170,11 +170,6 @@ typedef struct heap_s {
 #endif
 } heap_t;
 
-// TODO: get the heap pointer from the language runtime
-static __global heap_t heap;
-#define HEAP_POINTER &heap
-
-
 // Inhibit control flow optimizations
 #define O0(X) X = o0(X)
 __attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
@@ -190,8 +185,19 @@ __attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volati
 #define AFO(P, V, O) __opencl_atomic_fetch_or (P, V, O, memory_scope_device)
 #define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device)
 
+// get the heap pointer
+static __global heap_t *
+get_heap_ptr(void) {
+    if (__oclc_ABI_version < 500) {
+        static __global heap_t heap;
+        return &heap;
+    } else {
+        return (__global heap_t *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[12];
+    }
+}
+
 // realtime
-__attribute__((target("s-memrealtime")))static ulong
+__attribute__((target("s-memrealtime"))) static ulong
 realtime(void)
 {
     return __builtin_amdgcn_s_memrealtime();
@@ -382,7 +388,7 @@ non_slab_free(ulong addr)
     uint nactive = active_lane_count();
 
     if (aid == 0) {
-        __global heap_t *hp = HEAP_POINTER;
+        __global heap_t *hp = get_heap_ptr();
         AFS(&hp->num_nonslab_allocations, nactive, memory_order_relaxed);
     }
 #endif
@@ -404,7 +410,7 @@ __ockl_dm_dealloc(ulong addr)
     kind_t my_k = sptr->k;
     sid_t my_i = sptr->i;
 
-    __global heap_t *hp = HEAP_POINTER;
+    __global heap_t *hp = get_heap_ptr();
     int go = 1;
     do {
         o0(go);
@@ -446,7 +452,7 @@ non_slab_malloc(size_t sz)
         uint nactive = active_lane_count();
 
         if (aid == 0) {
-            __global heap_t *hp = HEAP_POINTER;
+            __global heap_t *hp = get_heap_ptr();
             AFA(&hp->num_nonslab_allocations, nactive, memory_order_relaxed);
         }
     }
@@ -898,7 +904,7 @@ slab_malloc(int sz)
 {
     kind_t my_k = size_to_kind(sz);
     __global void *ret = (__global void *)0;
-    __global heap_t *hp = HEAP_POINTER;
+    __global heap_t *hp = get_heap_ptr();
 
     int k_go = 1;
     do {
@@ -949,7 +955,7 @@ __ockl_dm_alloc(ulong sz)
 ulong
 __ockl_dm_nna(void)
 {
-    __global heap_t *hp = HEAP_POINTER;
+    __global heap_t *hp = get_heap_ptr();
     return AL(&hp->num_nonslab_allocations, memory_order_relaxed);
 }
 #endif
diff --git a/ockl/src/hostcall.cl b/ockl/src/hostcall.cl
index 2d7e3c9..4084451 100644
--- a/ockl/src/hostcall.cl
+++ b/ockl/src/hostcall.cl
@@ -5,6 +5,8 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
+#include "oclc.h"
+
 /** \brief Internal implementation of hostcall.
  *
  *  *** INTERNAL USE ONLY ***
@@ -42,12 +44,12 @@ __ockl_hostcall_preview(uint service_id,
                         ulong arg0, ulong arg1, ulong arg2, ulong arg3,
                         ulong arg4, ulong arg5, ulong arg6, ulong arg7)
 {
-    // Retrieve the buffer pointer passed as an implicit kernel
-    // argument. This is at offset 3, which is the same as the OpenCL
-    // printf buffer.
-    __constant size_t *argptr =
-        (__constant size_t *)__builtin_amdgcn_implicitarg_ptr();
-    void *buffer = (void *)argptr[3];
+    void *buffer;
+    if (__oclc_ABI_version < 500) {
+        buffer = (__global void *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3];
+    } else {
+        buffer = (__global void *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[10];
+    }
 
     return __ockl_hostcall_internal(buffer, service_id, arg0, arg1, arg2, arg3,
                                     arg4, arg5, arg6, arg7);
diff --git a/ockl/src/workitem.cl b/ockl/src/workitem.cl
index 5d1d58a..2f92f3f 100644
--- a/ockl/src/workitem.cl
+++ b/ockl/src/workitem.cl
@@ -6,21 +6,344 @@
  *===------------------------------------------------------------------------*/
 
 #include "irif.h"
+#include "oclc.h"
 #include "device_amd_hsa.h"
 
 #define ATTR __attribute__((const))
+#define OLD_ABI __oclc_ABI_version < 500
+
+#define IMPLICITARG(T) ((__constant T *)__builtin_amdgcn_implicitarg_ptr())
+
+ATTR static size_t
+get_global_offset_x(void)
+{
+    if (OLD_ABI) {
+        return IMPLICITARG(ulong)[0];
+    } else {
+        return IMPLICITARG(ulong)[5];
+    }
+}
+
+ATTR static size_t
+get_global_offset_y(void)
+{
+    if (OLD_ABI) {
+        return IMPLICITARG(ulong)[1];
+    } else {
+        return IMPLICITARG(ulong)[6];
+    }
+}
+
+ATTR static size_t
+get_global_offset_z(void)
+{
+    if (OLD_ABI) {
+        return IMPLICITARG(ulong)[2];
+    } else {
+        return IMPLICITARG(ulong)[7];
+    }
+}
+
+ATTR static size_t
+get_global_size_x(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->grid_size_x;
+    } else {
+        return IMPLICITARG(uint)[0]*IMPLICITARG(ushort)[6] + IMPLICITARG(ushort)[9];
+        return 0;
+    }
+}
+
+ATTR static size_t
+get_global_size_y(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->grid_size_y;
+    } else {
+        return IMPLICITARG(uint)[1]*IMPLICITARG(ushort)[7] + IMPLICITARG(ushort)[10];
+    }
+}
+
+ATTR static size_t
+get_global_size_z(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->grid_size_z;
+    } else {
+        return IMPLICITARG(uint)[2]*IMPLICITARG(ushort)[8] + IMPLICITARG(ushort)[11];
+        return 0;
+    }
+}
+
+ATTR static size_t
+get_global_id_x(void)
+{
+    uint l = __builtin_amdgcn_workitem_id_x();
+    uint g = __builtin_amdgcn_workgroup_id_x();
+    uint s;
+    if (OLD_ABI) {
+        s = __builtin_amdgcn_workgroup_size_x();
+    } else {
+        s = IMPLICITARG(ushort)[6];
+    }
+    return (g*s + l) + get_global_offset_x();
+}
+
+ATTR static size_t
+get_global_id_y(void)
+{
+    uint l = __builtin_amdgcn_workitem_id_y();
+    uint g = __builtin_amdgcn_workgroup_id_y();
+    uint s;
+    if (OLD_ABI) {
+        s = __builtin_amdgcn_workgroup_size_y();
+    } else {
+        s = IMPLICITARG(ushort)[7];
+    }
+    return (g*s + l) + get_global_offset_y();
+}
+
+ATTR static size_t
+get_global_id_z(void)
+{
+    uint l = __builtin_amdgcn_workitem_id_z();
+    uint g = __builtin_amdgcn_workgroup_id_z();
+    uint s;
+    if (OLD_ABI) {
+        s = __builtin_amdgcn_workgroup_size_z();
+    } else {
+        s = IMPLICITARG(ushort)[8];
+    }
+    return (g*s + l) + get_global_offset_z();
+}
+
+ATTR static size_t
+get_local_size_x(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint group_id = __builtin_amdgcn_workgroup_id_x();
+        uint group_size = __builtin_amdgcn_workgroup_size_x();
+        uint grid_size = p->grid_size_x;
+        uint r = grid_size - group_id * group_size;
+        return (r < group_size) ? r : group_size;
+    } else {
+        return __builtin_amdgcn_workgroup_id_x() < IMPLICITARG(uint)[0] ? IMPLICITARG(ushort)[6] : IMPLICITARG(ushort)[9];
+    }
+}
+
+ATTR static size_t
+get_local_size_y(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint group_id = __builtin_amdgcn_workgroup_id_y();
+        uint group_size = __builtin_amdgcn_workgroup_size_y();
+        uint grid_size = p->grid_size_y;
+        uint r = grid_size - group_id * group_size;
+        return (r < group_size) ? r : group_size;
+    } else {
+        return __builtin_amdgcn_workgroup_id_y() < IMPLICITARG(uint)[1] ? IMPLICITARG(ushort)[7] : IMPLICITARG(ushort)[10];
+    }
+}
+
+ATTR static size_t
+get_local_size_z(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint group_id = __builtin_amdgcn_workgroup_id_z();
+        uint group_size = __builtin_amdgcn_workgroup_size_z();
+        uint grid_size = p->grid_size_z;
+        uint r = grid_size - group_id * group_size;
+        return (r < group_size) ? r : group_size;
+    } else {
+        return __builtin_amdgcn_workgroup_id_z() < IMPLICITARG(uint)[2] ? IMPLICITARG(ushort)[8] : IMPLICITARG(ushort)[11];
+    }
+}
+
+ATTR static size_t
+get_enqueued_local_size_x(void)
+{
+    if (OLD_ABI) {
+        return __builtin_amdgcn_workgroup_size_x();
+    } else {
+        return IMPLICITARG(ushort)[6];
+    }
+}
+
+ATTR static size_t
+get_enqueued_local_size_y(void)
+{
+    if (OLD_ABI) {
+        return __builtin_amdgcn_workgroup_size_y();
+    } else {
+        return IMPLICITARG(ushort)[7];
+    }
+}
+
+ATTR static size_t
+get_enqueued_local_size_z(void)
+{
+    if (OLD_ABI) {
+        return __builtin_amdgcn_workgroup_size_z();
+    } else {
+        return IMPLICITARG(ushort)[8];
+    }
+}
+
+ATTR static size_t
+get_num_groups_x(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint n = p->grid_size_x;
+        uint d = __builtin_amdgcn_workgroup_size_x();
+        uint q = n / d;
+        return q + (n > q*d);
+    } else {
+        return IMPLICITARG(uint)[0] + (IMPLICITARG(ushort)[9] > 0);
+    }
+}
+
+ATTR static size_t
+get_num_groups_y(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint n = p->grid_size_y;
+        uint d = __builtin_amdgcn_workgroup_size_y();
+        uint q = n / d;
+        return q + (n > q*d);
+    } else {
+        return IMPLICITARG(uint)[1] + (IMPLICITARG(ushort)[10] > 0);
+    }
+}
+
+ATTR static size_t
+get_num_groups_z(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        uint n = p->grid_size_z;
+        uint d = __builtin_amdgcn_workgroup_size_z();
+        uint q = n / d;
+        return q + (n > q*d);
+    } else {
+        return IMPLICITARG(uint)[2] + (IMPLICITARG(ushort)[11] > 0);
+    }
+}
+
+ATTR static uint
+get_work_dim_(void)
+{
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        return p->setup;
+    } else {
+        return IMPLICITARG(ushort)[32];
+    }
+}
+
+ATTR static size_t
+get_global_linear_id_x(void)
+{
+    uint l0 = __builtin_amdgcn_workitem_id_x();
+    uint g0 = __builtin_amdgcn_workgroup_id_x();
+    uint s0;
+    if (OLD_ABI) {
+        s0 = __builtin_amdgcn_workgroup_size_x();
+    } else {
+        s0 = IMPLICITARG(ushort)[6];
+    }
+    return g0*s0 + l0;
+}
+
+ATTR static size_t
+get_global_linear_id_y(void)
+{
+    uint l0 = __builtin_amdgcn_workitem_id_x();
+    uint l1 = __builtin_amdgcn_workitem_id_y();
+    uint g0 = __builtin_amdgcn_workgroup_id_x();
+    uint g1 = __builtin_amdgcn_workgroup_id_y();
+    uint s0, s1;
+    uint n0;
+
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s0 = __builtin_amdgcn_workgroup_size_x();
+        s1 = __builtin_amdgcn_workgroup_size_y();
+        n0 = p->grid_size_x;
+    } else {
+        s0 = IMPLICITARG(ushort)[6];
+        s1 = IMPLICITARG(ushort)[7];
+        n0 = IMPLICITARG(uint)[0]*s0 + IMPLICITARG(ushort)[9];
+    }
+    uint i0 = g0*s0 + l0;
+    uint i1 = g1*s1 + l1;
+    return (size_t)i1 * (size_t)n0 + i0;
+}
+
+ATTR static size_t
+get_global_linear_id_z(void)
+{
+    uint l0 = __builtin_amdgcn_workitem_id_x();
+    uint l1 = __builtin_amdgcn_workitem_id_y();
+    uint l2 = __builtin_amdgcn_workitem_id_z();
+    uint g0 = __builtin_amdgcn_workgroup_id_x();
+    uint g1 = __builtin_amdgcn_workgroup_id_y();
+    uint g2 = __builtin_amdgcn_workgroup_id_z();
+    uint s0, s1, s2;
+    uint n0, n1;
+
+    if (OLD_ABI) {
+        __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
+        s0 = __builtin_amdgcn_workgroup_size_x();
+        s1 = __builtin_amdgcn_workgroup_size_y();
+        s2 = __builtin_amdgcn_workgroup_size_z();
+        n0 = p->grid_size_x;
+        n1 = p->grid_size_y;
+    } else {
+        s0 = IMPLICITARG(ushort)[6];
+        s1 = IMPLICITARG(ushort)[7];
+        s2 = IMPLICITARG(ushort)[8];
+        n0 = IMPLICITARG(uint)[0]*s0 + IMPLICITARG(ushort)[9];
+        n1 = IMPLICITARG(uint)[1]*s1 + IMPLICITARG(ushort)[10];
+    }
+    uint i0 = g0*s0 + l0;
+    uint i1 = g1*s1 + l1;
+    uint i2 = g2*s2 + l2;
+    return ((size_t)i2 * (size_t)n1 + (size_t)i1) * (size_t)n0 + i0;
+}
+
+ATTR static size_t
+get_local_linear_id_(void)
+{
+    if (OLD_ABI) {
+        return (__builtin_amdgcn_workitem_id_z() * __builtin_amdgcn_workgroup_size_y() +
+                __builtin_amdgcn_workitem_id_y()) * __builtin_amdgcn_workgroup_size_x() +
+                __builtin_amdgcn_workitem_id_x();
+    } else {
+        return (__builtin_amdgcn_workitem_id_z() * IMPLICITARG(ushort)[7] +
+                __builtin_amdgcn_workitem_id_y()) * IMPLICITARG(ushort)[6] +
+                __builtin_amdgcn_workitem_id_x();
+    }
+}
 
 ATTR size_t
 __ockl_get_global_offset(uint dim)
 {
-    // TODO find out if implicit arg pointer is aligned properly
     switch(dim) {
     case 0:
-        return *(__constant size_t *)__builtin_amdgcn_implicitarg_ptr();
+        return get_global_offset_x();
     case 1:
-        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[1];
+        return get_global_offset_y();
     case 2:
-        return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[2];
+        return get_global_offset_z();
     default:
         return 0;
     }
@@ -29,32 +352,16 @@ __ockl_get_global_offset(uint dim)
 ATTR size_t
 __ockl_get_global_id(uint dim)
 {
-    uint l, g, s;
-
     switch(dim) {
     case 0:
-        l = __builtin_amdgcn_workitem_id_x();
-        g = __builtin_amdgcn_workgroup_id_x();
-        s = __builtin_amdgcn_workgroup_size_x();
-        break;
+        return get_global_id_x();
     case 1:
-        l = __builtin_amdgcn_workitem_id_y();
-        g = __builtin_amdgcn_workgroup_id_y();
-        s = __builtin_amdgcn_workgroup_size_y();
-        break;
+        return get_global_id_y();
     case 2:
-        l = __builtin_amdgcn_workitem_id_z();
-        g = __builtin_amdgcn_workgroup_id_z();
-        s = __builtin_amdgcn_workgroup_size_z();
-        break;
+        return get_global_id_z();
     default:
-        l = 0;
-        g = 0;
-        s = 1;
-        break;
+        return 0;
     }
-
-    return (g*s + l) + __ockl_get_global_offset(dim);
 }
 
 ATTR size_t
@@ -90,15 +397,13 @@ __ockl_get_group_id(uint dim)
 ATTR size_t
 __ockl_get_global_size(uint dim)
 {
-    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-
     switch(dim) {
     case 0:
-        return p->grid_size_x;
+        return get_global_size_x();
     case 1:
-        return p->grid_size_y;
+        return get_global_size_y();
     case 2:
-        return p->grid_size_z;
+        return get_global_size_z();
     default:
         return 1;
     }
@@ -107,70 +412,37 @@ __ockl_get_global_size(uint dim)
 ATTR size_t
 __ockl_get_local_size(uint dim)
 {
-    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-    uint group_id, grid_size, group_size;
-
     switch(dim) {
     case 0:
-        group_id = __builtin_amdgcn_workgroup_id_x();
-        group_size = __builtin_amdgcn_workgroup_size_x();
-        grid_size = p->grid_size_x;
-        break;
+        return get_local_size_x();
     case 1:
-        group_id = __builtin_amdgcn_workgroup_id_y();
-        group_size = __builtin_amdgcn_workgroup_size_y();
-        grid_size = p->grid_size_y;
-        break;
+        return get_local_size_y();
     case 2:
-        group_id = __builtin_amdgcn_workgroup_id_z();
-        group_size = __builtin_amdgcn_workgroup_size_z();
-        grid_size = p->grid_size_z;
-        break;
+        return get_local_size_z();
     default:
-        group_id = 0;
-        grid_size = 0;
-        group_size = 1;
-        break;
+        return 1;
     }
-    uint r = grid_size - group_id * group_size;
-    return (r < group_size) ? r : group_size;
 }
 
 ATTR size_t
 __ockl_get_num_groups(uint dim)
 {
-    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-
-    uint n, d;
     switch(dim) {
     case 0:
-        n = p->grid_size_x;
-        d = __builtin_amdgcn_workgroup_size_x();
-        break;
+        return get_num_groups_x();
     case 1:
-        n = p->grid_size_y;
-        d = __builtin_amdgcn_workgroup_size_y();
-        break;
+        return get_num_groups_y();
     case 2:
-        n = p->grid_size_z;
-        d = __builtin_amdgcn_workgroup_size_z();
-        break;
+        return get_num_groups_z();
     default:
-        n = 1;
-        d = 1;
-        break;
+        return 1;
     }
-
-    uint q = n / d;
-
-    return q + (n > q*d);
 }
 
 ATTR uint
-__ockl_get_work_dim(void) {
-    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-    // XXX revist this if setup field ever changes
-    return p->setup;
+__ockl_get_work_dim(void)
+{
+    return get_work_dim_();
 }
 
 ATTR size_t
@@ -178,11 +450,11 @@ __ockl_get_enqueued_local_size(uint dim)
 {
     switch(dim) {
     case 0:
-        return __builtin_amdgcn_workgroup_size_x();
+        return get_enqueued_local_size_x();
     case 1:
-        return __builtin_amdgcn_workgroup_size_y();
+        return get_enqueued_local_size_y();
     case 2:
-        return __builtin_amdgcn_workgroup_size_z();
+        return get_enqueued_local_size_z();
     default:
         return 1;
     }
@@ -191,48 +463,13 @@ __ockl_get_enqueued_local_size(uint dim)
 ATTR size_t
 __ockl_get_global_linear_id(void)
 {
-    __constant hsa_kernel_dispatch_packet_t *p = __builtin_amdgcn_dispatch_ptr();
-
-    // XXX revisit this if setup field ever changes
-    switch (p->setup) {
+    switch (get_work_dim_()) {
     case 1:
-        {
-            uint l0 = __builtin_amdgcn_workitem_id_x();
-            uint g0 = __builtin_amdgcn_workgroup_id_x();
-            uint s0 = __builtin_amdgcn_workgroup_size_x();
-            return g0*s0 + l0;
-        }
+        return get_global_linear_id_x();
     case 2:
-        {
-            uint l0 = __builtin_amdgcn_workitem_id_x();
-            uint l1 = __builtin_amdgcn_workitem_id_y();
-            uint g0 = __builtin_amdgcn_workgroup_id_x();
-            uint g1 = __builtin_amdgcn_workgroup_id_y();
-            uint s0 = __builtin_amdgcn_workgroup_size_x();
-            uint s1 = __builtin_amdgcn_workgroup_size_y();
-            uint n0 = p->grid_size_x;
-            uint i0 = g0*s0 + l0;
-            uint i1 = g1*s1 + l1;
-            return (size_t)i1 * (size_t)n0 + i0;
-        }
+        return get_global_linear_id_y();
     case 3:
-        {
-            uint l0 = __builtin_amdgcn_workitem_id_x();
-            uint l1 = __builtin_amdgcn_workitem_id_y();
-            uint l2 = __builtin_amdgcn_workitem_id_z();
-            uint g0 = __builtin_amdgcn_workgroup_id_x();
-            uint g1 = __builtin_amdgcn_workgroup_id_y();
-            uint g2 = __builtin_amdgcn_workgroup_id_z();
-            uint s0 = __builtin_amdgcn_workgroup_size_x();
-            uint s1 = __builtin_amdgcn_workgroup_size_y();
-            uint s2 = __builtin_amdgcn_workgroup_size_z();
-            uint n0 = p->grid_size_x;
-            uint n1 = p->grid_size_y;
-            uint i0 = g0*s0 + l0;
-            uint i1 = g1*s1 + l1;
-            uint i2 = g2*s2 + l2;
-            return ((size_t)i2 * (size_t)n1 + (size_t)i1) * (size_t)n0 + i0;
-        }
+        return get_global_linear_id_z();
     default:
         return 0;
     }
@@ -241,8 +478,6 @@ __ockl_get_global_linear_id(void)
 ATTR size_t
 __ockl_get_local_linear_id(void)
 {
-    return (__builtin_amdgcn_workitem_id_z() * __builtin_amdgcn_workgroup_size_y() +
-            __builtin_amdgcn_workitem_id_y()) * __builtin_amdgcn_workgroup_size_x() +
-           __builtin_amdgcn_workitem_id_x();
+    return get_local_linear_id_();
 }
 
diff --git a/opencl/src/devenq/devenq.h b/opencl/src/devenq/devenq.h
index 7dbe0e8..4299a77 100644
--- a/opencl/src/devenq/devenq.h
+++ b/opencl/src/devenq/devenq.h
@@ -1,4 +1,5 @@
 
+#include "oclc.h"
 #include "device_amd_hsa.h"
 
 #pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
@@ -85,26 +86,49 @@ typedef struct _AmdEvent {
 // XXX this needs to match workgroup/wg.h MAX_WAVES_PER_SIMD
 #define CL_DEVICE_MAX_WORK_GROUP_SIZE 256
 
-// ABI has 6 implicit trailing arguments:
-//  global_offset[3], printf_buf, default vqueue pointer, and self AqlWrap pointer
-#define NUM_IMPLICIT_ARGS 6
+// ABI has implicit trailing arguments
+#define NUM_IMPLICIT_ARGS (__oclc_ABI_version < 500 ? 7 : 32)
 
 static inline __global void *
 get_printf_ptr(void)
 {
-    return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]);
+    if (__oclc_ABI_version < 500) {
+        return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]);
+    } else {
+        return (__global void *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[9]);
+    }
 }
 
 static inline __global AmdVQueueHeader *
 get_vqueue(void)
 {
-    return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[4]);
+    if (__oclc_ABI_version < 500) {
+        return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[4]);
+    } else {
+        return (__global AmdVQueueHeader *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[13]);
+    }
 }
 
 static inline __global AmdAqlWrap *
 get_aql_wrap(void)
 {
-    return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[5]);
+    if (__oclc_ABI_version < 500) {
+        return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[5]);
+    } else {
+        return (__global AmdAqlWrap *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[14]);
+    }
+}
+
+static inline size_t
+get_bases(void)
+{
+    return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[24];
+}
+
+static inline size_t
+get_hsa_queue(void)
+{
+    return ((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[25];
 }
 
 // reserve a slot in a bitmask controlled resource
diff --git a/opencl/src/devenq/enqueue.cl b/opencl/src/devenq/enqueue.cl
index 8527189..a5098fb 100644
--- a/opencl/src/devenq/enqueue.cl
+++ b/opencl/src/devenq/enqueue.cl
@@ -190,12 +190,33 @@ __enqueue_kernel_basic(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r, v
     // Set up kernarg
     copy_captured_context(aw->aql.kernarg_address, capture, csize, calign);
     __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t)));
-    implicit[0] = r.globalWorkOffset[0];
-    implicit[1] = r.globalWorkOffset[1];
-    implicit[2] = r.globalWorkOffset[2];
-    implicit[3] = (size_t)get_printf_ptr();
-    implicit[4] = (size_t)get_vqueue();
-    implicit[5] = (size_t)aw;
+    if (__oclc_ABI_version < 500) {
+        implicit[0] = r.globalWorkOffset[0];
+        implicit[1] = r.globalWorkOffset[1];
+        implicit[2] = r.globalWorkOffset[2];
+        implicit[3] = (size_t)get_printf_ptr();
+        implicit[4] = (size_t)get_vqueue();
+        implicit[5] = (size_t)aw;
+    } else {
+        implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) |
+                      ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32);
+        implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) |
+                      ((size_t)(ushort)r.localWorkSize[0] << 32) |
+                      ((size_t)(ushort)r.localWorkSize[1] << 48);
+        implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) |
+                      ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) |
+                      ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) |
+                      ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48);
+        implicit[5] = r.globalWorkOffset[0];
+        implicit[6] = r.globalWorkOffset[1];
+        implicit[7] = r.globalWorkOffset[2];
+        implicit[8] = (size_t)(ushort)r.workDimension;
+        implicit[9] = (size_t)get_printf_ptr();
+        implicit[13] = (size_t)get_vqueue();
+        implicit[14] = (size_t)aw;
+        implicit[24] = get_bases();
+        implicit[25] = get_hsa_queue();
+    }
 
     const __global struct rtinfo *rti = (const __global struct rtinfo *)block;
 
@@ -264,12 +285,33 @@ __enqueue_kernel_basic_events(queue_t q, kernel_enqueue_flags_t f, const ndrange
     // Set up kernarg
     copy_captured_context(aw->aql.kernarg_address, capture, csize, calign);
     __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address + align_up(csize, sizeof(size_t)));
-    implicit[0] = r.globalWorkOffset[0];
-    implicit[1] = r.globalWorkOffset[1];
-    implicit[2] = r.globalWorkOffset[2];
-    implicit[3] = (size_t)get_printf_ptr();
-    implicit[4] = (size_t)get_vqueue();
-    implicit[5] = (size_t)aw;
+    if (__oclc_ABI_version < 500) {
+        implicit[0] = r.globalWorkOffset[0];
+        implicit[1] = r.globalWorkOffset[1];
+        implicit[2] = r.globalWorkOffset[2];
+        implicit[3] = (size_t)get_printf_ptr();
+        implicit[4] = (size_t)get_vqueue();
+        implicit[5] = (size_t)aw;
+    } else {
+        implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) |
+                      ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32);
+        implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) |
+                      ((size_t)(ushort)r.localWorkSize[0] << 32) |
+                      ((size_t)(ushort)r.localWorkSize[1] << 48);
+        implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) |
+                      ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) |
+                      ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) |
+                      ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48);
+        implicit[5] = r.globalWorkOffset[0];
+        implicit[6] = r.globalWorkOffset[1];
+        implicit[7] = r.globalWorkOffset[2];
+        implicit[8] = (size_t)(ushort)r.workDimension;
+        implicit[9] = (size_t)get_printf_ptr();
+        implicit[13] = (size_t)get_vqueue();
+        implicit[14] = (size_t)aw;
+        implicit[24] = get_bases();
+        implicit[25] = get_hsa_queue();
+    }
 
     const __global struct rtinfo *rti = (const __global struct rtinfo *)block;
 
@@ -336,12 +378,33 @@ __enqueue_kernel_varargs(queue_t q, kernel_enqueue_flags_t f, const ndrange_t r,
 
     __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address +
             align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t)));
-    implicit[0] = r.globalWorkOffset[0];
-    implicit[1] = r.globalWorkOffset[1];
-    implicit[2] = r.globalWorkOffset[2];
-    implicit[3] = (size_t)get_printf_ptr();
-    implicit[4] = (size_t)get_vqueue();
-    implicit[5] = (size_t)aw;
+    if (__oclc_ABI_version < 500) {
+        implicit[0] = r.globalWorkOffset[0];
+        implicit[1] = r.globalWorkOffset[1];
+        implicit[2] = r.globalWorkOffset[2];
+        implicit[3] = (size_t)get_printf_ptr();
+        implicit[4] = (size_t)get_vqueue();
+        implicit[5] = (size_t)aw;
+    } else {
+        implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) |
+                      ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32);
+        implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) |
+                      ((size_t)(ushort)r.localWorkSize[0] << 32) |
+                      ((size_t)(ushort)r.localWorkSize[1] << 48);
+        implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) |
+                      ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) |
+                      ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) |
+                      ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48);
+        implicit[5] = r.globalWorkOffset[0];
+        implicit[6] = r.globalWorkOffset[1];
+        implicit[7] = r.globalWorkOffset[2];
+        implicit[8] = (size_t)(ushort)r.workDimension;
+        implicit[9] = (size_t)get_printf_ptr();
+        implicit[13] = (size_t)get_vqueue();
+        implicit[14] = (size_t)aw;
+        implicit[24] = get_bases();
+        implicit[25] = get_hsa_queue();
+    }
 
     __global AmdAqlWrap *me = get_aql_wrap();
 
@@ -424,12 +487,31 @@ __enqueue_kernel_events_varargs(queue_t q, kernel_enqueue_flags_t f, const ndran
 
     __global size_t *implicit = (__global size_t *)((__global char *)aw->aql.kernarg_address +
             align_up(align_up(csize, sizeof(uint)) + nl*sizeof(uint), sizeof(size_t)));
-    implicit[0] = r.globalWorkOffset[0];
-    implicit[1] = r.globalWorkOffset[1];
-    implicit[2] = r.globalWorkOffset[2];
-    implicit[3] = (size_t)get_printf_ptr();
-    implicit[4] = (size_t)get_vqueue();
-    implicit[5] = (size_t)aw;
+    if (__oclc_ABI_version < 500) {
+        implicit[0] = r.globalWorkOffset[0];
+        implicit[1] = r.globalWorkOffset[1];
+        implicit[2] = r.globalWorkOffset[2];
+        implicit[3] = (size_t)get_printf_ptr();
+        implicit[4] = (size_t)get_vqueue();
+        implicit[5] = (size_t)aw;
+    } else {
+        implicit[0] = ((size_t)((uint)r.globalWorkSize[0] / (ushort)r.localWorkSize[0])) |
+                      ((size_t)((uint)r.globalWorkSize[1] / (ushort)r.localWorkSize[1]) << 32);
+        implicit[1] = ((size_t)((uint)r.globalWorkSize[2] / (ushort)r.localWorkSize[2])) |
+                      ((size_t)(ushort)r.localWorkSize[0] << 32) |
+                      ((size_t)(ushort)r.localWorkSize[1] << 48);
+        implicit[2] = ((size_t)(ushort)r.localWorkSize[2]) |
+                      ((size_t)((uint)r.globalWorkSize[0] % (ushort)r.localWorkSize[0]) << 16) |
+                      ((size_t)((uint)r.globalWorkSize[1] % (ushort)r.localWorkSize[1]) << 32) |
+                      ((size_t)((uint)r.globalWorkSize[2] % (ushort)r.localWorkSize[2]) << 48);
+        implicit[5] = r.globalWorkOffset[0];
+        implicit[6] = r.globalWorkOffset[1];
+        implicit[7] = r.globalWorkOffset[2];
+        implicit[8] = (size_t)(ushort)r.workDimension;
+        implicit[9] = (size_t)get_printf_ptr();
+        implicit[13] = (size_t)get_vqueue();
+        implicit[14] = (size_t)aw;
+    }
 
     __global AmdAqlWrap *me = get_aql_wrap();
 
diff --git a/opencl/src/misc/printf.cl b/opencl/src/misc/printf.cl
index a9aade5..0686906 100644
--- a/opencl/src/misc/printf.cl
+++ b/opencl/src/misc/printf.cl
@@ -5,6 +5,8 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
+#include "oclc.h"
+
 #ifndef NULL
 #define NULL 0
 #endif
@@ -15,7 +17,12 @@
 __global char *
 __printf_alloc(uint bytes)
 {
-    __global char *ptr = (__global char *)(((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3]);
+    __global char *ptr;
+    if (__oclc_ABI_version < 500) {
+        ptr = (__global char *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[3];
+    } else {
+        ptr = (__global char *)((__constant size_t *)__builtin_amdgcn_implicitarg_ptr())[9];
+    }
 
     uint size = ((__global uint *)ptr)[1];
     uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);