diff --git a/asanrtl/src/stubs.cl b/asanrtl/src/stubs.cl
index 7140557..364a050 100644
--- a/asanrtl/src/stubs.cl
+++ b/asanrtl/src/stubs.cl
@@ -7,23 +7,23 @@
 
 #include "asan_util.h"
 
-void __asan_handle_no_return(void) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_handle_no_return(void) {} // empty stub; now carries the same qualifiers as __asan_init
 
-void __sanitizer_ptr_cmp(uptr a, uptr b) {}
+OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_cmp(uptr a, uptr b) {} // empty stub
 
-void __sanitizer_ptr_sub(uptr a, uptr b) {}
+OPT_NONE NO_SANITIZE_ADDR void __sanitizer_ptr_sub(uptr a, uptr b) {} // empty stub
 
-void __asan_before_dynamic_init(uptr addr) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_before_dynamic_init(uptr addr) {} // empty stub
 
-void __asan_after_dynamic_init(void) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_after_dynamic_init(void) {} // empty stub
 
-void __asan_register_image_globals(uptr flag) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_register_image_globals(uptr flag) {} // empty stub
 
-void __asan_unregister_image_globals(uptr flag) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_image_globals(uptr flag) {} // empty stub
 
-void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_register_elf_globals(uptr flag, uptr start, uptr stop) {} // empty stub
 
-void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {}
+OPT_NONE NO_SANITIZE_ADDR void __asan_unregister_elf_globals(uptr flag, uptr start, uptr stop) {} // empty stub
 
 OPT_NONE NO_SANITIZE_ADDR void __asan_init(void) {}
 
diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h
index d0b98d4..0477d7d 100644
--- a/ockl/inc/ockl.h
+++ b/ockl/inc/ockl.h
@@ -456,6 +456,8 @@ extern __attribute__((const)) __private void * OCKL_MANGLE_T(to,private)(void *)
 extern void OCKL_MANGLE_T(rtcwait,u32)(uint);
 extern void __ockl_sanitizer_report(ulong, ulong, ulong, ulong, ulong, ulong, ulong, ulong);
 
+extern uint OCKL_MANGLE_U32(alisa)(uint); // per-lane sum of the argument over this lane and the active lanes above it (ockl/src/alrs.cl)
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 
 #endif // OCKL_H
diff --git a/ockl/src/alrs.cl b/ockl/src/alrs.cl
new file mode 100644
index 0000000..dc5d11f
--- /dev/null
+++ b/ockl/src/alrs.cl
@@ -0,0 +1,178 @@
+
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "oclc.h"
+#include "ockl.h"
+
+static uint
+bpermute_u32(uint l, uint v)
+{
+    return __builtin_amdgcn_ds_bpermute(l << 2, v); // fetch v from lane l; the builtin's first operand is the lane index scaled by 4
+}
+
+uint
+OCKL_MANGLE_U32(alisa)(uint n)
+{
+    uint l = __ockl_lane_u32(); // this lane's index in the wavefront
+    uint ret = n; // running sum, seeded with this lane's own value; ends as n summed over this lane and all active lanes above it
+
+    if (__oclc_wavefrontsize64) { // 64-lane wavefront: 64-bit exec mask, six doubling steps
+        const ulong nolsb = ~1UL; // every bit except bit 0
+
+        // Step 1: smask = active lanes strictly above l; add n from the lowest one
+        ulong smask = __builtin_amdgcn_read_exec() & ~((0x2UL << l) - 0x1UL);
+        int slid = (int)__ockl_ctz_u64(smask);
+        uint t = bpermute_u32(slid, n);
+        ret += slid < 64 ? t : 0; // slid == 64: no lane left in smask, add nothing
+        smask ^= (0x1UL << slid) & nolsb; // retire that lane; NOTE(review): nolsb presumably covers slid == 64, where the shift count wraps to bit 0 - confirm
+
+        // Step 2: add the running sum of the next remaining lane (total now spans up to 4 active lanes)
+        slid = (int)__ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+
+        smask ^= (0x1UL << slid) & nolsb; // retire the 2 lanes that sum already covered
+        slid = (int)__ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 3: same again; total spans up to 8 active lanes
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+
+        smask ^= (0x1UL << slid) & nolsb; // retire the 4 lanes that sum already covered
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 4: total spans up to 16 active lanes
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+
+        smask ^= (0x1UL << slid) & nolsb; // retire the 8 lanes that sum already covered
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 5: total spans up to 32 active lanes
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+
+        smask ^= (0x1UL << slid) & nolsb; // retire the 16 lanes that sum already covered
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+        slid = __ockl_ctz_u64(smask);
+        smask ^= (0x1UL << slid) & nolsb;
+
+        // Step 6: total spans up to 64 active lanes, i.e. the whole wavefront
+        slid = __ockl_ctz_u64(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 64 ? t : 0;
+    } else { // 32-lane wavefront: same scheme on the low 32 exec bits, five doubling steps
+        const uint nolsb = ~1U; // every bit except bit 0
+
+        // Step 1: smask = active lanes strictly above l; add n from the lowest one
+        uint smask = __builtin_amdgcn_read_exec_lo() & ~((0x2U << l) - 0x1U);
+        int slid = (int)__ockl_ctz_u32(smask);
+        uint t = bpermute_u32(slid, n);
+        ret += slid < 32 ? t : 0; // slid == 32: no lane left in smask, add nothing
+        smask ^= (0x1U << slid) & nolsb; // retire that lane; NOTE(review): nolsb presumably covers slid == 32, where the shift count wraps to bit 0 - confirm
+
+        // Step 2: add the running sum of the next remaining lane (total now spans up to 4 active lanes)
+        slid = (int)__ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+
+        smask ^= (0x1U << slid) & nolsb; // retire the 2 lanes that sum already covered
+        slid = (int)__ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 3: same again; total spans up to 8 active lanes
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+
+        smask ^= (0x1U << slid) & nolsb; // retire the 4 lanes that sum already covered
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 4: total spans up to 16 active lanes
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+
+        smask ^= (0x1U << slid) & nolsb; // retire the 8 lanes that sum already covered
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+        slid = __ockl_ctz_u32(smask);
+        smask ^= (0x1U << slid) & nolsb;
+
+        // Step 5: total spans up to 32 active lanes, i.e. the whole wavefront
+        slid = __ockl_ctz_u32(smask);
+        t = bpermute_u32(slid, ret);
+        ret += slid < 32 ? t : 0;
+    }
+
+    return ret;
+}
diff --git a/opencl/src/misc/printf.cl b/ockl/src/cprintf.cl
similarity index 78%
rename from opencl/src/misc/printf.cl
rename to ockl/src/cprintf.cl
index 0686906..6975bae 100644
--- a/opencl/src/misc/printf.cl
+++ b/ockl/src/cprintf.cl
@@ -7,6 +7,9 @@
 
 #include "oclc.h"
 
+#define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device) /* atomic load from P with order O at device scope */
+#define ACE(P, E, V, O) __opencl_atomic_compare_exchange_strong(P, E, V, O, O, memory_scope_device) /* strong compare-exchange at device scope; O serves as both the success and the failure order */
+
 #ifndef NULL
 #define NULL 0
 #endif
@@ -25,13 +28,13 @@ __printf_alloc(uint bytes)
     }
 
     uint size = ((__global uint *)ptr)[1];
-    uint offset = atomic_load_explicit((__global atomic_uint *)ptr, memory_order_relaxed, memory_scope_device);
+    uint offset = AL((__global atomic_uint *)ptr, memory_order_relaxed);
 
     for (;;) {
         if (OFFSET + offset + bytes > size)
             return NULL;
 
-        if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+        if (ACE((__global atomic_uint *)ptr, &offset, offset+bytes, memory_order_relaxed))
             break;
     }
 
diff --git a/ockl/src/dm.cl b/ockl/src/dm.cl
index 245b4a1..a947b69 100644
--- a/ockl/src/dm.cl
+++ b/ockl/src/dm.cl
@@ -177,9 +177,9 @@ typedef struct heap_s {
 
 // Inhibit control flow optimizations
 #define O0(X) X = o0(X)
-__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
-__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
-__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("; O0 %0" : "=v"(y) : "0"(x)); return y; }
+__attribute__((overloadable)) static int o0(int x) { int y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } // empty asm; the "=v"/"0" pair ties y to x so the result is an opaque copy
+__attribute__((overloadable)) static uint o0(uint x) { uint y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } // same barrier, uint overload
+__attribute__((overloadable)) static ulong o0(ulong x) { ulong y; __asm__ volatile("" : "=v"(y) : "0"(x)); return y; } // same barrier, ulong overload
 
 // Atomics wrappers
 #define AL(P, O) __opencl_atomic_load(P, O, memory_scope_device)
diff --git a/ockl/src/gaaf.cl b/ockl/src/gaaf.cl
index a11dfe0..89787de 100644
--- a/ockl/src/gaaf.cl
+++ b/ockl/src/gaaf.cl
@@ -15,7 +15,7 @@
 #define AC(P, E, V, O, R, S) __opencl_atomic_compare_exchange_strong(P, E, V, O, R, S)
 #define AL(P, O, S) __opencl_atomic_load(P, O, S)
 
-extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1f32.f32");
+extern float __llvm_amdgcn_global_atomic_fadd_f32_p1f32_f32(__global float *, float) __asm("llvm.amdgcn.global.atomic.fadd.f32.p1.f32"); // asm label supplies the symbol name this declaration binds to
 
 static void
 global_atomic_fadd(__global float *p, float v)
diff --git a/ockl/src/hostcall_impl.cl b/ockl/src/hostcall_impl.cl
index ecf7b08..20af208 100644
--- a/ockl/src/hostcall_impl.cl
+++ b/ockl/src/hostcall_impl.cl
@@ -100,9 +100,7 @@ static uint
 optimizationBarrierHack(uint in_val)
 {
     uint out_val;
-    __asm__ volatile("; ockl readfirstlane hoisting hack %0"
-                     : "=v"(out_val)
-                     : "0"(in_val));
+    __asm__ volatile("" : "=v"(out_val) : "0"(in_val)); // empty asm, formerly tagged "ockl readfirstlane hoisting hack": out_val is an opaque copy of in_val
     return out_val;
 }
 
diff --git a/ockl/src/wfaas.cl b/ockl/src/wfaas.cl
index ac36d3b..9bac8dd 100644
--- a/ockl/src/wfaas.cl
+++ b/ockl/src/wfaas.cl
@@ -21,8 +21,7 @@ ATTR
 static int optimizationBarrierHack(int in_val)
 {
     int out_val;
-    __asm__ volatile ("; ockl ballot hoisting hack %0" :
-                      "=v"(out_val) : "0"(in_val));
+    __asm__ volatile ("" : "=v"(out_val) : "0"(in_val)); // empty asm, formerly tagged "ockl ballot hoisting hack": out_val is an opaque copy of in_val
     return out_val;
 }
 
diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl
index e1a9b2c..77e9d30 100644
--- a/opencl/src/pipes/wresvnp.cl
+++ b/opencl/src/pipes/wresvnp.cl
@@ -5,14 +5,25 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
+#include "oclc.h"
+#include "ockl.h"
 #include "pipes.h"
 
+static uint
+active_lane_count(void)
+{
+    if (__oclc_wavefrontsize64) { // 64-lane wavefront: count the set bits of the full 64-bit exec mask
+        return __builtin_popcountl(__builtin_amdgcn_read_exec());
+    } else { // otherwise only the low 32 bits of exec are counted
+        return __builtin_popcount(__builtin_amdgcn_read_exec_lo());
+    }
+}
+
 size_t
 __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
 {
-    uint alc = (size_t)(__builtin_popcount(__builtin_amdgcn_read_exec_lo()) +
-                        __builtin_popcount(__builtin_amdgcn_read_exec_hi()));
-    uint l = __builtin_amdgcn_mbcnt_hi(-1, __builtin_amdgcn_mbcnt_lo(-1, 0u));
+    uint alc = active_lane_count();
+    uint l = __ockl_lane_u32();
     size_t rid;
 
     if (__builtin_amdgcn_read_exec() == (1UL << alc) - 1UL) {
@@ -26,113 +37,16 @@ __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
         rid = idx + (size_t)(sum - (uint)n);
         rid = idx != ~(size_t)0 ? rid : idx;
     } else {
-        // Inclusive add scan with not all lanes active
-        const ulong nomsb = 0x7fffffffffffffffUL;
-
-        // Step 1
-        ulong smask = __builtin_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
-        int slid = 63 - (int)clz(smask);
-        uint t = __builtin_amdgcn_ds_bpermute(slid << 2, n);
-        uint sum = n + (slid < 0 ? 0 : t);
-        smask ^= (0x1UL << slid) & nomsb;
-
-        // Step 2
-        slid = 63 - (int)clz(smask);
-        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
-        sum += slid < 0 ? 0 : t;
-
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-
-        // Step 3
-        slid = 63 - (int)clz(smask);
-        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
-        sum += slid < 0 ? 0 : t;
-
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-
-        // Step 4
-        slid = 63 - (int)clz(smask);
-        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
-        sum += slid < 0 ? 0 : t;
-
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-
-        // Step 5
-        slid = 63 - (int)clz(smask);
-        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
-        sum += slid < 0 ? 0 : t;
-
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-        slid = 63 - (int)clz(smask);
-        smask ^= (0x1UL << slid) & nomsb;
-
-        // Step 6
-        slid = 63 - (int)clz(smask);
-        t = __builtin_amdgcn_ds_bpermute(slid << 2, sum);
-        sum += slid < 0 ? 0 : t;
-        __builtin_amdgcn_wave_barrier();
+        uint sum = __ockl_alisa_u32((uint)n);
+        uint al = __ockl_activelane_u32();
 
         size_t idx = 0;
-        if (l == 63 - (int)clz(__builtin_amdgcn_read_exec())) {
+        if (al == 0) {
             idx = reserve(pidx, lim, (size_t)sum);
         }
         __builtin_amdgcn_wave_barrier();
-
-        // Broadcast
-        uint k = 63u - (uint)clz(__builtin_amdgcn_read_exec());
-        idx = ((size_t)__builtin_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
-              (size_t)__builtin_amdgcn_readlane((uint)idx, k);
-        __builtin_amdgcn_wave_barrier();
+        idx = ((size_t)__builtin_amdgcn_readfirstlane((uint)(idx >> 32)) << 32) |
+              (size_t)__builtin_amdgcn_readfirstlane((uint)idx);
 
         rid = idx + (size_t)(sum - (uint)n);
         rid = idx != ~(size_t)0 ? rid : idx;