diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5b85f7d..1008f1a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,9 @@ if (GENERIC_IS_ZERO)
   set(AMDGPU_TARGET_TRIPLE "amdgcn--amdhsa-amdgizcl")
   # HCC will execute utils/change-addr-space.sh
   # and apply utils/add_amdgiz.sed on all .ll files in subdirectory hc/, irif/, opencl/
+  if (CUDA_TRIPLE)  # when set, the CUDA triple replaces the amdgiz triple assigned above
+    set(AMDGPU_TARGET_TRIPLE "amdgcn--cuda")
+  endif (CUDA_TRIPLE)
 
 endif (GENERIC_IS_ZERO)
 
@@ -52,6 +55,9 @@ add_subdirectory(oclc)
 add_subdirectory(ocml)
 add_subdirectory(ockl)
 add_subdirectory(opencl)
+if (CUDA_TRIPLE)  # the __nv_* wrapper library is only configured for CUDA-triple builds
+  add_subdirectory(cuda2gcn)
+endif (CUDA_TRIPLE)
 
 if(BUILD_HC_LIB)
   add_subdirectory(hc)
diff --git a/cuda2gcn/CMakeLists.txt b/cuda2gcn/CMakeLists.txt
new file mode 100644
index 00000000..c2ed32fe
--- /dev/null
+++ b/cuda2gcn/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===--------------------------------------------------------------------------
+##                   ROCm Device Libraries
+##
+## This file is distributed under the University of Illinois Open Source
+## License. See LICENSE.TXT for details.
+##===--------------------------------------------------------------------------
+
+# Collect every OpenCL C source of the wrapper library.
+file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl)
+file(GLOB sources ${cl_sources})
+
+# Headers of the sibling libraries whose headers the wrappers include.
+include_directories(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc
+  ${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc
+  ${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc)
+opencl_bc_lib(cuda2gcn ${sources})
diff --git a/cuda2gcn/src/bitsbytes.cl b/cuda2gcn/src/bitsbytes.cl
new file mode 100644
index 00000000..2df61c5a
--- /dev/null
+++ b/cuda2gcn/src/bitsbytes.cl
@@ -0,0 +1,46 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ockl.h"
+#include "irif.h"
+
+#define ATTR __attribute__((always_inline, const))
+// Bit reversal: thin forwards to the __llvm_bitreverse_* helpers (presumably declared in irif.h — confirm).
+//-------- T __nv_brev
+ATTR int __nv_brev(int x) { return __llvm_bitreverse_i32(x); }
+
+//-------- T __nv_brevll
+ATTR long __nv_brevll(long x) { return __llvm_bitreverse_i64(x); }
+// Leading-zero counts operate on the unsigned bit pattern of the argument.
+//-------- T __nv_clz
+ATTR int __nv_clz(int x)
+{
+    return (int)__ockl_clz_u32((uint)x);
+}
+
+//-------- T __nv_clzll
+ATTR int __nv_clzll(long x)
+{
+    // The low half matters only when the high half is zero; then its count is offset by 32.
+    const uint upper = (uint)(x >> 32);
+    if (upper != 0u)
+        return (int)__ockl_clz_u32(upper);
+    return (int)(__ockl_clz_u32((uint)x) + 32u);
+}
+// x & -x keeps only the lowest set bit, so 32/64 minus its leading-zero count is its 1-based position.
+//-------- T __nv_ffs
+ATTR int __nv_ffs(int x) { const int lowest = x & -x; return 32 - __nv_clz(lowest); }
+
+//-------- T __nv_ffsll
+ATTR int __nv_ffsll(long x) { const long lowest = x & -x; return 64 - __nv_clzll(lowest); }
+// Population counts forward to the __llvm_ctpop_* helpers; the 64-bit result is narrowed to int.
+//-------- T __nv_popc
+ATTR int __nv_popc(int x) { return __llvm_ctpop_i32(x); }
+
+//-------- T __nv_popcll
+ATTR int __nv_popcll(long x) { return (int)__llvm_ctpop_i64(x); }
+
diff --git a/cuda2gcn/src/convert.cl b/cuda2gcn/src/convert.cl
new file mode 100644
index 00000000..43113915
--- /dev/null
+++ b/cuda2gcn/src/convert.cl
@@ -0,0 +1,150 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+// CONVERTM defines __nv_<A>2<B>_<m>(A) as a call to the OpenCL convert_<B>_<n> builtin.
+#define CONVERTM(A,B,m,n) ATTR B __nv_##A##2##B##_##m(A x) \
+    { return convert_##B##_##n(x); }
+// CONVERT emits all four variants, pairing CUDA suffixes with OpenCL ones: rd/rtn, rn/rte, ru/rtp, rz/rtz.
+#define CONVERT(A,B) \
+    CONVERTM(A, B, rd, rtn) \
+    CONVERTM(A, B, rn, rte) \
+    CONVERTM(A, B, ru, rtp) \
+    CONVERTM(A, B, rz, rtz)
+// NOTE(review): the non-_sat convert_* forms are used; out-of-range behaviour vs. the CUDA intrinsics is unverified — confirm.
+//-------- T __nv_double2float_rd
+//-------- T __nv_double2float_rn
+//-------- T __nv_double2float_ru
+//-------- T __nv_double2float_rz
+CONVERT(double, float)
+
+//-------- T __nv_double2int_rd
+//-------- T __nv_double2int_rn
+//-------- T __nv_double2int_ru
+//-------- T __nv_double2int_rz
+CONVERT(double, int)
+
+//-------- T __nv_float2int_rd
+//-------- T __nv_float2int_rn
+//-------- T __nv_float2int_ru
+//-------- T __nv_float2int_rz
+CONVERT(float, int)
+
+//-------- T __nv_int2float_rd
+//-------- T __nv_int2float_rn
+//-------- T __nv_int2float_ru
+//-------- T __nv_int2float_rz
+CONVERT(int, float)
+
+//-------- T __nv_double2uint_rd
+//-------- T __nv_double2uint_rn
+//-------- T __nv_double2uint_ru
+//-------- T __nv_double2uint_rz
+CONVERT(double, uint)
+
+//-------- T __nv_float2uint_rd
+//-------- T __nv_float2uint_rn
+//-------- T __nv_float2uint_ru
+//-------- T __nv_float2uint_rz
+CONVERT(float, uint)
+
+//-------- T __nv_uint2double_rd
+//-------- T __nv_uint2double_rn
+//-------- T __nv_uint2double_ru
+//-------- T __nv_uint2double_rz
+CONVERT(uint, double)
+
+//-------- T __nv_uint2float_rd
+//-------- T __nv_uint2float_rn
+//-------- T __nv_uint2float_ru
+//-------- T __nv_uint2float_rz
+CONVERT(uint, float)
+// Conversions to long: CUDA spells the destination "ll", so the name cannot be pasted from the type token.
+#define CONVERT2LLM(A,m,n) ATTR long __nv_##A##2ll_##m(A x) \
+    { return convert_long_##n(x); }
+
+#define CONVERT2LL(A) \
+    CONVERT2LLM(A, rd, rtn) \
+    CONVERT2LLM(A, rn, rte) \
+    CONVERT2LLM(A, ru, rtp) \
+    CONVERT2LLM(A, rz, rtz)
+
+//-------- T __nv_double2ll_rd
+//-------- T __nv_double2ll_rn
+//-------- T __nv_double2ll_ru
+//-------- T __nv_double2ll_rz
+CONVERT2LL(double)
+
+//-------- T __nv_float2ll_rd
+//-------- T __nv_float2ll_rn
+//-------- T __nv_float2ll_ru
+//-------- T __nv_float2ll_rz
+CONVERT2LL(float)
+// Conversions to ulong: CUDA spells the destination "ull".
+#define CONVERT2ULLM(A,m,n) ATTR ulong __nv_##A##2ull_##m(A x) \
+    { return convert_ulong_##n(x); }
+
+#define CONVERT2ULL(A) \
+    CONVERT2ULLM(A, rd, rtn) \
+    CONVERT2ULLM(A, rn, rte) \
+    CONVERT2ULLM(A, ru, rtp) \
+    CONVERT2ULLM(A, rz, rtz)
+
+//-------- T __nv_double2ull_rd
+//-------- T __nv_double2ull_rn
+//-------- T __nv_double2ull_ru
+//-------- T __nv_double2ull_rz
+CONVERT2ULL(double)
+
+//-------- T __nv_float2ull_rd
+//-------- T __nv_float2ull_rn
+//-------- T __nv_float2ull_ru
+//-------- T __nv_float2ull_rz
+CONVERT2ULL(float)
+// Conversions from long: the source is always long, so only the destination type B varies.
+#define CONVERT4LLM(B,m,n) ATTR B __nv_ll2##B##_##m(long x) \
+    { return convert_##B##_##n(x); }
+
+#define CONVERT4LL(B) \
+    CONVERT4LLM(B, rd, rtn) \
+    CONVERT4LLM(B, rn, rte) \
+    CONVERT4LLM(B, ru, rtp) \
+    CONVERT4LLM(B, rz, rtz)
+
+//-------- T __nv_ll2double_rd
+//-------- T __nv_ll2double_rn
+//-------- T __nv_ll2double_ru
+//-------- T __nv_ll2double_rz
+CONVERT4LL(double)
+
+//-------- T __nv_ll2float_rd
+//-------- T __nv_ll2float_rn
+//-------- T __nv_ll2float_ru
+//-------- T __nv_ll2float_rz
+CONVERT4LL(float)
+// Conversions from ulong: the source is always ulong, so only the destination type B varies.
+#define CONVERT4ULLM(B,m,n) ATTR B __nv_ull2##B##_##m(ulong x) \
+    { return convert_##B##_##n(x); }
+
+#define CONVERT4ULL(B) \
+    CONVERT4ULLM(B, rd, rtn) \
+    CONVERT4ULLM(B, rn, rte) \
+    CONVERT4ULLM(B, ru, rtp) \
+    CONVERT4ULLM(B, rz, rtz)
+
+//-------- T __nv_ull2double_rd
+//-------- T __nv_ull2double_rn
+//-------- T __nv_ull2double_ru
+//-------- T __nv_ull2double_rz
+CONVERT4ULL(double)
+
+//-------- T __nv_ull2float_rd
+//-------- T __nv_ull2float_rn
+//-------- T __nv_ull2float_ru
+//-------- T __nv_ull2float_rz
+CONVERT4ULL(float)
+
diff --git a/cuda2gcn/src/float.cl b/cuda2gcn/src/float.cl
new file mode 100644
index 00000000..58c8a00b
--- /dev/null
+++ b/cuda2gcn/src/float.cl
@@ -0,0 +1,33 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+// Classification wrappers: each returns the int produced by the corresponding OpenCL relational builtin.
+//-------- T __nv_finitef
+ATTR int __nv_finitef(float x) { return isfinite(x); }
+
+//-------- T __nv_isfinited
+ATTR int __nv_isfinited(double x) { return isfinite(x); }
+
+//-------- T __nv_isinfd
+ATTR int __nv_isinfd(double x) { return isinf(x); }
+
+//-------- T __nv_isinff
+ATTR int __nv_isinff(float x) { return isinf(x); }
+
+//-------- T __nv_isnand
+ATTR int __nv_isnand(double x) { return isnan(x); }
+
+//-------- T __nv_isnanf
+ATTR int __nv_isnanf(float x) { return isnan(x); }
+
+//-------- T __nv_nan
+__attribute__((always_inline, pure)) double __nv_nan(char *tagp) { return __builtin_nan(tagp); } // pure, not const: the tag string behind tagp is read
+
+//-------- T __nv_nanf
+__attribute__((always_inline, pure)) float __nv_nanf(char *tagp) { return __builtin_nanf(tagp); } // float builtin, so the NaN is built in single precision
+
diff --git a/cuda2gcn/src/generic.cl b/cuda2gcn/src/generic.cl
new file mode 100644
index 00000000..c2a232c9
--- /dev/null
+++ b/cuda2gcn/src/generic.cl
@@ -0,0 +1,54 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+
+// Integer min/max use the OpenCL C builtins of the same name, which are
+// overloaded for int, uint, long and ulong; no helper macros are needed.
+
+//-------- T __nv_abs
+ATTR int __nv_abs(int x) { return abs(x); }
+
+//-------- T __nv_llabs
+ATTR long __nv_llabs(long x) { return abs(x); }
+
+//-------- T __nv_max
+ATTR int __nv_max(int a, int b) { return max(a, b); }
+
+//-------- T __nv_llmax
+ATTR long __nv_llmax(long a, long b) { return max(a, b); }
+
+//-------- T __nv_ullmax
+ATTR ulong __nv_ullmax(ulong a, ulong b) { return max(a, b); }
+
+//-------- T __nv_umax
+ATTR uint __nv_umax(uint a, uint b) { return max(a, b); }
+
+//-------- T __nv_min
+ATTR int __nv_min(int a, int b) { return min(a, b); }
+
+//-------- T __nv_llmin
+ATTR long __nv_llmin(long a, long b) { return min(a, b); }
+
+//-------- T __nv_ullmin
+ATTR ulong __nv_ullmin(ulong a, ulong b) { return min(a, b); }
+
+//-------- T __nv_umin
+ATTR uint __nv_umin(uint a, uint b) { return min(a, b); }
+
+//-------- T __nv_sad
+ATTR uint __nv_sad(int x, int y, uint z)
+{
+    return z + abs_diff(x, y); // abs_diff yields |x - y| as uint without the signed overflow of x - y
+}
+
+//-------- T __nv_usad
+ATTR uint __nv_usad(uint x, uint y, uint z)
+{
+    return z + abs_diff(x, y); // x - y wraps when x < y, and abs() of a uint cannot undo that
+}
+
diff --git a/cuda2gcn/src/half.cl b/cuda2gcn/src/half.cl
new file mode 100644
index 00000000..02a26529
--- /dev/null
+++ b/cuda2gcn/src/half.cl
@@ -0,0 +1,23 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_float2half_rn
+ATTR half __nv_float2half_rn(float x)
+{
+    return (half)x; // NOTE(review): relies on the cast rounding to nearest-even — confirm for the target
+}
+
+//-------- T __nv_half2float
+ATTR float __nv_half2float(half x)
+{
+    return (float)x; // widening conversion of the half value
+}
+
diff --git a/cuda2gcn/src/integer.cl b/cuda2gcn/src/integer.cl
new file mode 100644
index 00000000..58b8bf5a
--- /dev/null
+++ b/cuda2gcn/src/integer.cl
@@ -0,0 +1,29 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ockl.h"
+
+#define ATTR __attribute__((always_inline, const))
+// Each __nv_* wrapper below forwards its arguments unchanged to the __ockl_* routine named in its body.
+//-------- T __nv_mul24
+ATTR int __nv_mul24(int x, int y) { return __ockl_mul24_i32(x, y); }
+
+//-------- T __nv_umul24
+ATTR uint __nv_umul24(uint x, uint y) { return __ockl_mul24_u32(x, y); }
+
+//-------- T __nv_mul64hi
+ATTR long __nv_mul64hi(long x, long y) { return __ockl_mul_hi_i64(x,y); }
+
+//-------- T __nv_mulhi
+ATTR int __nv_mulhi(int x, int y) { return __ockl_mul_hi_i32(x,y); }
+
+//-------- T __nv_umul64hi
+ATTR ulong __nv_umul64hi(ulong x, ulong y) { return __ockl_mul_hi_u64(x,y); }
+
+//-------- T __nv_umulhi
+ATTR uint __nv_umulhi(uint x, uint y) { return __ockl_mul_hi_u32(x,y); }
+
diff --git a/cuda2gcn/src/math.cl b/cuda2gcn/src/math.cl
new file mode 100644
index 00000000..2c4eaf55
--- /dev/null
+++ b/cuda2gcn/src/math.cl
@@ -0,0 +1,354 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline))
+// FUNCnD / FUNCnF define the n-argument double / float wrapper __nv_<root>[f] as a call to __ocml_<root>_f64 / _f32.
+#define FUNC1D(root) \
+  ATTR double __nv_##root(double x) { return __ocml_##root##_f64(x); }
+#define FUNC1F(root) \
+  ATTR float __nv_##root##f(float x) { return __ocml_##root##_f32(x); }
+#define FUNC1(root) FUNC1D(root) FUNC1F(root)
+// Two-argument form; both parameters share the floating-point type of the result.
+#define FUNC2D(root) \
+  ATTR double __nv_##root(double x, double y) { return __ocml_##root##_f64(x, y); }
+#define FUNC2F(root) \
+  ATTR float __nv_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); }
+#define FUNC2(root) FUNC2D(root) FUNC2F(root)
+// Three-argument form (used for fma).
+#define FUNC3D(root) \
+  ATTR double __nv_##root(double x, double y, double z) { return __ocml_##root##_f64(x, y, z); }
+#define FUNC3F(root) \
+  ATTR float __nv_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); }
+#define FUNC3(root) FUNC3D(root) FUNC3F(root)
+// Each "//-------- T" marker names a symbol defined by the FUNCn(root) invocation that follows it.
+//-------- T __nv_acos
+//-------- T __nv_acosf
+FUNC1(acos)
+
+//-------- T __nv_acosh
+//-------- T __nv_acoshf
+FUNC1(acosh)
+
+//-------- T __nv_asin
+//-------- T __nv_asinf
+FUNC1(asin)
+
+//-------- T __nv_asinh
+//-------- T __nv_asinhf
+FUNC1(asinh)
+
+//-------- T __nv_atan
+//-------- T __nv_atanf
+FUNC1(atan)
+
+//-------- T __nv_atan2
+//-------- T __nv_atan2f
+FUNC2(atan2)
+
+//-------- T __nv_atanh
+//-------- T __nv_atanhf
+FUNC1(atanh)
+
+//-------- T __nv_cbrt
+//-------- T __nv_cbrtf
+FUNC1(cbrt)
+
+//-------- T __nv_ceil
+//-------- T __nv_ceilf
+FUNC1(ceil)
+
+//-------- T __nv_copysign
+//-------- T __nv_copysignf
+FUNC2(copysign)
+
+//-------- T __nv_cos
+//-------- T __nv_cosf
+FUNC1(cos)
+
+//-------- T __nv_cosh
+//-------- T __nv_coshf
+FUNC1(cosh)
+
+//-------- T __nv_cospi
+//-------- T __nv_cospif
+FUNC1(cospi)
+
+//-------- T __nv_erf
+//-------- T __nv_erff
+FUNC1(erf)
+
+//-------- T __nv_erfc
+//-------- T __nv_erfcf
+FUNC1(erfc)
+
+//-------- T __nv_erfcinv
+//-------- T __nv_erfcinvf
+FUNC1(erfcinv)
+
+//-------- T __nv_erfcx
+//-------- T __nv_erfcxf
+FUNC1(erfcx)
+
+//-------- T __nv_erfinv
+//-------- T __nv_erfinvf
+FUNC1(erfinv)
+
+//-------- T __nv_exp
+//-------- T __nv_expf
+FUNC1(exp)
+
+//-------- T __nv_exp10
+//-------- T __nv_exp10f
+FUNC1(exp10)
+
+//-------- T __nv_exp2
+//-------- T __nv_exp2f
+FUNC1(exp2)
+
+//-------- T __nv_expm1
+//-------- T __nv_expm1f
+FUNC1(expm1)
+
+//-------- T __nv_fabs
+//-------- T __nv_fabsf
+FUNC1(fabs)
+
+//-------- T __nv_fdim
+//-------- T __nv_fdimf
+FUNC2(fdim)
+
+//-------- T __nv_floor
+//-------- T __nv_floorf
+FUNC1(floor)
+
+//-------- T __nv_fma
+//-------- T __nv_fmaf
+FUNC3(fma)
+
+//-------- T __nv_fmax
+//-------- T __nv_fmaxf
+FUNC2(fmax)
+
+//-------- T __nv_fmin
+//-------- T __nv_fminf
+FUNC2(fmin)
+
+//-------- T __nv_fmod
+//-------- T __nv_fmodf
+FUNC2(fmod)
+
+//-------- T __nv_hypot
+//-------- T __nv_hypotf
+FUNC2(hypot)
+
+//-------- T __nv_j0
+//-------- T __nv_j0f
+FUNC1(j0)
+
+//-------- T __nv_j1
+//-------- T __nv_j1f
+FUNC1(j1)
+
+//-------- T __nv_lgamma
+//-------- T __nv_lgammaf
+FUNC1(lgamma)
+
+//-------- T __nv_log
+//-------- T __nv_logf
+FUNC1(log)
+
+//-------- T __nv_log10
+//-------- T __nv_log10f
+FUNC1(log10)
+
+//-------- T __nv_log1p
+//-------- T __nv_log1pf
+FUNC1(log1p)
+
+//-------- T __nv_log2
+//-------- T __nv_log2f
+FUNC1(log2)
+
+//-------- T __nv_logb
+//-------- T __nv_logbf
+FUNC1(logb)
+
+//-------- T __nv_pow
+//-------- T __nv_powf
+FUNC2(pow)
+
+//-------- T __nv_rcbrt
+//-------- T __nv_rcbrtf
+FUNC1(rcbrt)
+
+//-------- T __nv_remainder
+//-------- T __nv_remainderf
+FUNC2(remainder)
+
+//-------- T __nv_rhypot
+//-------- T __nv_rhypotf
+FUNC2(rhypot)
+
+//-------- T __nv_nearbyint
+//-------- T __nv_nearbyintf
+FUNC1(nearbyint)
+
+//-------- T __nv_nextafter
+//-------- T __nv_nextafterf
+FUNC2(nextafter)
+
+//-------- T __nv_rint
+//-------- T __nv_rintf
+FUNC1(rint)
+
+//-------- T __nv_round
+//-------- T __nv_roundf
+FUNC1(round)
+
+//-------- T __nv_rsqrt
+//-------- T __nv_rsqrtf
+FUNC1(rsqrt)
+
+//-------- T __nv_scalbn
+ATTR double __nv_scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); } // exponent is an int, as for ldexp; FUNC2 would declare it double
+//-------- T __nv_scalbnf
+ATTR float __nv_scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); } // exponent is an int, as for ldexpf
+//-------- T __nv_sin
+//-------- T __nv_sinf
+FUNC1(sin)
+
+//-------- T __nv_sinh
+//-------- T __nv_sinhf
+FUNC1(sinh)
+
+//-------- T __nv_sinpi
+//-------- T __nv_sinpif
+FUNC1(sinpi)
+
+//-------- T __nv_sqrt
+//-------- T __nv_sqrtf
+FUNC1(sqrt)
+
+//-------- T __nv_tan
+//-------- T __nv_tanf
+FUNC1(tan)
+
+//-------- T __nv_tanh
+//-------- T __nv_tanhf
+FUNC1(tanh)
+
+//-------- T __nv_tgamma
+//-------- T __nv_tgammaf
+FUNC1(tgamma)
+
+//-------- T __nv_trunc
+//-------- T __nv_truncf
+FUNC1(trunc)
+// y0/y1 complete the Bessel set started by j0/j1 earlier in this list; all four fit the FUNC1 shape.
+//-------- T __nv_y0
+//-------- T __nv_y0f
+FUNC1(y0)
+
+//-------- T __nv_y1
+//-------- T __nv_y1f
+FUNC1(y1)
+// The wrappers below are written out because their OCML name or signature does not fit the FUNCn pattern.
+//-------- T __nv_cyl_bessel_i0
+ATTR double __nv_cyl_bessel_i0(double x) { return __ocml_i0_f64(x); }
+
+//-------- T __nv_cyl_bessel_i0f
+ATTR float __nv_cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); }
+
+//-------- T __nv_cyl_bessel_i1
+ATTR double __nv_cyl_bessel_i1(double x) { return __ocml_i1_f64(x); }
+
+//-------- T __nv_cyl_bessel_i1f
+ATTR float __nv_cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); }
+
+//-------- T __nv_frexp
+ATTR double __nv_frexp(double x, __private int *ptr) { return __ocml_frexp_f64(x, ptr); }
+
+//-------- T __nv_frexpf
+ATTR float __nv_frexpf(float x, __private int *ptr) { return __ocml_frexp_f32(x, ptr); }
+
+//-------- T __nv_ilogb
+ATTR int __nv_ilogb(double x) { return __ocml_ilogb_f64(x); }
+
+//-------- T __nv_ilogbf
+ATTR int __nv_ilogbf(float x) { return __ocml_ilogb_f32(x); }
+
+//-------- T __nv_ldexp
+ATTR double __nv_ldexp(double x, int i) { return __ocml_ldexp_f64(x, i); }
+
+//-------- T __nv_ldexpf
+ATTR float __nv_ldexpf(float x, int i) { return __ocml_ldexp_f32(x, i); }
+
+//-------- T __nv_modf
+ATTR double __nv_modf(double x, __private double *ptr) { return __ocml_modf_f64(x, ptr); }
+
+//-------- T __nv_modff
+ATTR float __nv_modff(float x, __private float *ptr) { return __ocml_modf_f32(x, ptr); }
+
+//-------- T __nv_norm3d
+ATTR double __nv_norm3d(double x, double y, double z) { return __ocml_len3_f64(x,y,z); }
+
+//-------- T __nv_norm3df
+ATTR float __nv_norm3df(float x, float y, float z) { return __ocml_len3_f32(x,y,z); }
+
+//-------- T __nv_norm4d
+ATTR double __nv_norm4d(double a, double b, double c, double d) { return __ocml_len4_f64(a,b,c,d); }
+
+//-------- T __nv_norm4df
+ATTR float __nv_norm4df(float a, float b, float c, float d) { return __ocml_len4_f32(a,b,c,d); }
+
+//-------- T __nv_normcdf
+ATTR double __nv_normcdf(double x) { return __ocml_ncdf_f64(x); }
+
+//-------- T __nv_normcdff
+ATTR float __nv_normcdff(float x) { return __ocml_ncdf_f32(x); }
+
+//-------- T __nv_normcdfinv
+ATTR double __nv_normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }
+
+//-------- T __nv_normcdfinvf
+ATTR float __nv_normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }
+
+//-------- T __nv_powi
+ATTR double __nv_powi(double x, int n) { return __ocml_pown_f64(x, n); }
+
+//-------- T __nv_powif
+ATTR float __nv_powif(float x, int n) { return __ocml_pown_f32(x, n); }
+
+//-------- T __nv_remquo
+ATTR double __nv_remquo(double x, double y, __private int *ptr) { return __ocml_remquo_f64(x, y, ptr); }
+
+//-------- T __nv_remquof
+ATTR float __nv_remquof(float x, float y, __private int *ptr) { return __ocml_remquo_f32(x, y, ptr); }
+// saturatef clamps into [0, 1] by taking the max with 0 and then the min with 1.
+//-------- T __nv_saturatef
+ATTR float __nv_saturatef(float x) { return __ocml_min_f32(__ocml_max_f32(x, 0.0f), 1.0f); }
+
+//-------- T __nv_signbitd
+ATTR int __nv_signbitd(double x) { return __ocml_signbit_f64(x); }
+
+//-------- T __nv_signbitf
+ATTR int __nv_signbitf(float x) { return __ocml_signbit_f32(x); }
+// sincos-style wrappers: the OCML call's return value is stored through sptr and cptr is passed on for the second result.
+//-------- T __nv_sincos
+ATTR void __nv_sincos(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincos_f64(x, cptr); }
+
+//-------- T __nv_sincosf
+ATTR void __nv_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); }
+
+//-------- T __nv_sincospi
+ATTR void __nv_sincospi(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincospi_f64(x, cptr); }
+
+//-------- T __nv_sincospif
+ATTR void __nv_sincospif(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincospi_f32(x, cptr); }
+
diff --git a/cuda2gcn/src/precision.cl b/cuda2gcn/src/precision.cl
new file mode 100644
index 00000000..21a13d6e
--- /dev/null
+++ b/cuda2gcn/src/precision.cl
@@ -0,0 +1,56 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline))
+// FUNCnF defines __nv_fast_<root>f as a direct call to __ocml_<root>_f32.
+#define FUNC1F(root) \
+  ATTR float __nv_fast_##root##f(float x) { return __ocml_##root##_f32(x); }
+#define FUNC1(root) FUNC1F(root)
+// Two-argument form (used for pow).
+#define FUNC2F(root) \
+  ATTR float __nv_fast_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); }
+#define FUNC2(root) FUNC2F(root)
+// NOTE(review): FUNC3F/FUNC3 are never invoked in this file.
+#define FUNC3F(root) \
+  ATTR float __nv_fast_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); }
+#define FUNC3(root) FUNC3F(root)
+// NOTE(review): the "fast" entry points call the ordinary __ocml_*_f32 routines, not reduced-precision variants — confirm this is intended.
+//-------- T __nv_fast_cosf
+FUNC1(cos)
+
+//-------- T __nv_fast_exp10f
+FUNC1(exp10)
+
+//-------- T __nv_fast_expf
+FUNC1(exp)
+
+//-------- T __nv_fast_log10f
+FUNC1(log10)
+
+//-------- T __nv_fast_log2f
+FUNC1(log2)
+
+//-------- T __nv_fast_logf
+FUNC1(log)
+
+//-------- T __nv_fast_powf
+FUNC2(pow)
+
+//-------- T __nv_fast_sinf
+FUNC1(sin)
+
+//-------- T __nv_fast_tanf
+FUNC1(tan)
+
+//-------- T __nv_fast_fdividef
+ATTR float __nv_fast_fdividef(float x, float y) { return native_divide(x, y); }
+
+//-------- T __nv_fast_sincosf
+ATTR void __nv_fast_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); }
+
diff --git a/cuda2gcn/src/reinterpret.cl b/cuda2gcn/src/reinterpret.cl
new file mode 100644
index 00000000..0d55cded
--- /dev/null
+++ b/cuda2gcn/src/reinterpret.cl
@@ -0,0 +1,63 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+// Bit-pattern reinterpretations between same-sized integer and floating-point types via the as_<type> operators.
+//-------- T __nv_double_as_longlong
+ATTR long __nv_double_as_longlong(double x)
+{
+  return as_long(x);
+}
+
+//-------- T __nv_float_as_int
+ATTR int __nv_float_as_int(float x)
+{
+  return as_int(x);
+}
+
+//-------- T __nv_float_as_uint
+ATTR unsigned int __nv_float_as_uint(float x)
+{
+  return as_uint(x);
+}
+
+//-------- T __nv_int_as_float
+ATTR float __nv_int_as_float(int x)
+{
+  return as_float(x);
+}
+
+//-------- T __nv_longlong_as_double
+ATTR double __nv_longlong_as_double(long x)
+{
+  return as_double(x);
+}
+
+//-------- T __nv_uint_as_float
+ATTR float __nv_uint_as_float(unsigned int x)
+{
+  return as_float(x);
+}
+
+//-------- T __nv_double2hiint
+ATTR int __nv_double2hiint(double x)
+{
+    return (int)(as_ulong(x) >> 32); // shift the 64-bit pattern first, then narrow; the cast binds tighter than >>
+}
+
+//-------- T __nv_double2loint
+ATTR int __nv_double2loint(double x)
+{
+    return (int)as_long(x); // narrowing keeps the low 32 bits of the pattern
+}
+
+//-------- T __nv_hiloint2double
+ATTR double __nv_hiloint2double(int x, int y)
+{
+    return as_double(((ulong)(uint)x << 32) | (ulong)(uint)y); // zero-extend y so a negative low word cannot overwrite x
+}
+
diff --git a/cuda2gcn/src/rounding.cl b/cuda2gcn/src/rounding.cl
new file mode 100644
index 00000000..a377e39d
--- /dev/null
+++ b/cuda2gcn/src/rounding.cl
@@ -0,0 +1,23 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline, const))
+// Each wrapper rounds in floating point via OCML and then casts the result to long.
+//-------- T __nv_llrint
+ATTR long __nv_llrint(double x) { return (long)__ocml_rint_f64(x); }
+
+//-------- T __nv_llrintf
+ATTR long __nv_llrintf(float x) { return (long)__ocml_rint_f32(x); }
+
+//-------- T __nv_llround
+ATTR long __nv_llround(double x) { return (long)__ocml_round_f64(x); }
+
+//-------- T __nv_llroundf
+ATTR long __nv_llroundf(float x) { return (long)__ocml_round_f32(x); }
+
diff --git a/doc/OCKL.md b/doc/OCKL.md
new file mode 100644
index 00000000..169f511e
--- /dev/null
+++ b/doc/OCKL.md
@@ -0,0 +1,412 @@
+# OCKL User Guide
+
+* [Introduction](#introduction)
+  * [What Is OCKL](#what-is-ockl)
+* [Using OCKL](#using-ockl)
+  * [Standard Usage](#standard-usage)
+  * [Controls](#controls)
+* [Versioning](#versioning)
+* [Naming convention](#naming-convention)
+* [Supported functions](#supported-functions)
+
+
+## Introduction
+### What Is OCKL
+
+OCKL is an LLVM-IR bitcode library designed to provide access to certain hardware
+and compiler capabilities needed by language runtimes.  It should rarely be necessary
+to call any of these functions directly from application code.  Consider this library
+a "detail" layer.
+
+## Using OCKL
+### Standard Usage
+
+OCKL is expected to be used in a standard LLVM compilation flow as follows:
+  * Compile source modules to LLVM-IR bitcode (clang)
+  * Link together program bitcode with library bitcode including OCKL and OCLC.
+  * Run generic optimizations (opt)
+  * Code generation (llc)
+
+### Controls
+
+OCKL supports a number of controls that are provided by linking in specifically named inline
+functions.  These functions are inlined at optimization time and result in specific paths
+taken with no control flow overhead.  These functions all have the form (in C)
+
+    __attribute__((always_inline, const)) int
+    __oclc_control(void)
+    { return 1; } // or 0 to disable
+
+The currently supported controls are
+  * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced
+  * `unsafe_math_opt` - lower accuracy results may be produced with higher performance
+  * `daz_opt` - subnormal values consumed and produced may be flushed to zero
+  * `correctly_rounded_sqrt32` - float square root must be correctly rounded
+  * `ISA_version` - an integer representation of the ISA version of the target device
+
+### Versioning
+
+OCKL usually ships as a single LLVM-IR bitcode file named
+
+    ockl-{LLVM rev}-{OCKL rev}.bc
+
+where `{LLVM rev}` is the version of LLVM used to create the file, of the
+form X.Y, e.g. 3.8, and `{OCKL rev}` is the OCKL library version of the form X.Y, currently 0.9.
+
+### Naming convention
+
+OCKL functions follow a simple naming convention:
+
+    __ockl_{function}_{type suffix}
+
+where {type suffix} generally indicates the type of the arguments and/or returned result using a type letter,
+e.g. "u" for unsigned integer, and a bit width, e.g. 32.
+
+### Supported functions
+
+The following table lists the available functions along with a brief description of each:
+
+| **function** | **Brief Description** |
+| :--- | :--- |
+| `uchar __ockl_clz_u8(uchar);` | Count leading zeroes |
+| `ushort __ockl_clz_u16(ushort);` | |
+| `uint __ockl_clz_u32(uint);` | |
+| `ulong __ockl_clz_u64(ulong);` | |
+| - | |
+| `uchar __ockl_ctz_u8(uchar);` | Count trailing zeroes |
+| `ushort __ockl_ctz_u16(ushort);` | |
+| `uint __ockl_ctz_u32(uint);` | |
+| `ulong __ockl_ctz_u64(ulong);` | |
+| - | |
+| `uint __ockl_popcount_u32(uint);` | Count nonzero bits |
+| `ulong __ockl_popcount_u64(ulong);` | |
+| - | |
+| `int __ockl_add_sat_i32(int,int);` | Add with saturation |
+| `uint __ockl_add_sat_u32(uint,uint);` | |
+| `long __ockl_add_sat_i64(long,long);` | |
+| `ulong __ockl_add_sat_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_sub_sat_i32(int,int);` | Subtract with saturation |
+| `uint __ockl_sub_sat_u32(uint,uint);` | |
+| `long __ockl_sub_sat_i64(long,long);` | |
+| `ulong __ockl_sub_sat_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_mul_hi_i32(int,int);` | High part of multiplication |
+| `uint __ockl_mul_hi_u32(uint,uint);` | |
+| `long __ockl_mul_hi_i64(long,long);` | |
+| `ulong __ockl_mul_hi_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_mul24_i32(int,int);` | Multiply assuming operands fit in 24 bits |
+| `uint __ockl_mul24_u32(uint,uint);` | |
+| - | |
+| `uint __ockl_activelane_u32(void);` | Index of the current lane, counting only active lanes in the wavefront |
+| - | |
+| `half __ockl_wfred_add_f16(half x);` | ADD reduction across wavefront |
+| `float __ockl_wfred_add_f32(float x);` | |
+| `double __ockl_wfred_add_f64(double x);` | |
+| `int __ockl_wfred_add_i32(int x);` | |
+| `long __ockl_wfred_add_i64(long x);` | |
+| `uint __ockl_wfred_add_u32(uint x);` | |
+| `ulong __ockl_wfred_add_u64(ulong x);` | |
+| `int __ockl_wfred_and_i32(int x);` | AND reduction across wavefront |
+| `long __ockl_wfred_and_i64(long x);` | |
+| `uint __ockl_wfred_and_u32(uint x);` | |
+| `ulong __ockl_wfred_and_u64(ulong x);` | |
+| `half __ockl_wfred_max_f16(half x);` | MAX reduction across wavefront |
+| `float __ockl_wfred_max_f32(float x);` | |
+| `double __ockl_wfred_max_f64(double x);` | |
+| `int __ockl_wfred_max_i32(int x);` | |
+| `long __ockl_wfred_max_i64(long x);` | |
+| `uint __ockl_wfred_max_u32(uint x);` | |
+| `ulong __ockl_wfred_max_u64(ulong x);` | |
+| `half __ockl_wfred_min_f16(half x);` | MIN reduction across wavefront |
+| `float __ockl_wfred_min_f32(float x);` | |
+| `double __ockl_wfred_min_f64(double x);` | |
+| `int __ockl_wfred_min_i32(int x);` | |
+| `long __ockl_wfred_min_i64(long x);` | |
+| `uint __ockl_wfred_min_u32(uint x);` | |
+| `ulong __ockl_wfred_min_u64(ulong x);` | |
+| `int __ockl_wfred_or_i32(int x);` | OR reduction across wavefront |
+| `long __ockl_wfred_or_i64(long x);` | |
+| `uint __ockl_wfred_or_u32(uint x);` | |
+| `ulong __ockl_wfred_or_u64(ulong x);` | |
+| `int __ockl_wfred_xor_i32(int x);` | XOR reduction across wavefront |
+| `long __ockl_wfred_xor_i64(long x);` | |
+| `uint __ockl_wfred_xor_u32(uint x);` | |
+| `ulong __ockl_wfred_xor_u64(ulong x);` | |
+| `half __ockl_wfscan_add_f16(half x, bool inclusive);` | ADD scan across wavefront |
+| `float __ockl_wfscan_add_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_add_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_add_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_add_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_add_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_add_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_and_i32(int x, bool inclusive);` | AND scan across wavefront |
+| `long __ockl_wfscan_and_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_and_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_and_u64(ulong x, bool inclusive);` | |
+| `half __ockl_wfscan_max_f16(half x, bool inclusive);` | MAX scan across wavefront |
+| `float __ockl_wfscan_max_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_max_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_max_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_max_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_max_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_max_u64(ulong x, bool inclusive);` | |
+| `half __ockl_wfscan_min_f16(half x, bool inclusive);` | MIN scan across wavefront |
+| `float __ockl_wfscan_min_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_min_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_min_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_min_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_min_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_min_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_or_i32(int x, bool inclusive);` | OR scan across wavefront |
+| `long __ockl_wfscan_or_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_or_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_or_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_xor_i32(int x, bool inclusive);` | XOR scan across wavefront |
+| `long __ockl_wfscan_xor_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_xor_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_xor_u64(ulong x, bool inclusive);` | |
+| `uint __ockl_wfbcast_u32(uint x, uint i);` | Broadcast to wavefront |
+| `ulong __ockl_wfbcast_u64(ulong x, uint i);` | |
+| - | |
+| `bool __ockl_wfany_i32(int e);` | Detect any nonzero across wavefront |
+| `bool __ockl_wfall_i32(int e);` | Detect all nonzero across wavefront |
+| `bool __ockl_wfsame_i32(int e);` | Detect same across wavefront  |
+| - | |
+| `uint __ockl_bfm_u32(uint,uint);` | Bit field mask |
+| `int __ockl_bfe_i32(int, uint, uint);` | Bit field extract |
+| `uint __ockl_bfe_u32(uint,uint,uint);` | |
+| `uint __ockl_bitalign_u32(uint,uint,uint);` | Align on bit boundary |
+| `uint __ockl_bytealign_u32(uint,uint,uint);` | Align on byte boundary |
+| `uint __ockl_lerp_u32(uint,uint,uint);` | Add each byte with prescribed carry |
+| `float __ockl_max3_f32(float,float,float);` | Max of 3 |
+| `half __ockl_max3_f16(half,half,half);` | |
+| `int __ockl_max3_i32(int,int,int);` | |
+| `uint __ockl_max3_u32(uint,uint,uint);` | |
+| `float __ockl_median3_f32(float,float,float);` | Median of 3 |
+| `half __ockl_median3_f16(half,half,half);` | |
+| `int __ockl_median3_i32(int,int,int);` | |
+| `uint __ockl_median3_u32(uint,uint,uint);` | |
+| `float __ockl_min3_f32(float,float,float);` | Min of 3 |
+| `half __ockl_min3_f16(half,half,half);` | |
+| `int __ockl_min3_i32(int,int,int);` | |
+| `uint __ockl_min3_u32(uint,uint,uint);` | |
+| `ulong __ockl_mqsad_u64(ulong, uint, ulong);` | Masked rolling SAD |
+| `uint __ockl_pack_u32(float4);` | Pack vector to bytes |
+| `ulong __ockl_qsad_u64(ulong, uint, ulong);` | Rolling SAD |
+| `uint __ockl_msad_u32(uint,uint,uint);` | Masked SAD |
+| `uint __ockl_sad_u32(uint,uint,uint);` | SAD |
+| `uint __ockl_sadd_u32(uint,uint,uint);` | 32-bit SAD |
+| `uint __ockl_sadhi_u32(uint,uint,uint);` | SAD accumulating to high half |
+| `uint __ockl_sadw_u32(uint,uint,uint);` | 16-bit SAD |
+| `float __ockl_unpack0_f32(uint);` | Extract byte and convert to float |
+| `float __ockl_unpack1_f32(uint);` | |
+| `float __ockl_unpack2_f32(uint);` | |
+| `float __ockl_unpack3_f32(uint);` | |
+| - | |
+| `float4 __ockl_image_load_1D(TSHARP i, int c);` | Load from 1D image |
+| `float4 __ockl_image_load_1Da(TSHARP i, int2 c);` | Load from 1D image array |
+| `float4 __ockl_image_load_1Db(TSHARP i, int c);` | Load from 1D buffered image |
+| `float4 __ockl_image_load_2D(TSHARP i, int2 c);` | Load from 2D image |
+| `float4 __ockl_image_load_2Da(TSHARP i, int4 c);` | Load from 2D image array |
+| `float __ockl_image_load_2Dad(TSHARP i, int4 c);` | Load from 2D depth image array |
+| `float __ockl_image_load_2Dd(TSHARP i, int2 c);` | Load from 2D depth image |
+| `float4 __ockl_image_load_3D(TSHARP i, int4 c);` | Load from 3D image |
+| `float4 __ockl_image_load_CM(TSHARP i, int2 c, int f);` | Load from cubemap |
+| `float4 __ockl_image_load_CMa(TSHARP i, int4 c, int f);` | Load from cubemap array |
+| - | |
+| `float4 __ockl_image_load_mip_1D(TSHARP i, int c, int l);` | Load from mipmapped image |
+| `float4 __ockl_image_load_mip_1Da(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_2D(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_2Da(TSHARP i, int4 c, int l);` | |
+| `float __ockl_image_load_mip_2Dad(TSHARP i, int4 c, int l);` | |
+| `float __ockl_image_load_mip_2Dd(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_3D(TSHARP i, int4 c, int l);` | |
+| `float4 __ockl_image_load_mip_CM(TSHARP i, int2 c, int f, int l);` | |
+| `float4 __ockl_image_load_mip_CMa(TSHARP i, int4 c, int f, int l);` | |
+| - | |
+| `half4 __ockl_image_loadh_1D(TSHARP i, int c);` | Load from image returning half precision |
+| `half4 __ockl_image_loadh_1Da(TSHARP i, int2 c);` | |
+| `half4 __ockl_image_loadh_1Db(TSHARP i, int c);` | |
+| `half4 __ockl_image_loadh_2D(TSHARP i, int2 c);` | |
+| `half4 __ockl_image_loadh_2Da(TSHARP i, int4 c);` | |
+| `half4 __ockl_image_loadh_3D(TSHARP i, int4 c);` | |
+| `half4 __ockl_image_loadh_CM(TSHARP i, int2 c, int f);` | |
+| `half4 __ockl_image_loadh_CMa(TSHARP i, int4 c, int f);` | |
+| `half4 __ockl_image_loadh_mip_1D(TSHARP i, int c, int l);` | |
+| `half4 __ockl_image_loadh_mip_1Da(TSHARP i, int2 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_2D(TSHARP i, int2 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_2Da(TSHARP i, int4 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_3D(TSHARP i, int4 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_CM(TSHARP i, int2 c, int f, int l);` | |
+| `half4 __ockl_image_loadh_mip_CMa(TSHARP i, int4 c, int f, int l);` | |
+| - | |
+| `void __ockl_image_store_1D(TSHARP i, int c, float4 p);` | Store to image |
+| `void __ockl_image_store_1Da(TSHARP i, int2 c, float4 p);` | |
+| `void __ockl_image_store_1Db(TSHARP i, int c, float4 p);` | |
+| `void __ockl_image_store_2D(TSHARP i, int2 c, float4 p);` | |
+| `void __ockl_image_store_2Da(TSHARP i, int4 c, float4 p);` | |
+| `void __ockl_image_store_2Dad(TSHARP i, int4 c, float p);` | |
+| `void __ockl_image_store_2Dd(TSHARP i, int2 c, float p);` | |
+| `void __ockl_image_store_3D(TSHARP i, int4 c, float4 p);` | |
+| `void __ockl_image_store_CM(TSHARP i, int2 c, int f, float4 p);` | |
+| `void __ockl_image_store_CMa(TSHARP i, int4 c, int f, float4 p);` | |
+| - | |
+| `void __ockl_image_store_lod_1D(TSHARP i, int c, int l, float4 p);` | Store to level of mipmapped image |
+| `void __ockl_image_store_lod_1Da(TSHARP i, int2 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2D(TSHARP i, int2 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2Da(TSHARP i, int4 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2Dad(TSHARP i, int4 c, int l, float p);` | |
+| `void __ockl_image_store_lod_2Dd(TSHARP i, int2 c, int l, float p);` | |
+| `void __ockl_image_store_lod_3D(TSHARP i, int4 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_CM(TSHARP i, int2 c, int f, int l, float4 p);` | |
+| `void __ockl_image_store_lod_CMa(TSHARP i, int4 c, int f, int l, float4 p);` | |
+| - | |
+| `void __ockl_image_storeh_1D(TSHARP i, int c, half4 p);` | Store half precision pixel to image|
+| `void __ockl_image_storeh_1Da(TSHARP i, int2 c, half4 p);` | |
+| `void __ockl_image_storeh_1Db(TSHARP i, int c, half4 p);` | |
+| `void __ockl_image_storeh_2D(TSHARP i, int2 c, half4 p);` | |
+| `void __ockl_image_storeh_2Da(TSHARP i, int4 c, half4 p);` | |
+| `void __ockl_image_storeh_3D(TSHARP i, int4 c, half4 p);` | |
+| `void __ockl_image_storeh_CM(TSHARP i, int2 c, int f, half4 p);` | |
+| `void __ockl_image_storeh_CMa(TSHARP i, int4 c, int f, half4 p);` | |
+| - | |
+| `void __ockl_image_storeh_lod_1D(TSHARP i, int c, int l, half4 p);` | Store half precision pixel to level of mipmapped image |
+| `void __ockl_image_storeh_lod_1Da(TSHARP i, int2 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_2D(TSHARP i, int2 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_2Da(TSHARP i, int4 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_3D(TSHARP i, int4 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_CM(TSHARP i, int2 c, int f, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_CMa(TSHARP i, int4 c, int f, int l, half4 p);` | |
+| - | |
+| `float4 __ockl_image_sample_1D(TSHARP i, SSHARP s, float c);` | Sample image |
+| `float4 __ockl_image_sample_1Da(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_2Da(TSHARP i, SSHARP s, float4 c);` | |
+| `float __ockl_image_sample_2Dad(TSHARP i, SSHARP s, float4 c);` | |
+| `float __ockl_image_sample_2Dd(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_3D(TSHARP i, SSHARP s, float4 c);` | |
+| `float4 __ockl_image_sample_CM(TSHARP i, SSHARP s, float4 c);` | |
+| `float4 __ockl_image_sample_CMa(TSHARP i, SSHARP s, float4 c);` | |
+| - | |
+| `float4 __ockl_image_sample_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient |
+| `float4 __ockl_image_sample_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | |
+| `float4 __ockl_image_sample_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `float4 __ockl_image_sample_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `float __ockl_image_sample_grad_2Dad(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `float __ockl_image_sample_grad_2Dd(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `float4 __ockl_image_sample_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | |
+| - | |
+| `float4 __ockl_image_sample_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD |
+| `float4 __ockl_image_sample_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float __ockl_image_sample_lod_2Dad(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float __ockl_image_sample_lod_2Dd(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float4 __ockl_image_sample_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float4 __ockl_image_sample_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | |
+| - | |
+| `half4 __ockl_image_sampleh_1D(TSHARP i, SSHARP s, float c);` | Sample image returning half precision |
+| `half4 __ockl_image_sampleh_1Da(TSHARP i, SSHARP s, float2 c);` | |
+| `half4 __ockl_image_sampleh_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `half4 __ockl_image_sampleh_2Da(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_3D(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_CM(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_CMa(TSHARP i, SSHARP s, float4 c);` | |
+| - | |
+| `half4 __ockl_image_sampleh_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient returning half precision |
+| `half4 __ockl_image_sampleh_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | |
+| `half4 __ockl_image_sampleh_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `half4 __ockl_image_sampleh_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `half4 __ockl_image_sampleh_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | |
+| - | |
+| `half4 __ockl_image_sampleh_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD returning half precision |
+| `half4 __ockl_image_sampleh_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | |
+| - | |
+| `float4 __ockl_image_gather4r_2D(TSHARP i, SSHARP s, float2 c);` | Gather 2x2 channel from image |
+| `float4 __ockl_image_gather4g_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_gather4b_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_gather4a_2D(TSHARP i, SSHARP s, float2 c);` | |
+| - | |
+| `int __ockl_image_array_size_1Da(TSHARP i);` | Get image array size |
+| `int __ockl_image_array_size_2Da(TSHARP i);` | |
+| `int __ockl_image_array_size_2Dad(TSHARP i);` | |
+| `int __ockl_image_array_size_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_channel_data_type_1D(TSHARP i);` | Get image channel data type |
+| `int __ockl_image_channel_data_type_1Da(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_1Db(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2D(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Da(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Dad(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Dd(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_3D(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_CM(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_channel_order_1D(TSHARP i);` | Get image channel order |
+| `int __ockl_image_channel_order_1Da(TSHARP i);` | |
+| `int __ockl_image_channel_order_1Db(TSHARP i);` | |
+| `int __ockl_image_channel_order_2D(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Da(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Dad(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Dd(TSHARP i);` | |
+| `int __ockl_image_channel_order_3D(TSHARP i);` | |
+| `int __ockl_image_channel_order_CM(TSHARP i);` | |
+| `int __ockl_image_channel_order_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_depth_3D(TSHARP i);` | Get 3D image depth |
+| - | |
+| `int __ockl_image_height_2D(TSHARP i);` | Get image height |
+| `int __ockl_image_height_2Da(TSHARP i);` | |
+| `int __ockl_image_height_2Dad(TSHARP i);` | |
+| `int __ockl_image_height_2Dd(TSHARP i);` | |
+| `int __ockl_image_height_3D(TSHARP i);` | |
+| `int __ockl_image_height_CM(TSHARP i);` | |
+| `int __ockl_image_height_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_num_mip_levels_1D(TSHARP i);` | Get number of levels in mipmapped image |
+| `int __ockl_image_num_mip_levels_1Da(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2D(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Da(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Dad(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Dd(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_3D(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_CM(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_width_1D(TSHARP i);` | Get image width |
+| `int __ockl_image_width_1Da(TSHARP i);` | |
+| `int __ockl_image_width_1Db(TSHARP i);` | |
+| `int __ockl_image_width_2D(TSHARP i);` | |
+| `int __ockl_image_width_2Da(TSHARP i);` | |
+| `int __ockl_image_width_2Dad(TSHARP i);` | |
+| `int __ockl_image_width_2Dd(TSHARP i);` | |
+| `int __ockl_image_width_3D(TSHARP i);` | |
+| `int __ockl_image_width_CM(TSHARP i);` | |
+| `int __ockl_image_width_CMa(TSHARP i);` | |
+| - | |
+| `size_t __ockl_get_global_offset(uint);` | Get grid global offset (OpenCL) of dimension |
+| `size_t __ockl_get_global_id(uint);` | Get workitem global ID of dimension |
+| `size_t __ockl_get_local_id(uint);` | Get workitem local ID of dimension |
+| `size_t __ockl_get_group_id(uint);` | Get ID of group workitem resides in of dimension |
+| `size_t __ockl_get_global_size(uint);` | Get global size of dimension |
+| `size_t __ockl_get_local_size(uint);` | Get local size of dimension |
+| `size_t __ockl_get_num_groups(uint);` | Get number of groups in dimension |
+| `uint __ockl_get_work_dim(void);` | Get grid number of dimensions |
+| `size_t __ockl_get_enqueued_local_size(uint);` | Get enqueued local size of dimension |
+| `size_t __ockl_get_global_linear_id(void);` | Get global linear ID of workitem|
+| `size_t __ockl_get_local_linear_id(void);` | Get local linear ID of workitem |
+| - | |
+| `bool __ockl_is_local_addr(const void *);` | Test if generic address is local |
+| `bool __ockl_is_private_addr(const void *);` | Test if generic address is private |
+| `__global void * __ockl_to_global(void *);` | Convert generic address to global address |
+| `__local void * __ockl_to_local(void *);` | Convert generic address to local address |
+| `__private void * __ockl_to_private(void *);` | Convert generic address to private address |
diff --git a/hc/CMakeLists.txt b/hc/CMakeLists.txt
index 6c4eb4e7..e7440e4f 100644
--- a/hc/CMakeLists.txt
+++ b/hc/CMakeLists.txt
@@ -24,7 +24,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs
diff --git a/hc/src/hc_kernel.cl b/hc/src/hc_kernel.cl
index 50e8e07d..3bbd1c77 100644
--- a/hc/src/hc_kernel.cl
+++ b/hc/src/hc_kernel.cl
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((always_inline, const))
 #define ATTR2 __attribute__((always_inline))
 
-ATTR long
+ATTR uint
 amp_get_global_id(int dim)
 {
   __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr();
@@ -37,70 +37,138 @@ amp_get_global_id(int dim)
   return (g*s + l);
 }
 
-ATTR long
+ATTR uint
 amp_get_global_size(int dim)
 {
-  return __ockl_get_global_size(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); // dispatch packet that carries the grid sizes
+
+    switch(dim) { // dim picks the x (0), y (1) or z (2) grid size field
+    case 0:
+        return p->grid_size_x;
+    case 1:
+        return p->grid_size_y;
+    case 2:
+        return p->grid_size_z;
+    default:
+        return 1; // any other dim value is reported as a size of 1
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_local_id(int dim)
 {
-  return __ockl_get_local_id(dim);
+    switch(dim) { // dim picks the x (0), y (1) or z (2) workitem-id intrinsic
+    case 0:
+        return __llvm_amdgcn_workitem_id_x();
+    case 1:
+        return __llvm_amdgcn_workitem_id_y();
+    case 2:
+        return __llvm_amdgcn_workitem_id_z();
+    default:
+        return 0; // any other dim value is reported as id 0
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_num_groups(int dim)
 {
-  return __ockl_get_num_groups(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); // dispatch packet that carries grid and workgroup sizes
+
+    uint n, d; // n = grid size, d = workgroup size along the requested dimension
+    switch(dim) {
+    case 0:
+        n = p->grid_size_x;
+        d = p->workgroup_size_x;
+        break;
+    case 1:
+        n = p->grid_size_y;
+        d = p->workgroup_size_y;
+        break;
+    case 2:
+        n = p->grid_size_z;
+        d = p->workgroup_size_z;
+        break;
+    default:
+        n = 1; // any other dim value: report exactly one group
+        d = 1;
+        break;
+    }
+
+    return n / d + (n % d != 0); // round up: plain n / d dropped a trailing partial workgroup when n is not a multiple of d
 }
 
-ATTR long
+ATTR uint
 amp_get_group_id(int dim)
 {
-  return __ockl_get_group_id(dim);
+    switch(dim) { // dim picks the x (0), y (1) or z (2) workgroup-id intrinsic
+    case 0:
+        return __llvm_amdgcn_workgroup_id_x();
+    case 1:
+        return __llvm_amdgcn_workgroup_id_y();
+    case 2:
+        return __llvm_amdgcn_workgroup_id_z();
+    default:
+        return 0; // any other dim value is reported as id 0
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_local_size(int dim)
 {
-  return __ockl_get_local_size(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); // dispatch packet that carries the workgroup sizes
+    uint d; // workgroup size along the requested dimension
+
+    switch(dim) {
+    case 0:
+        d = p->workgroup_size_x;
+        break;
+    case 1:
+        d = p->workgroup_size_y;
+        break;
+    case 2:
+        d = p->workgroup_size_z;
+        break;
+    default:
+        d = 1; // any other dim value is reported as a size of 1
+        break;
+    }
+    return d; // NOTE(review): packet value returned as-is, a partial last workgroup is not shrunk - confirm vs. the replaced __ockl_get_local_size
 }
 
-ATTR long
+ATTR uint
 hc_get_grid_size(int dim)
 {
-  return __ockl_get_global_size(dim);
+    return amp_get_global_size(dim); // hc_ name forwards to amp_get_global_size
 }
 
-ATTR long
+ATTR uint
 hc_get_workitem_absolute_id(int dim)
 {
-  return amp_get_global_id(dim);
+    return amp_get_global_id(dim); // hc_ name forwards to amp_get_global_id
 }
 
-ATTR long
+ATTR uint
 hc_get_workitem_id(int dim)
 {
-  return __ockl_get_local_id(dim);
+    return amp_get_local_id(dim); // hc_ name forwards to amp_get_local_id
 }
 
-ATTR long
+ATTR uint
 hc_get_num_groups(int dim)
 {
-  return __ockl_get_num_groups(dim);
+    return amp_get_num_groups(dim); // hc_ name forwards to amp_get_num_groups
 }
 
-ATTR long
+ATTR uint
 hc_get_group_id(int dim)
 {
-  return __ockl_get_group_id(dim);
+    return amp_get_group_id(dim); // hc_ name forwards to amp_get_group_id
 }
 
-ATTR long
+ATTR uint
 hc_get_group_size(int dim)
 {
-  return __ockl_get_local_size(dim);
+    return amp_get_local_size(dim); // hc_ name forwards to amp_get_local_size
 }
 
 ATTR2 void
diff --git a/irif/CMakeLists.txt b/irif/CMakeLists.txt
index 37e89dca..883240f6 100644
--- a/irif/CMakeLists.txt
+++ b/irif/CMakeLists.txt
@@ -20,7 +20,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB srcs
diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index ec91d9f1..9121bb39 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -120,11 +120,15 @@ extern bool __llvm_umul_with_overflow_i32(uint, uint, __private uint*);
 extern bool __llvm_smul_with_overflow_i64(long, long, __private long*);
 extern bool __llvm_umul_with_overflow_i64(ulong, ulong, __private ulong*);
 
-extern __attribute__((const)) int __llvm_ctlz_i32(int);
-extern __attribute__((const)) int __llvm_ctlz_i64(long);
+extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar);
+extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort);
+extern __attribute__((const)) uint __llvm_ctlz_i32(uint);
+extern __attribute__((const)) ulong __llvm_ctlz_i64(ulong);
 
-extern __attribute__((const)) int __llvm_cttz_i32(int);
-extern __attribute__((const)) int __llvm_cttz_i64(long);
+extern __attribute__((const)) uchar __llvm_cttz_i8(uchar);
+extern __attribute__((const)) ushort __llvm_cttz_i16(ushort);
+extern __attribute__((const)) uint __llvm_cttz_i32(uint);
+extern __attribute__((const)) ulong __llvm_cttz_i64(ulong);
 
 // Fence intrinsics
 extern void __llvm_fence_acq_wi(void);
@@ -197,6 +201,80 @@ extern ulong __llvm_cmpxchg_a1_x_x_dev_i64(__global ulong *, ulong, ulong);
 extern uint __llvm_cmpxchg_a3_x_x_wg_i32(__local uint *, uint, uint);
 extern ulong __llvm_cmpxchg_a3_x_x_wg_i64(__local ulong *, ulong, ulong);
 
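+// Constrained floating point: __llvm_{add,sub,mul,div,sqrt,fma}_{rte,rtn,rtp,rtz}_{f16,f32,f64}, one declaration per op / rounding suffix / type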
+extern __attribute__((const)) half __llvm_add_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_add_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_add_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_sub_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_sub_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_sub_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_mul_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_mul_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_mul_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_div_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_div_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_div_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_sqrt_rte_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtn_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtp_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtz_f16(half);
+extern __attribute__((const)) float __llvm_sqrt_rte_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtn_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtp_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtz_f32(float);
+extern __attribute__((const)) double __llvm_sqrt_rte_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtn_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtp_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtz_f64(double);
+extern __attribute__((const)) half __llvm_fma_rte_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtn_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtp_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtz_f16(half, half, half);
+extern __attribute__((const)) float __llvm_fma_rte_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtn_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtp_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtz_f32(float, float, float);
+extern __attribute__((const)) double __llvm_fma_rte_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtn_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtp_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtz_f64(double, double, double);
+
 // AMDGPU intrinsics
 extern __attribute__((const)) bool __llvm_amdgcn_class_f16(half, int) __asm("llvm.amdgcn.class.f16");
 extern __attribute__((const)) bool __llvm_amdgcn_class_f32(float, int) __asm("llvm.amdgcn.class.f32");
@@ -275,9 +353,9 @@ extern void __llvm_amdcgn_buffer_wbinvl1_vol(void) __asm("llvm.amdgcn.buffer.wbi
 extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_lo(uint, uint) __asm("llvm.amdgcn.mbcnt.lo");
 extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_hi(uint, uint) __asm("llvm.amdgcn.mbcnt.hi");
 
-extern ulong __llvm_amdgcn_read_exec(void);
-extern uint __llvm_amdgcn_read_exec_lo(void);
-extern uint __llvm_amdgcn_read_exec_hi(void);
+extern __attribute__((convergent)) ulong __llvm_amdgcn_read_exec(void); // reads the "exec" register; body in irif/src/reg.ll
+extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_lo(void); // reads "exec_lo"; body in irif/src/reg.ll
+extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_hi(void); // presumably reads "exec_hi" (metadata !2 in reg.ll) - TODO confirm
 
 extern uint __llvm_amdgcn_s_getreg(uint) __asm("llvm.amdgcn.s.getreg");
 
@@ -387,12 +465,12 @@ extern void __llvm_amdgcn_image_store_mip_f32_v4i32(float p, int4 c, uint8 t, ui
     __asm("llvm.amdgcn.image.store.mip.f32.v4i32.v8i32");
 
 // Image Sample: Only expose 8 word T# and a few of the other combinations
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.f32.v8i32");
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.v2f32.v8i32");
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.v4f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.v2f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32");
 
 extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f32.v2f32.v8i32");
@@ -406,12 +484,12 @@ extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v8f32(flo
 extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v16f32(float16 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f32.v16f32.v8i32");
 
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.f32.v8i32");
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.v2f32.v8i32");
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.v4f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.v2f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.v4f32.v8i32");
 
 extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f16.v2f32.v8i32");
@@ -426,10 +504,10 @@ extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_v4f16_v16f32(flo
     __asm("llvm.amdgcn.image.sample.l.v4f16.v16f32.v8i32");
 
 // depth image sample
-extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.f32.v2f32.v8i32");
-extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.f32.v4f32.v8i32");
+extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.f32.v2f32.v8i32");
+extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.f32.v4f32.v8i32");
 
 extern __attribute__((pure)) float __llvm_amdgcn_image_sample_l_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.f32.v4f32.v8i32");
@@ -440,8 +518,8 @@ extern __attribute__((pure)) float __llvm_amdgcn_image_sample_d_f32_v16f32(float
     __asm("llvm.amdgcn.image.sample.l.f32.v16f32.v8i32");
 
 // image fetch
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-        __asm("llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+        __asm("llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32");
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif // IRIF_H
diff --git a/irif/src/cz.ll b/irif/src/cz.ll
index af16d51c..bd9194c7 100644
--- a/irif/src/cz.ll
+++ b/irif/src/cz.ll
@@ -18,25 +18,45 @@ declare i16 @llvm.cttz.i16(i16, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
 declare i64 @llvm.cttz.i64(i64, i1)
 
+define i8 @__llvm_ctlz_i8(i8) #0 { ; thin wrapper over llvm.ctlz.i8
+    %2 = call i8 @llvm.ctlz.i8(i8 %0, i1 0) ; flag i1 0: a zero input still yields a defined result
+    ret i8 %2
+}
+
+define i16 @__llvm_ctlz_i16(i16) #0 { ; thin wrapper over llvm.ctlz.i16
+    %2 = call i16 @llvm.ctlz.i16(i16 %0, i1 0) ; flag i1 0: a zero input still yields a defined result
+    ret i16 %2
+}
+
 define i32 @__llvm_ctlz_i32(i32) #0 {
-    %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 1)
+    %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 0) ; flag flipped from i1 1 to i1 0: a zero input now yields a defined result
     ret i32 %2
 }
 
 define i64 @__llvm_ctlz_i64(i64) #0 {
-    %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 1)
+    %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 0) ; flag flipped from i1 1 to i1 0: a zero input now yields a defined result
     ret i64 %2
 }
 
+define i8 @__llvm_cttz_i8(i8) #0 { ; thin wrapper over llvm.cttz.i8
+    %2 = call i8 @llvm.cttz.i8(i8 %0, i1 0) ; flag i1 0: a zero input still yields a defined result
+    ret i8 %2
+}
+
+define i16 @__llvm_cttz_i16(i16) #0 { ; thin wrapper over llvm.cttz.i16
+    %2 = call i16 @llvm.cttz.i16(i16 %0, i1 0) ; flag i1 0: a zero input still yields a defined result
+    ret i16 %2
+}
+
 define i32 @__llvm_cttz_i32(i32) #0 {
-    %2 = call i32 @llvm.cttz.i32(i32 %0, i1 1)
+    %2 = call i32 @llvm.cttz.i32(i32 %0, i1 0) ; flag flipped from i1 1 to i1 0: a zero input now yields a defined result
     ret i32 %2
 }
 
 define i64 @__llvm_cttz_i64(i64) #0 {
-    %2 = call i64 @llvm.cttz.i64(i64 %0, i1 1)
+    %2 = call i64 @llvm.cttz.i64(i64 %0, i1 0) ; flag flipped from i1 1 to i1 0: a zero input now yields a defined result
     ret i64 %2
 }
 
-attributes #0 = { alwaysinline argmemonly norecurse nounwind readnone }
+attributes #0 = { alwaysinline norecurse nounwind readnone } ; used by every ctlz/cttz wrapper in this hunk; argmemonly dropped
 
diff --git a/irif/src/fence.ll b/irif/src/fence.ll
index 14f04b03..0bcaaaa9 100644
--- a/irif/src/fence.ll
+++ b/irif/src/fence.ll
@@ -1,27 +1,23 @@
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "amdgcn--amdhsa"
 
-;;
-;; syncscope number mapping is in llvm/target/AMDGPU/AMDGPU.h class AMDGPUSynchronizationScope
-;;
-
 define void @__llvm_fence_acq_wi() local_unnamed_addr #0 {
-  fence syncscope(5) acquire
+  fence syncscope("singlethread") acquire ; named scope replaces numeric syncscope(5)
   ret void
 }
 
 define void @__llvm_fence_acq_sg() local_unnamed_addr #0 {
-  fence syncscope(4) acquire
+  fence syncscope("wavefront") acquire ; named scope replaces numeric syncscope(4)
   ret void
 }
 
 define void @__llvm_fence_acq_wg() local_unnamed_addr #0 {
-  fence syncscope(3) acquire
+  fence syncscope("workgroup") acquire ; named scope replaces numeric syncscope(3)
   ret void
 }
 
 define void @__llvm_fence_acq_dev() local_unnamed_addr #0 {
-  fence syncscope(2) acquire
+  fence syncscope("agent") acquire ; named scope replaces numeric syncscope(2)
   ret void
 }
 
@@ -31,22 +27,22 @@ define void @__llvm_fence_acq_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_rel_wi() local_unnamed_addr #0 {
-  fence syncscope(5) release
+  fence syncscope("singlethread") release ; named scope replaces numeric syncscope(5)
   ret void
 }
 
 define void @__llvm_fence_rel_sg() local_unnamed_addr #0 {
-  fence syncscope(4) release
+  fence syncscope("wavefront") release ; named scope replaces numeric syncscope(4)
   ret void
 }
 
 define void @__llvm_fence_rel_wg() local_unnamed_addr #0 {
-  fence syncscope(3) release
+  fence syncscope("workgroup") release ; named scope replaces numeric syncscope(3)
   ret void
 }
 
 define void @__llvm_fence_rel_dev() local_unnamed_addr #0 {
-  fence syncscope(2) release
+  fence syncscope("agent") release ; named scope replaces numeric syncscope(2)
   ret void
 }
 
@@ -56,22 +52,22 @@ define void @__llvm_fence_rel_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_ar_wi() local_unnamed_addr #0 {
-  fence syncscope(5) acq_rel
+  fence syncscope("singlethread") acq_rel ; named scope replaces numeric syncscope(5)
   ret void
 }
 
 define void @__llvm_fence_ar_sg() local_unnamed_addr #0 {
-  fence syncscope(4) acq_rel
+  fence syncscope("wavefront") acq_rel ; named scope replaces numeric syncscope(4)
   ret void
 }
 
 define void @__llvm_fence_ar_wg() local_unnamed_addr #0 {
-  fence syncscope(3) acq_rel
+  fence syncscope("workgroup") acq_rel ; named scope replaces numeric syncscope(3)
   ret void
 }
 
 define void @__llvm_fence_ar_dev() local_unnamed_addr #0 {
-  fence syncscope(2) acq_rel
+  fence syncscope("agent") acq_rel ; named scope replaces numeric syncscope(2)
   ret void
 }
 
@@ -81,22 +77,22 @@ define void @__llvm_fence_ar_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_sc_wi() local_unnamed_addr #0 {
-  fence syncscope(5) seq_cst
+  fence syncscope("singlethread") seq_cst ; named scope replaces numeric syncscope(5)
   ret void
 }
 
 define void @__llvm_fence_sc_sg() local_unnamed_addr #0 {
-  fence syncscope(4) seq_cst
+  fence syncscope("wavefront") seq_cst ; named scope replaces numeric syncscope(4)
   ret void
 }
 
 define void @__llvm_fence_sc_wg() local_unnamed_addr #0 {
-  fence syncscope(3) seq_cst
+  fence syncscope("workgroup") seq_cst ; named scope replaces numeric syncscope(3)
   ret void
 }
 
 define void @__llvm_fence_sc_dev() local_unnamed_addr #0 {
-  fence syncscope(2) seq_cst
+  fence syncscope("agent") seq_cst ; named scope replaces numeric syncscope(2)
   ret void
 }
 
diff --git a/irif/src/reg.ll b/irif/src/reg.ll
index 2fa2ab65..43bf238c 100644
--- a/irif/src/reg.ll
+++ b/irif/src/reg.ll
@@ -12,23 +12,22 @@ declare i32 @llvm.read_register.i32(metadata) #0
 declare i64 @llvm.read_register.i64(metadata) #0
 
 define i64 @__llvm_amdgcn_read_exec() #1 {
-    %1 = call i64 @llvm.read_register.i64(metadata !0) #2
+    %1 = call i64 @llvm.read_register.i64(metadata !0) #0 ; !0 names the exec register; call site now shares group #0 with the declaration
     ret i64 %1
 }
 
 define i32 @__llvm_amdgcn_read_exec_lo() #1 {
-    %1 = call i32 @llvm.read_register.i32(metadata !1) #2
+    %1 = call i32 @llvm.read_register.i32(metadata !1) #0 ; !1 names the exec_lo register; call site now shares group #0 with the declaration
     ret i32 %1
 }
 
 define i32 @__llvm_amdgcn_read_exec_hi() #1 {
-    %1 = call i32 @llvm.read_register.i32(metadata !2) #2
+    %1 = call i32 @llvm.read_register.i32(metadata !2) #0 ; !2 is defined past this hunk, presumably exec_hi - TODO confirm
     ret i32 %1
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { alwaysinline nounwind }
-attributes #2 = { nounwind convergent }
+attributes #0 = { nounwind convergent } ; llvm.read_register declarations and their call sites
+attributes #1 = { alwaysinline nounwind convergent } ; the __llvm_amdgcn_read_exec* wrapper definitions
 
 !0 = !{!"exec"}
 !1 = !{!"exec_lo"}
diff --git a/irif/src/rounded.ll b/irif/src/rounded.ll
new file mode 100644
index 00000000..80b0082b
--- /dev/null
+++ b/irif/src/rounded.ll
@@ -0,0 +1,393 @@
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn--amdhsa" ; same triple as the other irif/src modules so linking and triple rewriting treat this file alike
+
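+;;;;; Add: constrained.fadd wrappers for f16/f32/f64; rte/rtn/rtp/rtz = round tonearest/downward/upward/towardzero ;;;;;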
+define half @__llvm_add_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_add_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_add_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Sub: constrained.fsub wrappers, one per rounding mode (rte/rtn/rtp/rtz) and type (f16/f32/f64) ;;;;;
+define half @__llvm_sub_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_sub_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_sub_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Mul: constrained.fmul wrappers, one per rounding mode (rte/rtn/rtp/rtz) and type (f16/f32/f64) ;;;;;
+define half @__llvm_mul_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_mul_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_mul_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Div: constrained.fdiv wrappers, one per rounding mode (rte/rtn/rtp/rtz) and type (f16/f32/f64) ;;;;;
+define half @__llvm_div_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_div_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_div_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Sqrt: constrained.sqrt wrappers, one per rounding mode (rte/rtn/rtp/rtz) and type (f16/f32/f64) ;;;;;
+define half @__llvm_sqrt_rte_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtn_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtp_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtz_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define float @__llvm_sqrt_rte_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtn_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtp_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtz_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define double @__llvm_sqrt_rte_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtn_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtp_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtz_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+;;;;; Fma: constrained.fma wrappers, one per rounding mode (rte/rtn/rtp/rtz) and type (f16/f32/f64) ;;;;;
+define half @__llvm_fma_rte_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtn_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtp_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtz_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define float @__llvm_fma_rte_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtn_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtp_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtz_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define double @__llvm_fma_rte_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtn_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtp_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtz_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) local_unnamed_addr #1
+
+declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) local_unnamed_addr #1
+
+declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) local_unnamed_addr #1
+
+attributes #0 = { alwaysinline nounwind readnone } ; applied to every __llvm_<op>_<mode>_<type> wrapper definition in this file
+attributes #1 = { nounwind readnone } ; constrained intrinsic declarations and call sites; NOTE(review): calls pass fpexcept.strict yet are readnone - confirm intended
+
diff --git a/ockl/CMakeLists.txt b/ockl/CMakeLists.txt
index cb9bb25a..7fa87ef4 100644
--- a/ockl/CMakeLists.txt
+++ b/ockl/CMakeLists.txt
@@ -15,4 +15,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc)
 
 opencl_bc_lib(ockl ${sources})
 
-install(FILES inc/ockl.h DESTINATION include COMPONENT OpenCL)
+install(FILES
+        inc/amd_hsa_common.h
+        inc/amd_hsa_elf.h
+        inc/amd_hsa_kernel_code.h
+        inc/amd_hsa_queue.h
+        inc/amd_hsa_signal.h
+        inc/device_amd_hsa.h
+        inc/hsa.h
+        inc/ockl_hsa.h
+        inc/ockl.h
+        DESTINATION include COMPONENT OpenCL)
diff --git a/ockl/inc/hsa.h b/ockl/inc/hsa.h
index 0252b009..85365882 100644
--- a/ockl/inc/hsa.h
+++ b/ockl/inc/hsa.h
@@ -1502,7 +1502,7 @@ typedef struct hsa_queue_s {
 
 #ifdef HSA_LARGE_MODEL
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
 #elif defined HSA_LITTLE_ENDIAN
@@ -1511,7 +1511,7 @@ typedef struct hsa_queue_s {
    * packets. Must be aligned to the size of an AQL packet.
    */
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
   /**
@@ -1521,7 +1521,7 @@ typedef struct hsa_queue_s {
 #else
   uint32_t reserved0;
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
 #endif
@@ -2129,7 +2129,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
 
 #ifdef HSA_LARGE_MODEL
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
 #elif defined HSA_LITTLE_ENDIAN
@@ -2141,7 +2141,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
    * completed execution.
    */
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
   /**
@@ -2151,7 +2151,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
 #else
   uint32_t reserved1;
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
 #endif
diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h
index 0cba8e0f..bceacdaa 100644
--- a/ockl/inc/ockl.h
+++ b/ockl/inc/ockl.h
@@ -102,9 +102,18 @@
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
+extern __attribute__((const)) uchar OCKL_MANGLE_T(clz,u8)(uchar);
+extern __attribute__((const)) ushort OCKL_MANGLE_T(clz,u16)(ushort);
 DECL_CONST_OCKL_UNARY_U32(clz)
+DECL_CONST_OCKL_UNARY_U64(clz)
+
+extern __attribute__((const)) uchar OCKL_MANGLE_T(ctz,u8)(uchar);
+extern __attribute__((const)) ushort OCKL_MANGLE_T(ctz,u16)(ushort);
 DECL_CONST_OCKL_UNARY_U32(ctz)
+DECL_CONST_OCKL_UNARY_U64(ctz)
+
 DECL_CONST_OCKL_UNARY_U32(popcount)
+DECL_CONST_OCKL_UNARY_U64(popcount)
 
 DECL_CONST_OCKL_BINARY_I32(add_sat)
 DECL_CONST_OCKL_BINARY_U32(add_sat)
diff --git a/ockl/inc/ockl_hsa.h b/ockl/inc/ockl_hsa.h
index 111116b7..1a53d9e1 100644
--- a/ockl/inc/ockl_hsa.h
+++ b/ockl/inc/ockl_hsa.h
@@ -12,11 +12,11 @@
 #include "device_amd_hsa.h"
 
 typedef enum __ockl_memory_order_e {
-  __ockl_memory_order_relaxed,
-  __ockl_memory_order_acquire,
-  __ockl_memory_order_release,
-  __ockl_memory_order_acq_rel,
-  __ockl_memory_order_seq_cst,
+  __ockl_memory_order_relaxed = __ATOMIC_RELAXED,
+  __ockl_memory_order_acquire = __ATOMIC_ACQUIRE,
+  __ockl_memory_order_release = __ATOMIC_RELEASE,
+  __ockl_memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  __ockl_memory_order_seq_cst = __ATOMIC_SEQ_CST,
 } __ockl_memory_order;
 
 extern ulong OCKL_MANGLE_T(hsa_queue,load_write_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order);
diff --git a/ockl/src/clz.cl b/ockl/src/clz.cl
index 593b4fcf..e93edbee 100644
--- a/ockl/src/clz.cl
+++ b/ockl/src/clz.cl
@@ -8,10 +8,32 @@
 #include "irif.h"
 #include "ockl.h"
 
+// Count-leading-zeros entry points for 8-, 16-, 32- and 64-bit unsigned
+// values; each forwards directly to the matching __llvm_ctlz_iN wrapper.
+// NOTE(review): the 32-bit variant no longer guards a zero input with
+// `i ? r : 32u`; confirm the wrappers are defined for 0 (TODO verify).
+
+__attribute__((always_inline, const)) uchar
+OCKL_MANGLE_T(clz,u8)(uchar i)
+{
+    return __llvm_ctlz_i8(i);
+}
+
+__attribute__((always_inline, const)) ushort
+OCKL_MANGLE_T(clz,u16)(ushort i)
+{
+    return __llvm_ctlz_i16(i);
+}
+
 __attribute__((always_inline, const)) uint
 OCKL_MANGLE_U32(clz)(uint i)
 {
-    uint r = (uint)__llvm_ctlz_i32((int)i);
-    return i ? r : 32u;
+    return __llvm_ctlz_i32(i);
+}
+
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(clz)(ulong i)
+{
+    return __llvm_ctlz_i64(i);
 }
 
diff --git a/ockl/src/ctz.cl b/ockl/src/ctz.cl
index 72de58cf..a7ad76e9 100644
--- a/ockl/src/ctz.cl
+++ b/ockl/src/ctz.cl
@@ -8,10 +8,27 @@
 #include "irif.h"
 #include "ockl.h"
 
+__attribute__((always_inline, const)) uchar
+OCKL_MANGLE_T(ctz,u8)(uchar i)
+{
+    return __llvm_cttz_i8(i);
+}
+
+__attribute__((always_inline, const)) ushort
+OCKL_MANGLE_T(ctz,u16)(ushort i)
+{
+    return __llvm_cttz_i16(i);
+}
+
 __attribute__((always_inline, const)) uint
 OCKL_MANGLE_U32(ctz)(uint i)
 {
-    uint r = (uint)__llvm_cttz_i32((int)i);
-    return i ? r : 32u;
+    return __llvm_cttz_i32(i);
+}
+
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(ctz)(ulong i)
+{
+    return __llvm_cttz_i64(i);
 }
 
diff --git a/ockl/src/hsaqs.cl b/ockl/src/hsaqs.cl
index 426d2c6b..a39fc495 100644
--- a/ockl/src/hsaqs.cl
+++ b/ockl/src/hsaqs.cl
@@ -15,103 +15,11 @@
 
 #define ATTR __attribute__((always_inline))
 
-// TODO Remove this workaround when the compiler is ready
-
-#define AL(T,P,O,S) ({ \
-    T __l; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __l = atomic_load_explicit(P, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __l = atomic_load_explicit(P, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __l = atomic_load_explicit(P, memory_order_relaxed, S); \
-        break; \
-    } \
-    __l; \
-})
-
-#define AS(P,V,O,S) ({ \
-    switch (O) { \
-    case __ockl_memory_order_release: \
-        atomic_store_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        atomic_store_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        atomic_store_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-})
-
-#define AF(T,K,P,V,O,S) ({ \
-    T __f; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_acq_rel, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-    __f; \
-})
-
-#define AX(T,P,V,O,S) ({ \
-    T __e; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __e = atomic_exchange_explicit(P, V, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __e = atomic_exchange_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __e = atomic_exchange_explicit(P, V, memory_order_acq_rel, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __e = atomic_exchange_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __e = atomic_exchange_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-    __e; \
-})
-
-#define AC(P,E,V,O,R,S) ({ \
-    bool __c; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acquire, R, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_release, R, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acq_rel, R, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_seq_cst, R, S); \
-        break; \
-    default: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_relaxed, R, S); \
-        break; \
-    } \
-    __c; \
-})
+#define AL(T,P,O,S) __opencl_atomic_load(P,O,S)
+#define AS(P,V,O,S) __opencl_atomic_store(P,V,O,S)
+#define AF(T,K,P,V,O,S) __opencl_atomic_fetch_##K(P,V,O,S)
+#define AX(T,P,V,O,S) __opencl_atomic_exchange(P,V,O,S)
+#define AC(P,E,V,O,R,S) __opencl_atomic_compare_exchange_strong(P,E,V,O,R,S)
 
 //
 // HSA queue ops
@@ -235,6 +143,9 @@ OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_orde
     if (s->kind == AMD_SIGNAL_KIND_USER) {
         AS((__global atomic_long *)&s->value, value, mem_order, memory_scope_all_svm_devices);
         update_mbox(s);
+    } else if (__oclc_ISA_version() >= 900) {
+        // Hardware doorbell supports AQL semantics: store the value to it, always with release ordering (mem_order is not consulted on this path).
+        AS((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, __ockl_memory_order_release, memory_scope_all_svm_devices);
     } else {
 
         {
diff --git a/ockl/src/image.cl b/ockl/src/image.cl
index b1752c4a..1d3ee450 100644
--- a/ockl/src/image.cl
+++ b/ockl/src/image.cl
@@ -497,7 +497,7 @@ RATTR float4
 OCKL_MANGLE_T(image_sample,1D)(TSHARP i, SSHARP s, float c)
 {
     ADJUST_X(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -505,14 +505,14 @@ OCKL_MANGLE_T(image_sample,1Da)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_X(c.x, i, s);
     c.y = __llvm_rint_f32(c.y);
-    return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -520,7 +520,7 @@ OCKL_MANGLE_T(image_sample,2Da)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR float
@@ -528,28 +528,28 @@ OCKL_MANGLE_T(image_sample,2Dad)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true);
 }
 
 RATTR float
 OCKL_MANGLE_T(image_sample,2Dd)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,3D)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XYZ(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,CM)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -557,7 +557,7 @@ OCKL_MANGLE_T(image_sample,CMa)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
     c.z = SAMPLE_ARRAY_FACE(c.w, c.z);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -685,7 +685,7 @@ RATTR half4
 OCKL_MANGLE_T(image_sampleh,1D)(TSHARP i, SSHARP s, float c)
 {
     ADJUST_X(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -693,14 +693,14 @@ OCKL_MANGLE_T(image_sampleh,1Da)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_X(c.x, i, s);
     c.y = __llvm_rint_f32(c.y);
-    return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -708,21 +708,21 @@ OCKL_MANGLE_T(image_sampleh,2Da)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,3D)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XYZ(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,CM)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -730,7 +730,7 @@ OCKL_MANGLE_T(image_sampleh,CMa)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
     c.z = SAMPLE_ARRAY_FACE(c.w, c.z);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -828,28 +828,28 @@ RATTR float4
 OCKL_MANGLE_T(image_gather4r,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4g,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4b,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4a,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false);
 }
 
 // We rely on the fact that the runtime allocates 12 words for the T# or V#
diff --git a/ockl/src/popcount.cl b/ockl/src/popcount.cl
index 77212f17..b6404022 100644
--- a/ockl/src/popcount.cl
+++ b/ockl/src/popcount.cl
@@ -14,3 +14,9 @@ OCKL_MANGLE_U32(popcount)(uint i)
     return (uint)__llvm_ctpop_i32((int)i);
 }
 
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(popcount)(ulong i)
+{
+    return (ulong)__llvm_ctpop_i64((long)i);
+}
+
diff --git a/ocml/src/acoshD.cl b/ocml/src/acoshD.cl
index 619f35a5..064897ff 100644
--- a/ocml/src/acoshD.cl
+++ b/ocml/src/acoshD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(acosh)(double x)
 {
     bool b = x >= 0x1.0p+512;
diff --git a/ocml/src/acoshF.cl b/ocml/src/acoshF.cl
index 179b413d..962e0e39 100644
--- a/ocml/src/acoshF.cl
+++ b/ocml/src/acoshF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(acosh)(float x)
 {
     bool b = x >= 0x1.0p+64f;
diff --git a/ocml/src/acoshH.cl b/ocml/src/acoshH.cl
index 074a7166..a8dc827e 100644
--- a/ocml/src/acoshH.cl
+++ b/ocml/src/acoshH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(acosh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(acosh)(half hx)
 {
     half ret;
diff --git a/ocml/src/addD.cl b/ocml/src/addD.cl
index 9e85367d..7a96f339 100644
--- a/ocml/src/addD.cl
+++ b/ocml/src/addD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fadd, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/addF.cl b/ocml/src/addF.cl
index 8e676725..95debe18 100644
--- a/ocml/src/addF.cl
+++ b/ocml/src/addF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(faddf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(faddf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/addH.cl b/ocml/src/addH.cl
index b540fdfe..e77e7a0a 100644
--- a/ocml/src/addH.cl
+++ b/ocml/src/addH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(faddh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/asinhD.cl b/ocml/src/asinhD.cl
index 75c3408a..09957fcc 100644
--- a/ocml/src/asinhD.cl
+++ b/ocml/src/asinhD.cl
@@ -13,7 +13,7 @@
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(asinh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/asinhF.cl b/ocml/src/asinhF.cl
index 407d9545..f5eeaf04 100644
--- a/ocml/src/asinhF.cl
+++ b/ocml/src/asinhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(asinh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/asinhH.cl b/ocml/src/asinhH.cl
index 027aed99..ae994c76 100644
--- a/ocml/src/asinhH.cl
+++ b/ocml/src/asinhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(asinh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(asinh)(half hx)
 {
     half ret;
diff --git a/ocml/src/atanF.cl b/ocml/src/atanF.cl
index ad3cdb03..08a7b1b1 100644
--- a/ocml/src/atanF.cl
+++ b/ocml/src/atanF.cl
@@ -9,7 +9,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(atanred)(float);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atan)(float x)
 {
     float v = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanH.cl b/ocml/src/atanH.cl
index 9fe95d8c..42ba6898 100644
--- a/ocml/src/atanH.cl
+++ b/ocml/src/atanH.cl
@@ -11,7 +11,7 @@ extern CONSTATTR half MATH_PRIVATE(atanred)(half);
 
 CONSTATTR UGEN(atan)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(atan)(half x)
 {
     half v = BUILTIN_ABS_F16(x);
diff --git a/ocml/src/atanhD.cl b/ocml/src/atanhD.cl
index c044f71a..62d9ddb1 100644
--- a/ocml/src/atanhD.cl
+++ b/ocml/src/atanhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(atanh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/atanhF.cl b/ocml/src/atanhF.cl
index 82a5c3ab..817ed41a 100644
--- a/ocml/src/atanhF.cl
+++ b/ocml/src/atanhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atanh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanhH.cl b/ocml/src/atanhH.cl
index 46c30ff8..c86722cd 100644
--- a/ocml/src/atanhH.cl
+++ b/ocml/src/atanhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(atanh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(atanh)(half hx)
 {
     half ret;
diff --git a/ocml/src/atanpiF.cl b/ocml/src/atanpiF.cl
index f9af9b72..1c46c155 100644
--- a/ocml/src/atanpiF.cl
+++ b/ocml/src/atanpiF.cl
@@ -9,7 +9,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(atanpired)(float);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atanpi)(float x)
 {
     float v = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanpiH.cl b/ocml/src/atanpiH.cl
index d85fe700..44cb201c 100644
--- a/ocml/src/atanpiH.cl
+++ b/ocml/src/atanpiH.cl
@@ -12,7 +12,7 @@ extern CONSTATTR half MATH_PRIVATE(atanpired)(half);
 
 CONSTATTR UGEN(atanpi)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(atanpi)(half x)
 {
     half v = BUILTIN_ABS_F16(x);
diff --git a/ocml/src/atanpiredF.cl b/ocml/src/atanpiredF.cl
index d982869a..63af0f76 100644
--- a/ocml/src/atanpiredF.cl
+++ b/ocml/src/atanpiredF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(atanpired)(float v)
 {
     float t = v * v;
diff --git a/ocml/src/atanpiredH.cl b/ocml/src/atanpiredH.cl
index 121d304f..3eabd599 100644
--- a/ocml/src/atanpiredH.cl
+++ b/ocml/src/atanpiredH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(atanpired)(half v)
 {
     half t = v * v;
diff --git a/ocml/src/atanredF.cl b/ocml/src/atanredF.cl
index 10b5c5c1..a0895928 100644
--- a/ocml/src/atanredF.cl
+++ b/ocml/src/atanredF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(atanred)(float v)
 {
     float t = v * v;
diff --git a/ocml/src/atanredH.cl b/ocml/src/atanredH.cl
index dd2d1ba9..d721edb3 100644
--- a/ocml/src/atanredH.cl
+++ b/ocml/src/atanredH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(atanred)(half v)
 {
     half t = v * v;
diff --git a/ocml/src/ba0D.cl b/ocml/src/ba0D.cl
index c21d308b..e87226bc 100644
--- a/ocml/src/ba0D.cl
+++ b/ocml/src/ba0D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(ba0)(double t)
 {
     return
diff --git a/ocml/src/ba0F.cl b/ocml/src/ba0F.cl
index fc18577f..309ad267 100644
--- a/ocml/src/ba0F.cl
+++ b/ocml/src/ba0F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(ba0)(float t)
 {
     return
diff --git a/ocml/src/ba1D.cl b/ocml/src/ba1D.cl
index c735f595..d4453e00 100644
--- a/ocml/src/ba1D.cl
+++ b/ocml/src/ba1D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(ba1)(double t)
 {
     return
diff --git a/ocml/src/ba1F.cl b/ocml/src/ba1F.cl
index 2b974a39..5dd1ea96 100644
--- a/ocml/src/ba1F.cl
+++ b/ocml/src/ba1F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(ba1)(float t)
 {
     return
diff --git a/ocml/src/bp0D.cl b/ocml/src/bp0D.cl
index 0e08cb4b..9014ae9e 100644
--- a/ocml/src/bp0D.cl
+++ b/ocml/src/bp0D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(bp0)(double t)
 {
     return
diff --git a/ocml/src/bp0F.cl b/ocml/src/bp0F.cl
index da6b9b4f..c0c27a1f 100644
--- a/ocml/src/bp0F.cl
+++ b/ocml/src/bp0F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(bp0)(float t)
 {
     return
diff --git a/ocml/src/bp1D.cl b/ocml/src/bp1D.cl
index 55ace155..c9239c95 100644
--- a/ocml/src/bp1D.cl
+++ b/ocml/src/bp1D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(bp1)(double t)
 {
     return
diff --git a/ocml/src/bp1F.cl b/ocml/src/bp1F.cl
index 3267c7f4..18569cb6 100644
--- a/ocml/src/bp1F.cl
+++ b/ocml/src/bp1F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(bp1)(float t)
 {
     return
diff --git a/ocml/src/builtins.h b/ocml/src/builtins.h
index 337c1d0a..2d5f6565 100644
--- a/ocml/src/builtins.h
+++ b/ocml/src/builtins.h
@@ -222,3 +222,81 @@
 #define BUILTIN_CLAMP_F32(X,L,H) __llvm_amdgcn_fmed3_f32(X,L,H)
 #define BUILTIN_CLAMP_F16(X,L,H) __llvm_amdgcn_fmed3_f16(X,L,H)
 
+#define BUILTIN_ADD_RTE_F32 __llvm_add_rte_f32
+#define BUILTIN_ADD_RTE_F64 __llvm_add_rte_f64
+#define BUILTIN_ADD_RTE_F16 __llvm_add_rte_f16
+#define BUILTIN_ADD_RTN_F32 __llvm_add_rtn_f32
+#define BUILTIN_ADD_RTN_F64 __llvm_add_rtn_f64
+#define BUILTIN_ADD_RTN_F16 __llvm_add_rtn_f16
+#define BUILTIN_ADD_RTP_F32 __llvm_add_rtp_f32
+#define BUILTIN_ADD_RTP_F64 __llvm_add_rtp_f64
+#define BUILTIN_ADD_RTP_F16 __llvm_add_rtp_f16
+#define BUILTIN_ADD_RTZ_F32 __llvm_add_rtz_f32
+#define BUILTIN_ADD_RTZ_F64 __llvm_add_rtz_f64
+#define BUILTIN_ADD_RTZ_F16 __llvm_add_rtz_f16
+
+#define BUILTIN_SUB_RTE_F32 __llvm_sub_rte_f32
+#define BUILTIN_SUB_RTE_F64 __llvm_sub_rte_f64
+#define BUILTIN_SUB_RTE_F16 __llvm_sub_rte_f16
+#define BUILTIN_SUB_RTN_F32 __llvm_sub_rtn_f32
+#define BUILTIN_SUB_RTN_F64 __llvm_sub_rtn_f64
+#define BUILTIN_SUB_RTN_F16 __llvm_sub_rtn_f16
+#define BUILTIN_SUB_RTP_F32 __llvm_sub_rtp_f32
+#define BUILTIN_SUB_RTP_F64 __llvm_sub_rtp_f64
+#define BUILTIN_SUB_RTP_F16 __llvm_sub_rtp_f16
+#define BUILTIN_SUB_RTZ_F32 __llvm_sub_rtz_f32
+#define BUILTIN_SUB_RTZ_F64 __llvm_sub_rtz_f64
+#define BUILTIN_SUB_RTZ_F16 __llvm_sub_rtz_f16
+
+#define BUILTIN_MUL_RTE_F32 __llvm_mul_rte_f32
+#define BUILTIN_MUL_RTE_F64 __llvm_mul_rte_f64
+#define BUILTIN_MUL_RTE_F16 __llvm_mul_rte_f16
+#define BUILTIN_MUL_RTN_F32 __llvm_mul_rtn_f32
+#define BUILTIN_MUL_RTN_F64 __llvm_mul_rtn_f64
+#define BUILTIN_MUL_RTN_F16 __llvm_mul_rtn_f16
+#define BUILTIN_MUL_RTP_F32 __llvm_mul_rtp_f32
+#define BUILTIN_MUL_RTP_F64 __llvm_mul_rtp_f64
+#define BUILTIN_MUL_RTP_F16 __llvm_mul_rtp_f16
+#define BUILTIN_MUL_RTZ_F32 __llvm_mul_rtz_f32
+#define BUILTIN_MUL_RTZ_F64 __llvm_mul_rtz_f64
+#define BUILTIN_MUL_RTZ_F16 __llvm_mul_rtz_f16
+
+#define BUILTIN_DIV_RTE_F32 __llvm_div_rte_f32
+#define BUILTIN_DIV_RTE_F64 __llvm_div_rte_f64
+#define BUILTIN_DIV_RTE_F16 __llvm_div_rte_f16
+#define BUILTIN_DIV_RTN_F32 __llvm_div_rtn_f32
+#define BUILTIN_DIV_RTN_F64 __llvm_div_rtn_f64
+#define BUILTIN_DIV_RTN_F16 __llvm_div_rtn_f16
+#define BUILTIN_DIV_RTP_F32 __llvm_div_rtp_f32
+#define BUILTIN_DIV_RTP_F64 __llvm_div_rtp_f64
+#define BUILTIN_DIV_RTP_F16 __llvm_div_rtp_f16
+#define BUILTIN_DIV_RTZ_F32 __llvm_div_rtz_f32
+#define BUILTIN_DIV_RTZ_F64 __llvm_div_rtz_f64
+#define BUILTIN_DIV_RTZ_F16 __llvm_div_rtz_f16
+
+#define BUILTIN_SQRT_RTE_F32 __llvm_sqrt_rte_f32
+#define BUILTIN_SQRT_RTE_F64 __llvm_sqrt_rte_f64
+#define BUILTIN_SQRT_RTE_F16 __llvm_sqrt_rte_f16
+#define BUILTIN_SQRT_RTN_F32 __llvm_sqrt_rtn_f32
+#define BUILTIN_SQRT_RTN_F64 __llvm_sqrt_rtn_f64
+#define BUILTIN_SQRT_RTN_F16 __llvm_sqrt_rtn_f16
+#define BUILTIN_SQRT_RTP_F32 __llvm_sqrt_rtp_f32
+#define BUILTIN_SQRT_RTP_F64 __llvm_sqrt_rtp_f64
+#define BUILTIN_SQRT_RTP_F16 __llvm_sqrt_rtp_f16
+#define BUILTIN_SQRT_RTZ_F32 __llvm_sqrt_rtz_f32
+#define BUILTIN_SQRT_RTZ_F64 __llvm_sqrt_rtz_f64
+#define BUILTIN_SQRT_RTZ_F16 __llvm_sqrt_rtz_f16
+
+#define BUILTIN_FMA_RTE_F32 __llvm_fma_rte_f32
+#define BUILTIN_FMA_RTE_F64 __llvm_fma_rte_f64
+#define BUILTIN_FMA_RTE_F16 __llvm_fma_rte_f16
+#define BUILTIN_FMA_RTN_F32 __llvm_fma_rtn_f32
+#define BUILTIN_FMA_RTN_F64 __llvm_fma_rtn_f64
+#define BUILTIN_FMA_RTN_F16 __llvm_fma_rtn_f16
+#define BUILTIN_FMA_RTP_F32 __llvm_fma_rtp_f32
+#define BUILTIN_FMA_RTP_F64 __llvm_fma_rtp_f64
+#define BUILTIN_FMA_RTP_F16 __llvm_fma_rtp_f16
+#define BUILTIN_FMA_RTZ_F32 __llvm_fma_rtz_f32
+#define BUILTIN_FMA_RTZ_F64 __llvm_fma_rtz_f64
+#define BUILTIN_FMA_RTZ_F16 __llvm_fma_rtz_f16
+
diff --git a/ocml/src/cbrtD.cl b/ocml/src/cbrtD.cl
index 67cd2628..fd83a2fb 100644
--- a/ocml/src/cbrtD.cl
+++ b/ocml/src/cbrtD.cl
@@ -1,7 +1,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(cbrt)(double x)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/cbrtF.cl b/ocml/src/cbrtF.cl
index cab2df26..5e436900 100644
--- a/ocml/src/cbrtF.cl
+++ b/ocml/src/cbrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cbrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/ceilD.cl b/ocml/src/ceilD.cl
index dc2eb8dc..654226cc 100644
--- a/ocml/src/ceilD.cl
+++ b/ocml/src/ceilD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(ceil)(double x)
 {
     return BUILTIN_CEIL_F64(x);
diff --git a/ocml/src/ceilF.cl b/ocml/src/ceilF.cl
index 2a563cdf..8b1600c8 100644
--- a/ocml/src/ceilF.cl
+++ b/ocml/src/ceilF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(ceil)(float x)
 {
     return BUILTIN_CEIL_F32(x);
diff --git a/ocml/src/ceilH.cl b/ocml/src/ceilH.cl
index 2db7385c..5b9804cb 100644
--- a/ocml/src/ceilH.cl
+++ b/ocml/src/ceilH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(ceil)(half2 x)
 {
     return BUILTIN_CEIL_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(ceil)(half x)
 {
     return BUILTIN_CEIL_F16(x);
diff --git a/ocml/src/copysignD.cl b/ocml/src/copysignD.cl
index 5c2eb066..b239b793 100644
--- a/ocml/src/copysignD.cl
+++ b/ocml/src/copysignD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(copysign)(double x, double y)
 {
     return BUILTIN_COPYSIGN_F64(x, y);
diff --git a/ocml/src/copysignF.cl b/ocml/src/copysignF.cl
index 87bc68d7..f2fac4ab 100644
--- a/ocml/src/copysignF.cl
+++ b/ocml/src/copysignF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(copysign)(float x, float y)
 {
     return BUILTIN_COPYSIGN_F32(x, y);
diff --git a/ocml/src/copysignH.cl b/ocml/src/copysignH.cl
index f89c061c..7897b1e3 100644
--- a/ocml/src/copysignH.cl
+++ b/ocml/src/copysignH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(copysign)(half2 x, half2 y)
 {
     return BUILTIN_COPYSIGN_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(copysign)(half x, half y)
 {
     return BUILTIN_COPYSIGN_F16(x, y);
diff --git a/ocml/src/cosD.cl b/ocml/src/cosD.cl
index fcb55925..b76adff9 100644
--- a/ocml/src/cosD.cl
+++ b/ocml/src/cosD.cl
@@ -8,17 +8,15 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(cos)(double x)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
+    sc.s = -sc.s;
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincosred2)(r, rr, &cc);
-
-    int2 c = AS_INT2((regn & 1) != 0 ? ss : cc);
-    c.hi ^= regn > 1 ? (int)0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
+    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : c;
diff --git a/ocml/src/cosF.cl b/ocml/src/cosF.cl
index 63da099e..60c57195 100644
--- a/ocml/src/cosF.cl
+++ b/ocml/src/cosF.cl
@@ -8,28 +8,23 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(cos)(float x)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
+    sc.s = -sc.s;
 
-    float c =  (regn & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (regn > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
 
     if (!FINITE_ONLY_OPT()) {
         c = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : c;
diff --git a/ocml/src/cosH.cl b/ocml/src/cosH.cl
index 00df3a62..e4edc273 100644
--- a/ocml/src/cosH.cl
+++ b/ocml/src/cosH.cl
@@ -10,17 +10,15 @@
 
 UGEN(cos)
 
-INLINEATTR half
+half
 MATH_MANGLE(cos)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
+    sc.s = -sc.s;
 
-    half cc;
-    half ss = -MATH_PRIVATE(sincosred)(r, &cc);
-
-    short c =  AS_SHORT((i & 1) == 0 ? cc : ss);
-    c ^= i > 1 ? (short)0x8000 : (short)0;
+    short c =  AS_SHORT((r.i & 1) == (short)0 ? sc.c : sc.s);
+    c ^= r.i > 1 ? (short)0x8000 : (short)0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : c;
diff --git a/ocml/src/cosbD.cl b/ocml/src/cosbD.cl
index 36b4f178..0838b618 100644
--- a/ocml/src/cosbD.cl
+++ b/ocml/src/cosbD.cl
@@ -24,31 +24,31 @@
         L = __e; \
     } while (0)
 
-INLINEATTR double
+double
 MATH_PRIVATE(cosb)(double x, int n, double p)
 {
-    double ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
     // This is a properly signed extra precise pi/4
-    ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
-    pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+    double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
+    double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
 
+    double sh, sl;
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
+    sc.s = -sc.s;
 
-    int2 c = AS_INT2((i & 1) != 0 ? ss : cc);
-    c.hi ^= i > 1 ? 0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
+    c.hi ^= r.i > 1 ? 0x80000000 : 0;
 
     return AS_DOUBLE(c);
 }
diff --git a/ocml/src/cosbF.cl b/ocml/src/cosbF.cl
index 10aab950..60e1f415 100644
--- a/ocml/src/cosbF.cl
+++ b/ocml/src/cosbF.cl
@@ -24,42 +24,37 @@
         L = __e; \
     } while (0)
 
-INLINEATTR float
+float
 MATH_PRIVATE(cosb)(float x, int n, float p)
 {
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
 #if defined EXTRA_PRECISION
-    float ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
 
-    ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
-    pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
+    float sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
 #else
-    float r;
-    int i = MATH_PRIVATE(trigred)(&r, x);
-    bool b = r < p;
-    i = (i - b - n) & 3;
-    r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
+    sc.s = -sc.s;
 
-    float c =  (i & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
     return c;
 }
 
diff --git a/ocml/src/coshD.cl b/ocml/src/coshD.cl
index fe1a676d..da1c54a4 100644
--- a/ocml/src/coshD.cl
+++ b/ocml/src/coshD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(cosh)(double x)
 {
     x = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/coshF.cl b/ocml/src/coshF.cl
index 425bea9d..ef4c46da 100644
--- a/ocml/src/coshF.cl
+++ b/ocml/src/coshF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cosh)(float x)
 {
     x = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/coshH.cl b/ocml/src/coshH.cl
index 232b8f67..3ddea219 100644
--- a/ocml/src/coshH.cl
+++ b/ocml/src/coshH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(cosh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(cosh)(half hx)
 {
     float x = (float)hx * 0x1.715476p+0f;
diff --git a/ocml/src/cospiD.cl b/ocml/src/cospiD.cl
index 57686b03..fab3bc4d 100644
--- a/ocml/src/cospiD.cl
+++ b/ocml/src/cospiD.cl
@@ -8,17 +8,15 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(cospi)(double x)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincospired)(t, &cc);
-
-    int2 c = AS_INT2((i & 1) == 0 ? cc : ss);
-    c.hi ^= i > 1 ? (int)0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) == 0 ? sc.c : sc.s);
+    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : c;
diff --git a/ocml/src/cospiF.cl b/ocml/src/cospiF.cl
index 1d9ed3ee..90d360d0 100644
--- a/ocml/src/cospiF.cl
+++ b/ocml/src/cospiF.cl
@@ -8,19 +8,16 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cospi)(float x)
 {
     int ax = AS_INT(x) & 0x7fffffff;
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    float r;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r);
-
-    float cc;
-    float ss = -MATH_PRIVATE(sincospired)(r, &cc);
-
-    float c =  (i & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
 
     if (!FINITE_ONLY_OPT()) {
         c = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : c;
diff --git a/ocml/src/cospiH.cl b/ocml/src/cospiH.cl
index 830bc239..3f55c79e 100644
--- a/ocml/src/cospiH.cl
+++ b/ocml/src/cospiH.cl
@@ -10,17 +10,15 @@
 
 UGEN(cospi)
 
-INLINEATTR half
+half
 MATH_MANGLE(cospi)(half x)
 {
-    half t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    half cc;
-    half ss = -MATH_PRIVATE(sincospired)(t, &cc);
-
-    short c =  AS_SHORT((i & (short)1) == (short)0 ? cc : ss);
-    c ^= i > (short)1 ? (short)0x8000 : (short)0;
+    short c =  AS_SHORT((r.i & (short)1) == (short)0 ? sc.c : sc.s);
+    c ^= r.i > (short)1 ? (short)0x8000 : (short)0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : c;
diff --git a/ocml/src/divD.cl b/ocml/src/divD.cl
index 27ae4318..ad7af822 100644
--- a/ocml/src/divD.cl
+++ b/ocml/src/divD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fdiv, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/divF.cl b/ocml/src/divF.cl
index 9dcfb511..ce9519ab 100644
--- a/ocml/src/divF.cl
+++ b/ocml/src/divF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fdivf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fdivf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/divH.cl b/ocml/src/divH.cl
index 7ac66449..3a7d17d3 100644
--- a/ocml/src/divH.cl
+++ b/ocml/src/divH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fdivh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/epexpepD.cl b/ocml/src/epexpepD.cl
index 292a61b9..f6340e15 100644
--- a/ocml/src/epexpepD.cl
+++ b/ocml/src/epexpepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double2
+CONSTATTR double2
 MATH_PRIVATE(epexpep)(double2 x)
 {
     double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
diff --git a/ocml/src/epexpepF.cl b/ocml/src/epexpepF.cl
index abeecc70..1ba48e10 100644
--- a/ocml/src/epexpepF.cl
+++ b/ocml/src/epexpepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float2
+CONSTATTR float2
 MATH_PRIVATE(epexpep)(float2 x)
 {
     float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
diff --git a/ocml/src/eplnD.cl b/ocml/src/eplnD.cl
index f16b4071..7540e5b9 100644
--- a/ocml/src/eplnD.cl
+++ b/ocml/src/eplnD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double2
+CONSTATTR double2
 MATH_PRIVATE(epln)(double a)
 {
     double m = BUILTIN_FREXP_MANT_F64(a);
diff --git a/ocml/src/eplnF.cl b/ocml/src/eplnF.cl
index 9063d677..b7fef2be 100644
--- a/ocml/src/eplnF.cl
+++ b/ocml/src/eplnF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float2
+CONSTATTR float2
 MATH_PRIVATE(epln)(float a)
 {
     float m = BUILTIN_FREXP_MANT_F32(a);
diff --git a/ocml/src/erfH.cl b/ocml/src/erfH.cl
index 883509d8..47c3c353 100644
--- a/ocml/src/erfH.cl
+++ b/ocml/src/erfH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erf)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erf)(half x)
 {
     return (half)MATH_UPMANGLE(erf)((float)x);
diff --git a/ocml/src/erfcH.cl b/ocml/src/erfcH.cl
index 2adc0236..ec7c7b04 100644
--- a/ocml/src/erfcH.cl
+++ b/ocml/src/erfcH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfc)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfc)(half x)
 {
     return (half)MATH_UPMANGLE(erfc)((float)x);
diff --git a/ocml/src/erfcinvH.cl b/ocml/src/erfcinvH.cl
index 8050709b..6258a9b9 100644
--- a/ocml/src/erfcinvH.cl
+++ b/ocml/src/erfcinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfcinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfcinv)(half x)
 {
     return (half)MATH_UPMANGLE(erfcinv)((float)x);
diff --git a/ocml/src/erfcxH.cl b/ocml/src/erfcxH.cl
index eb064e47..9fa79b5f 100644
--- a/ocml/src/erfcxH.cl
+++ b/ocml/src/erfcxH.cl
@@ -3,7 +3,7 @@
 
 PUREATTR UGEN(erfcx)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfcx)(half x)
 {
     return (half)MATH_UPMANGLE(erfcx)((float)x);
diff --git a/ocml/src/erfinvH.cl b/ocml/src/erfinvH.cl
index 60238709..18317b51 100644
--- a/ocml/src/erfinvH.cl
+++ b/ocml/src/erfinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfinv)(half x)
 {
     return (half)MATH_UPMANGLE(erfinv)((float)x);
diff --git a/ocml/src/exp10H.cl b/ocml/src/exp10H.cl
index d376414e..94a50ce2 100644
--- a/ocml/src/exp10H.cl
+++ b/ocml/src/exp10H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp10)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp10)(half x)
 {
     return (half)BUILTIN_EXP2_F32((float)x * 0x1.a934f0p+1f);
diff --git a/ocml/src/exp2H.cl b/ocml/src/exp2H.cl
index a8b72ff3..b6afa724 100644
--- a/ocml/src/exp2H.cl
+++ b/ocml/src/exp2H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp2)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp2)(half x)
 {
     return BUILTIN_EXP2_F16(x);
diff --git a/ocml/src/expF_base.h b/ocml/src/expF_base.h
index 9c42d5fe..08bde388 100644
--- a/ocml/src/expF_base.h
+++ b/ocml/src/expF_base.h
@@ -32,7 +32,7 @@
 // 
 //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 
 
-PUREATTR INLINEATTR float
+PUREATTR float
 #if defined COMPILING_EXP2
 MATH_MANGLE(exp2)(float x)
 #elif defined COMPILING_EXP10
diff --git a/ocml/src/expH.cl b/ocml/src/expH.cl
index 1ff4a024..caa3a4ac 100644
--- a/ocml/src/expH.cl
+++ b/ocml/src/expH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp)(half x)
 {
     return (half)BUILTIN_EXP2_F32((float)x * 0x1.715476p+0f);
diff --git a/ocml/src/expepD.cl b/ocml/src/expepD.cl
index f8d4fd95..859a023d 100644
--- a/ocml/src/expepD.cl
+++ b/ocml/src/expepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(expep)(double2 x)
 {
     double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
diff --git a/ocml/src/expepF.cl b/ocml/src/expepF.cl
index bf585b47..657267a2 100644
--- a/ocml/src/expepF.cl
+++ b/ocml/src/expepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 MATH_PRIVATE(expep)(float2 x)
 {
     float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
diff --git a/ocml/src/expm1F.cl b/ocml/src/expm1F.cl
index 583a7a11..31ac3b89 100644
--- a/ocml/src/expm1F.cl
+++ b/ocml/src/expm1F.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(expm1)(float x)
 {
     float2 e = sub(MATH_PRIVATE(epexpep)(con(x, 0.0f)), 1.0f);
diff --git a/ocml/src/expm1H.cl b/ocml/src/expm1H.cl
index c04c6c84..79498be4 100644
--- a/ocml/src/expm1H.cl
+++ b/ocml/src/expm1H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(expm1)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(expm1)(half x)
 {
     half ret;
diff --git a/ocml/src/fabsD.cl b/ocml/src/fabsD.cl
index 2c5332c6..9052cd01 100644
--- a/ocml/src/fabsD.cl
+++ b/ocml/src/fabsD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fabs)(double x)
 {
     return BUILTIN_ABS_F64(x);
diff --git a/ocml/src/fabsF.cl b/ocml/src/fabsF.cl
index 444e9075..957cb79f 100644
--- a/ocml/src/fabsF.cl
+++ b/ocml/src/fabsF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fabs)(float x)
 {
     return BUILTIN_ABS_F32(x);
diff --git a/ocml/src/fabsH.cl b/ocml/src/fabsH.cl
index 9cd7dbbd..1504bb6a 100644
--- a/ocml/src/fabsH.cl
+++ b/ocml/src/fabsH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fabs)(half2 x)
 {
     return BUILTIN_ABS_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fabs)(half x)
 {
     return BUILTIN_ABS_F16(x);
diff --git a/ocml/src/fdimD.cl b/ocml/src/fdimD.cl
index 8214203e..cc7255c3 100644
--- a/ocml/src/fdimD.cl
+++ b/ocml/src/fdimD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fdim)(double x, double y)
 {
     long d = AS_LONG(x - y);
diff --git a/ocml/src/fdimF.cl b/ocml/src/fdimF.cl
index 9e418b24..968eb908 100644
--- a/ocml/src/fdimF.cl
+++ b/ocml/src/fdimF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fdim)(float x, float y)
 {
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/fdimH.cl b/ocml/src/fdimH.cl
index 304c96ab..989f8213 100644
--- a/ocml/src/fdimH.cl
+++ b/ocml/src/fdimH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(fdim)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fdim)(half x, half y)
 {
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/floorD.cl b/ocml/src/floorD.cl
index 8fd637da..2fc2375d 100644
--- a/ocml/src/floorD.cl
+++ b/ocml/src/floorD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(floor)(double x)
 {
     return BUILTIN_FLOOR_F64(x);
diff --git a/ocml/src/floorF.cl b/ocml/src/floorF.cl
index 3364960a..e8b6d3ef 100644
--- a/ocml/src/floorF.cl
+++ b/ocml/src/floorF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(floor)(float x)
 {
     return BUILTIN_FLOOR_F32(x);
diff --git a/ocml/src/floorH.cl b/ocml/src/floorH.cl
index 16c84eee..f563e648 100644
--- a/ocml/src/floorH.cl
+++ b/ocml/src/floorH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(floor)(half2 x)
 {
     return BUILTIN_FLOOR_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(floor)(half x)
 {
     return BUILTIN_FLOOR_F16(x);
diff --git a/ocml/src/fmaD.cl b/ocml/src/fmaD.cl
index 15d596dc..0a526fe8 100644
--- a/ocml/src/fmaD.cl
+++ b/ocml/src/fmaD.cl
@@ -13,21 +13,15 @@ MATH_MANGLE(fma)(double a, double b, double c)
     return BUILTIN_FMA_F64(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double a, double b, double c) \
+MATH_MANGLE(LN)(double a, double b, double c) \
 { \
-    return BUILTIN_FULL_TERNARY(ffma, false, ROUND, a, b, c); \
+    return BUILTIN_##UN##_F64(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/fmaF.cl b/ocml/src/fmaF.cl
index 3974f317..052acae0 100644
--- a/ocml/src/fmaF.cl
+++ b/ocml/src/fmaF.cl
@@ -5,6 +5,7 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
+#include "irif.h"
 #include "mathF.h"
 
 CONSTATTR float
@@ -13,27 +14,15 @@ MATH_MANGLE(fma)(float a, float b, float c)
     return BUILTIN_FMA_F32(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float a, float b, float c) \
+MATH_MANGLE(LN)(float a, float b, float c) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_TERNARY(ffmaf, true, ROUND, a, b, c); \
-    } else { \
-        ret = BUILTIN_FULL_TERNARY(ffmaf, false, ROUND, a, b, c); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/fmaH.cl b/ocml/src/fmaH.cl
index c34f1781..03bacf72 100644
--- a/ocml/src/fmaH.cl
+++ b/ocml/src/fmaH.cl
@@ -19,21 +19,15 @@ MATH_MANGLE(fma)(half a, half b, half c)
     return BUILTIN_FMA_F16(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half a, half b, half c) \
+MATH_MANGLE(LN)(half a, half b, half c) \
 { \
-    return BUILTIN_FULL_TERNARY(ffmah, false, ROUND, a, b, c); \
+    return BUILTIN_##UN##_F16(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/fmaxD.cl b/ocml/src/fmaxD.cl
index 97a80466..fa8fc448 100644
--- a/ocml/src/fmaxD.cl
+++ b/ocml/src/fmaxD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fmax)(double x, double y)
 {
     return BUILTIN_MAX_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y));
diff --git a/ocml/src/fmaxF.cl b/ocml/src/fmaxF.cl
index d96a4c34..7fa39a8a 100644
--- a/ocml/src/fmaxF.cl
+++ b/ocml/src/fmaxF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fmax)(float x, float y)
 {
     float ret;
diff --git a/ocml/src/fmaxH.cl b/ocml/src/fmaxH.cl
index f6817006..552be89f 100644
--- a/ocml/src/fmaxH.cl
+++ b/ocml/src/fmaxH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fmax)(half2 x, half2 y)
 {
     return BUILTIN_MAX_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fmax)(half x, half y)
 {
     return BUILTIN_MAX_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y));
diff --git a/ocml/src/fminD.cl b/ocml/src/fminD.cl
index 0ff01127..04fba1fb 100644
--- a/ocml/src/fminD.cl
+++ b/ocml/src/fminD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fmin)(double x, double y)
 {
     return BUILTIN_MIN_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y));
diff --git a/ocml/src/fminF.cl b/ocml/src/fminF.cl
index ffd6f40a..e979e18e 100644
--- a/ocml/src/fminF.cl
+++ b/ocml/src/fminF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fmin)(float x, float y)
 {
     float ret;
diff --git a/ocml/src/fminH.cl b/ocml/src/fminH.cl
index 6da1fb55..76398429 100644
--- a/ocml/src/fminH.cl
+++ b/ocml/src/fminH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fmin)(half2 x, half2 y)
 {
     return BUILTIN_MIN_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fmin)(half x, half y)
 {
     return BUILTIN_MIN_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y));
diff --git a/ocml/src/fpclassifyD.cl b/ocml/src/fpclassifyD.cl
index cfefa9d5..8db6b992 100644
--- a/ocml/src/fpclassifyD.cl
+++ b/ocml/src/fpclassifyD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(double x)
 {
     int ret = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fpclassifyF.cl b/ocml/src/fpclassifyF.cl
index 824c140e..50a84783 100644
--- a/ocml/src/fpclassifyF.cl
+++ b/ocml/src/fpclassifyF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(float x)
 {
     int ret = BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fpclassifyH.cl b/ocml/src/fpclassifyH.cl
index 20d34897..a9c2d928 100644
--- a/ocml/src/fpclassifyH.cl
+++ b/ocml/src/fpclassifyH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(half x)
 {
     int ret = BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fractD.cl b/ocml/src/fractD.cl
index 720e3e23..e4b75aec 100644
--- a/ocml/src/fractD.cl
+++ b/ocml/src/fractD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(fract)(double x, __private double *ip)
 {
     double i = BUILTIN_FLOOR_F64(x);
diff --git a/ocml/src/fractF.cl b/ocml/src/fractF.cl
index 9b03b797..b65b517c 100644
--- a/ocml/src/fractF.cl
+++ b/ocml/src/fractF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(fract)(float x, __private float *ip)
 {
     float i = BUILTIN_FLOOR_F32(x);
diff --git a/ocml/src/fractH.cl b/ocml/src/fractH.cl
index ba127717..2cda3a5c 100644
--- a/ocml/src/fractH.cl
+++ b/ocml/src/fractH.cl
@@ -7,14 +7,14 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(fract)(half2 x, __private half2 *ip)
 {
     *ip = BUILTIN_FLOOR_2F16(x);
     return (half2)(BUILTIN_FRACTION_F16(x.lo), BUILTIN_FRACTION_F16(x.hi));
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(fract)(half x, __private half *ip)
 {
     *ip = BUILTIN_FLOOR_F16(x);
diff --git a/ocml/src/frexpD.cl b/ocml/src/frexpD.cl
index b3deeb64..4f9d252d 100644
--- a/ocml/src/frexpD.cl
+++ b/ocml/src/frexpD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(frexp)(double x, __private int *ep)
 {
     int e = BUILTIN_FREXP_EXP_F64(x);
diff --git a/ocml/src/frexpF.cl b/ocml/src/frexpF.cl
index e29554ba..c5b0b84b 100644
--- a/ocml/src/frexpF.cl
+++ b/ocml/src/frexpF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(frexp)(float x, __private int *ep)
 {
     int e = BUILTIN_FREXP_EXP_F32(x);
diff --git a/ocml/src/frexpH.cl b/ocml/src/frexpH.cl
index a5e43691..a4bc6e3c 100644
--- a/ocml/src/frexpH.cl
+++ b/ocml/src/frexpH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(frexp)(half2 x, __private int2 *ep)
 {
     int elo, ehi;
@@ -18,7 +18,7 @@ MATH_MANGLE2(frexp)(half2 x, __private int2 *ep)
     return r;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(frexp)(half x, __private int *ep)
 {
     int e = (int)BUILTIN_FREXP_EXP_F16(x);
diff --git a/ocml/src/hypotD.cl b/ocml/src/hypotD.cl
index 405720f4..fd99614a 100644
--- a/ocml/src/hypotD.cl
+++ b/ocml/src/hypotD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(hypot)(double x, double y)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/hypotF.cl b/ocml/src/hypotF.cl
index adca99ea..2b697a32 100644
--- a/ocml/src/hypotF.cl
+++ b/ocml/src/hypotF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(hypot)(float x, float y)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/hypotH.cl b/ocml/src/hypotH.cl
index dc0dad36..66b7811a 100644
--- a/ocml/src/hypotH.cl
+++ b/ocml/src/hypotH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(hypot)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(hypot)(half x, half y)
 {
     float fx = (float)x;
diff --git a/ocml/src/i0H.cl b/ocml/src/i0H.cl
index f42f1482..913942f5 100644
--- a/ocml/src/i0H.cl
+++ b/ocml/src/i0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(i0)
 
-INLINEATTR half
+half
 MATH_MANGLE(i0)(half x)
 {
     return (half)MATH_UPMANGLE(i0)((float)x);
diff --git a/ocml/src/i1H.cl b/ocml/src/i1H.cl
index 09b74c6d..d778626b 100644
--- a/ocml/src/i1H.cl
+++ b/ocml/src/i1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(i1)
 
-INLINEATTR half
+half
 MATH_MANGLE(i1)(half x)
 {
     return (half)MATH_UPMANGLE(i1)((float)x);
diff --git a/ocml/src/ilogbD.cl b/ocml/src/ilogbD.cl
index 95ce66fc..0f0b9ace 100644
--- a/ocml/src/ilogbD.cl
+++ b/ocml/src/ilogbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(double x)
 {
     int r = BUILTIN_FREXP_EXP_F64(x) - 1;
diff --git a/ocml/src/ilogbF.cl b/ocml/src/ilogbF.cl
index e84537b8..1a7e1d1d 100644
--- a/ocml/src/ilogbF.cl
+++ b/ocml/src/ilogbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(float x)
 {
     int r = BUILTIN_FREXP_EXP_F32(x) - 1;
diff --git a/ocml/src/ilogbH.cl b/ocml/src/ilogbH.cl
index a5aeef18..d7a274e4 100644
--- a/ocml/src/ilogbH.cl
+++ b/ocml/src/ilogbH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR int2
+CONSTATTR int2
 MATH_MANGLE2(ilogb)(half2 x)
 {
     return (int2)(MATH_MANGLE(ilogb)(x.lo), MATH_MANGLE(ilogb)(x.hi));
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(half x)
 {
     int r = (int)BUILTIN_FREXP_EXP_F16(x) - 1;
diff --git a/ocml/src/isfiniteD.cl b/ocml/src/isfiniteD.cl
index 489a390f..bdca20d5 100644
--- a/ocml/src/isfiniteD.cl
+++ b/ocml/src/isfiniteD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isfiniteF.cl b/ocml/src/isfiniteF.cl
index 11227450..421ab1a1 100644
--- a/ocml/src/isfiniteF.cl
+++ b/ocml/src/isfiniteF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isfiniteH.cl b/ocml/src/isfiniteH.cl
index c2b62152..dce82701 100644
--- a/ocml/src/isfiniteH.cl
+++ b/ocml/src/isfiniteH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isfinite)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isfinite)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isinfD.cl b/ocml/src/isinfD.cl
index 00822a9b..bf33343e 100644
--- a/ocml/src/isinfD.cl
+++ b/ocml/src/isinfD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isinfF.cl b/ocml/src/isinfF.cl
index 4a0bda85..0a408cd1 100644
--- a/ocml/src/isinfF.cl
+++ b/ocml/src/isinfF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isinfH.cl b/ocml/src/isinfH.cl
index db18b9b7..d2978f02 100644
--- a/ocml/src/isinfH.cl
+++ b/ocml/src/isinfH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isinf)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isinf)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_PINF|CLASS_NINF) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isnanD.cl b/ocml/src/isnanD.cl
index d1f1b03d..12400473 100644
--- a/ocml/src/isnanD.cl
+++ b/ocml/src/isnanD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnanF.cl b/ocml/src/isnanF.cl
index 5e305755..47fc9910 100644
--- a/ocml/src/isnanF.cl
+++ b/ocml/src/isnanF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnanH.cl b/ocml/src/isnanH.cl
index 8eb1b8e2..d831c3e8 100644
--- a/ocml/src/isnanH.cl
+++ b/ocml/src/isnanH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isnan)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isnan)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_SNAN|CLASS_QNAN) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnormalD.cl b/ocml/src/isnormalD.cl
index 74907904..55799a17 100644
--- a/ocml/src/isnormalD.cl
+++ b/ocml/src/isnormalD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/isnormalF.cl b/ocml/src/isnormalF.cl
index 2e717e4b..9c640286 100644
--- a/ocml/src/isnormalF.cl
+++ b/ocml/src/isnormalF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/isnormalH.cl b/ocml/src/isnormalH.cl
index 1c0325a3..c33d9092 100644
--- a/ocml/src/isnormalH.cl
+++ b/ocml/src/isnormalH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isnormal)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isnormal)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_PNOR|CLASS_NNOR) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/j0H.cl b/ocml/src/j0H.cl
index f61b3fca..83feff6f 100644
--- a/ocml/src/j0H.cl
+++ b/ocml/src/j0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(j0)
 
-INLINEATTR half
+half
 MATH_MANGLE(j0)(half x)
 {
     return (half)MATH_UPMANGLE(j0)((float)x);
diff --git a/ocml/src/j1H.cl b/ocml/src/j1H.cl
index 7cbaddf4..557038f2 100644
--- a/ocml/src/j1H.cl
+++ b/ocml/src/j1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(j1)
 
-INLINEATTR half
+half
 MATH_MANGLE(j1)(half x)
 {
     return (half)MATH_UPMANGLE(j1)((float)x);
diff --git a/ocml/src/ldexpD.cl b/ocml/src/ldexpD.cl
index 1cf0e093..7ba48285 100644
--- a/ocml/src/ldexpD.cl
+++ b/ocml/src/ldexpD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(ldexp)(double x, int n)
 {
     return BUILTIN_FLDEXP_F64(x, n);
diff --git a/ocml/src/ldexpF.cl b/ocml/src/ldexpF.cl
index 435848aa..29a1da28 100644
--- a/ocml/src/ldexpF.cl
+++ b/ocml/src/ldexpF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(ldexp)(float x, int n)
 {
     return BUILTIN_FLDEXP_F32(x, n);
diff --git a/ocml/src/ldexpH.cl b/ocml/src/ldexpH.cl
index 7f06e7ef..d4d57043 100644
--- a/ocml/src/ldexpH.cl
+++ b/ocml/src/ldexpH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(ldexp)(half2 x, int2 n)
 {
     return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(ldexp)(half x, int n)
 {
     return BUILTIN_FLDEXP_F16(x, BUILTIN_CLAMP_S32(n, SHRT_MIN, SHRT_MAX));
diff --git a/ocml/src/len3D.cl b/ocml/src/len3D.cl
index d0c6e811..fee8e9db 100644
--- a/ocml/src/len3D.cl
+++ b/ocml/src/len3D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(len3)(double x, double y, double z)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/len3F.cl b/ocml/src/len3F.cl
index f2ab9125..bb14ee5a 100644
--- a/ocml/src/len3F.cl
+++ b/ocml/src/len3F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(len3)(float x, float y, float z)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/len3H.cl b/ocml/src/len3H.cl
index 32248780..bb6ef92c 100644
--- a/ocml/src/len3H.cl
+++ b/ocml/src/len3H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(len3)(half x, half y, half z)
 {
     float fx = (float)x;
diff --git a/ocml/src/len4D.cl b/ocml/src/len4D.cl
index 4fe8b898..b05f0cad 100644
--- a/ocml/src/len4D.cl
+++ b/ocml/src/len4D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(len4)(double x, double y, double z, double w)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/len4F.cl b/ocml/src/len4F.cl
index c80e4c0c..24231618 100644
--- a/ocml/src/len4F.cl
+++ b/ocml/src/len4F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(len4)(float x, float y, float z, float w)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/len4H.cl b/ocml/src/len4H.cl
index 6fee1090..9b320c78 100644
--- a/ocml/src/len4H.cl
+++ b/ocml/src/len4H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(len4)(half x, half y, half z, half w)
 {
     float fx = (float)x;
diff --git a/ocml/src/lgammaD.cl b/ocml/src/lgammaD.cl
index 4a9849e9..69e50258 100644
--- a/ocml/src/lgammaD.cl
+++ b/ocml/src/lgammaD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(lgamma)(double x)
 {
     int s;
diff --git a/ocml/src/lgammaF.cl b/ocml/src/lgammaF.cl
index 2f53d18c..4a113c1d 100644
--- a/ocml/src/lgammaF.cl
+++ b/ocml/src/lgammaF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(lgamma)(float x)
 {
     int s;
diff --git a/ocml/src/lgammaH.cl b/ocml/src/lgammaH.cl
index 6472f9f6..81a0fcec 100644
--- a/ocml/src/lgammaH.cl
+++ b/ocml/src/lgammaH.cl
@@ -9,7 +9,7 @@
 
 UGEN(lgamma)
 
-INLINEATTR half
+half
 MATH_MANGLE(lgamma)(half x)
 {
     int s;
diff --git a/ocml/src/lgamma_rH.cl b/ocml/src/lgamma_rH.cl
index 377721d9..b1f6d485 100644
--- a/ocml/src/lgamma_rH.cl
+++ b/ocml/src/lgamma_rH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp)
 {
     int slo, shi;
@@ -18,7 +18,7 @@ MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp)
     return r;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(lgamma_r)(half x, __private int *signp)
 {
     return (half)MATH_UPMANGLE(lgamma_r)((float)x, signp);
diff --git a/ocml/src/lnepD.cl b/ocml/src/lnepD.cl
index 6bece3e7..dfe4484d 100644
--- a/ocml/src/lnepD.cl
+++ b/ocml/src/lnepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(lnep)(double2 a)
 {
     int b = BUILTIN_FREXP_MANT_F64(a.hi) < (2.0/3.0);
diff --git a/ocml/src/lnepF.cl b/ocml/src/lnepF.cl
index 65675582..0c4502f2 100644
--- a/ocml/src/lnepF.cl
+++ b/ocml/src/lnepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 MATH_PRIVATE(lnep)(float2 a)
 {
     int b = BUILTIN_FREXP_MANT_F32(a.hi) < (2.0f/3.0f);
diff --git a/ocml/src/log10H.cl b/ocml/src/log10H.cl
index 1fbf60ae..2a03ff02 100644
--- a/ocml/src/log10H.cl
+++ b/ocml/src/log10H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log10)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log10)(half x)
 {
     return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.344136p-2f);
diff --git a/ocml/src/log1pD.cl b/ocml/src/log1pD.cl
index c5f5252f..240b4626 100644
--- a/ocml/src/log1pD.cl
+++ b/ocml/src/log1pD.cl
@@ -12,7 +12,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(log1p)(double x)
 {
     double z = MATH_PRIVATE(lnep)(add(1.0, x));
diff --git a/ocml/src/log1pF.cl b/ocml/src/log1pF.cl
index b4584519..ce8a5a2b 100644
--- a/ocml/src/log1pF.cl
+++ b/ocml/src/log1pF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(log1p)(float x)
 {
     float z = MATH_PRIVATE(lnep)(add(1.0, x));
diff --git a/ocml/src/log1pH.cl b/ocml/src/log1pH.cl
index da274acf..51b5ff7c 100644
--- a/ocml/src/log1pH.cl
+++ b/ocml/src/log1pH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log1p)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log1p)(half x)
 {
     half ret;
diff --git a/ocml/src/log2H.cl b/ocml/src/log2H.cl
index 3d38a9f6..4a46e968 100644
--- a/ocml/src/log2H.cl
+++ b/ocml/src/log2H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log2)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log2)(half x)
 {
     return BUILTIN_LOG2_F16(x);
diff --git a/ocml/src/logF_base.h b/ocml/src/logF_base.h
index cddad305..763623ab 100644
--- a/ocml/src/logF_base.h
+++ b/ocml/src/logF_base.h
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 #if defined COMPILING_LOG2
 MATH_MANGLE(log2)(float x)
 #elif defined COMPILING_LOG10
diff --git a/ocml/src/logH.cl b/ocml/src/logH.cl
index d62fecc4..08439ff5 100644
--- a/ocml/src/logH.cl
+++ b/ocml/src/logH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log)(half x)
 {
     return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.62e430p-1f);
diff --git a/ocml/src/logbD.cl b/ocml/src/logbD.cl
index cbc52224..2b859853 100644
--- a/ocml/src/logbD.cl
+++ b/ocml/src/logbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(logb)(double x)
 {
     double ret = (double)(BUILTIN_FREXP_EXP_F64(x) - 1);
diff --git a/ocml/src/logbF.cl b/ocml/src/logbF.cl
index f7154d4f..0e6cb740 100644
--- a/ocml/src/logbF.cl
+++ b/ocml/src/logbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(logb)(float x)
 {
     float ret = (float)(BUILTIN_FREXP_EXP_F32(x) - 1);
diff --git a/ocml/src/logbH.cl b/ocml/src/logbH.cl
index 656d07b0..49af766e 100644
--- a/ocml/src/logbH.cl
+++ b/ocml/src/logbH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR UGEN(logb)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(logb)(half x)
 {
     half ret = (half)(BUILTIN_FREXP_EXP_F16(x) - (short)1);
diff --git a/ocml/src/madD.cl b/ocml/src/madD.cl
index e5573141..293e3fce 100644
--- a/ocml/src/madD.cl
+++ b/ocml/src/madD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(mad)(double a, double b, double c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/madF.cl b/ocml/src/madF.cl
index b1f67ec4..06546b44 100644
--- a/ocml/src/madF.cl
+++ b/ocml/src/madF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(mad)(float a, float b, float c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/madH.cl b/ocml/src/madH.cl
index 707f99ac..4f3d393f 100644
--- a/ocml/src/madH.cl
+++ b/ocml/src/madH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(mad)(half2 a, half2 b, half2 c)
 {
     return MATH_MAD2(a, b, c);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(mad)(half a, half b, half c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/maxD.cl b/ocml/src/maxD.cl
index 49b3dccb..7c6664b0 100644
--- a/ocml/src/maxD.cl
+++ b/ocml/src/maxD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(max)(double x, double y)
 {
     return BUILTIN_CMAX_F64(x, y);
diff --git a/ocml/src/maxF.cl b/ocml/src/maxF.cl
index 6e3e17ba..4cd0bfa9 100644
--- a/ocml/src/maxF.cl
+++ b/ocml/src/maxF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(max)(float x, float y)
 {
     return BUILTIN_CMAX_F32(x, y);
diff --git a/ocml/src/maxH.cl b/ocml/src/maxH.cl
index 31cad270..01479c8a 100644
--- a/ocml/src/maxH.cl
+++ b/ocml/src/maxH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(max)(half2 x, half2 y)
 {
     return BUILTIN_CMAX_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(max)(half x, half y)
 {
     return BUILTIN_CMAX_F16(x, y);
diff --git a/ocml/src/maxmagD.cl b/ocml/src/maxmagD.cl
index 9f606da8..3db12aad 100644
--- a/ocml/src/maxmagD.cl
+++ b/ocml/src/maxmagD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(maxmag)(double x, double y)
 {
 #if 0
diff --git a/ocml/src/maxmagF.cl b/ocml/src/maxmagF.cl
index 4997bd06..941fbe4b 100644
--- a/ocml/src/maxmagF.cl
+++ b/ocml/src/maxmagF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(maxmag)(float x, float y)
 {
 #if 0
diff --git a/ocml/src/maxmagH.cl b/ocml/src/maxmagH.cl
index 74ab78aa..9453df4e 100644
--- a/ocml/src/maxmagH.cl
+++ b/ocml/src/maxmagH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(maxmag)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(maxmag)(half x, half y)
 {
     x = BUILTIN_CANONICALIZE_F16(x);
diff --git a/ocml/src/minD.cl b/ocml/src/minD.cl
index c2d0b120..151178c2 100644
--- a/ocml/src/minD.cl
+++ b/ocml/src/minD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(min)(double x, double y)
 {
     return BUILTIN_CMIN_F64(x, y);
diff --git a/ocml/src/minF.cl b/ocml/src/minF.cl
index 9c5e741b..eb38af70 100644
--- a/ocml/src/minF.cl
+++ b/ocml/src/minF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(min)(float x, float y)
 {
     return BUILTIN_CMIN_F32(x, y);
diff --git a/ocml/src/minH.cl b/ocml/src/minH.cl
index 2ed7fa68..2f2eb4d7 100644
--- a/ocml/src/minH.cl
+++ b/ocml/src/minH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(min)(half2 x, half2 y)
 {
     return BUILTIN_CMIN_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(min)(half x, half y)
 {
     return BUILTIN_CMIN_F16(x, y);
diff --git a/ocml/src/minmagD.cl b/ocml/src/minmagD.cl
index 80e7e3f4..cb3dbf3d 100644
--- a/ocml/src/minmagD.cl
+++ b/ocml/src/minmagD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(minmag)(double x, double y)
 {
 #if 0
diff --git a/ocml/src/minmagF.cl b/ocml/src/minmagF.cl
index 41fabef4..8994aac0 100644
--- a/ocml/src/minmagF.cl
+++ b/ocml/src/minmagF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(minmag)(float x, float y)
 {
 #if 0
diff --git a/ocml/src/minmagH.cl b/ocml/src/minmagH.cl
index 8b3fd016..e2659945 100644
--- a/ocml/src/minmagH.cl
+++ b/ocml/src/minmagH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(minmag)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(minmag)(half x, half y)
 {
     x = BUILTIN_CANONICALIZE_F16(x);
diff --git a/ocml/src/modfD.cl b/ocml/src/modfD.cl
index 317abdc6..6ad02e35 100644
--- a/ocml/src/modfD.cl
+++ b/ocml/src/modfD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(modf)(double x, __private double *iptr)
 {
     double tx = BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/modfF.cl b/ocml/src/modfF.cl
index 27b33289..7d9b2964 100644
--- a/ocml/src/modfF.cl
+++ b/ocml/src/modfF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(modf)(float x, __private float *iptr)
 {
     float tx = BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/modfH.cl b/ocml/src/modfH.cl
index 7c40cba9..8c28ef86 100644
--- a/ocml/src/modfH.cl
+++ b/ocml/src/modfH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(modf)(half2 x, __private half2 *iptr)
 {
     half2 tx = BUILTIN_TRUNC_2F16(x);
@@ -18,7 +18,7 @@ MATH_MANGLE2(modf)(half2 x, __private half2 *iptr)
     return BUILTIN_COPYSIGN_2F16(ret, x);
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(modf)(half x, __private half *iptr)
 {
     half tx = BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/mulD.cl b/ocml/src/mulD.cl
index c567b07e..05c8aae6 100644
--- a/ocml/src/mulD.cl
+++ b/ocml/src/mulD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+#define GEN(LN,UN) \
+CONSTATTR double \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fmul, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/mulF.cl b/ocml/src/mulF.cl
index 0a26fa26..4a4e4da0 100644
--- a/ocml/src/mulF.cl
+++ b/ocml/src/mulF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+#define GEN(LN,UN) \
+CONSTATTR float \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fmulf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fmulf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/mulH.cl b/ocml/src/mulH.cl
index 7fcf2141..9d738867 100644
--- a/ocml/src/mulH.cl
+++ b/ocml/src/mulH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+#define GEN(LN,UN) \
+CONSTATTR half \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fmulh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/nanD.cl b/ocml/src/nanD.cl
index 439c9654..762365bc 100644
--- a/ocml/src/nanD.cl
+++ b/ocml/src/nanD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nan)(ulong nancode)
 {
     return AS_DOUBLE((nancode & MANTBITS_DP64) | QNANBITPATT_DP64);
diff --git a/ocml/src/nanF.cl b/ocml/src/nanF.cl
index 7fcf09fd..aeb5e530 100644
--- a/ocml/src/nanF.cl
+++ b/ocml/src/nanF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nan)(uint nancode)
 {
     return AS_FLOAT(QNANBITPATT_SP32 | (nancode & 0xfffff));
diff --git a/ocml/src/nanH.cl b/ocml/src/nanH.cl
index 086c5f6d..b53e48e8 100644
--- a/ocml/src/nanH.cl
+++ b/ocml/src/nanH.cl
@@ -7,14 +7,14 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(nan)(ushort2 nancode)
 {
     ushort2 h = (ushort2)QNANBITPATT_HP16 | (nancode & (ushort2)0x01ff);
     return AS_HALF2(h);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nan)(ushort nancode)
 {
     ushort h = (ushort)QNANBITPATT_HP16 | (nancode & (ushort)0x01ff);
diff --git a/ocml/src/ncdfH.cl b/ocml/src/ncdfH.cl
index 1ac2bf9f..cb7bd711 100644
--- a/ocml/src/ncdfH.cl
+++ b/ocml/src/ncdfH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(ncdf)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(ncdf)(half x)
 {
     return (half)MATH_UPMANGLE(ncdf)((float)x);
diff --git a/ocml/src/ncdfinvD.cl b/ocml/src/ncdfinvD.cl
index 300f6048..f2e6cfd5 100644
--- a/ocml/src/ncdfinvD.cl
+++ b/ocml/src/ncdfinvD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR PUREATTR double
+PUREATTR double
 MATH_MANGLE(ncdfinv)(double x)
 {
     return -0x1.6a09e667f3bcdp+0 * MATH_MANGLE(erfcinv)(x + x);
diff --git a/ocml/src/ncdfinvF.cl b/ocml/src/ncdfinvF.cl
index d04dddd0..9c31025d 100644
--- a/ocml/src/ncdfinvF.cl
+++ b/ocml/src/ncdfinvF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR PUREATTR float
+PUREATTR float
 MATH_MANGLE(ncdfinv)(float x)
 {
     return -0x1.6a09e6p+0f * MATH_MANGLE(erfcinv)(x + x);
diff --git a/ocml/src/ncdfinvH.cl b/ocml/src/ncdfinvH.cl
index 3905a68d..8f4fceca 100644
--- a/ocml/src/ncdfinvH.cl
+++ b/ocml/src/ncdfinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(ncdfinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(ncdfinv)(half x)
 {
     return (half)MATH_UPMANGLE(ncdfinv)((float)x);
diff --git a/ocml/src/nearbyintD.cl b/ocml/src/nearbyintD.cl
index df2d005b..a222532f 100644
--- a/ocml/src/nearbyintD.cl
+++ b/ocml/src/nearbyintD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nearbyint)(double x)
 {
     return BUILTIN_RINT_F64(x);
diff --git a/ocml/src/nearbyintF.cl b/ocml/src/nearbyintF.cl
index 5ae97fff..44be2481 100644
--- a/ocml/src/nearbyintF.cl
+++ b/ocml/src/nearbyintF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nearbyint)(float x)
 {
     return BUILTIN_RINT_F32(x);
diff --git a/ocml/src/nearbyintH.cl b/ocml/src/nearbyintH.cl
index cf2e962c..92c0fa3b 100644
--- a/ocml/src/nearbyintH.cl
+++ b/ocml/src/nearbyintH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(nearbyint)(half2 x)
 {
     return BUILTIN_RINT_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nearbyint)(half x)
 {
     return BUILTIN_RINT_F16(x);
diff --git a/ocml/src/nextafterD.cl b/ocml/src/nextafterD.cl
index ee4031c6..aa1add9f 100644
--- a/ocml/src/nextafterD.cl
+++ b/ocml/src/nextafterD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nextafter)(double x, double y)
 {
     long ix = AS_LONG(x);
diff --git a/ocml/src/nextafterF.cl b/ocml/src/nextafterF.cl
index 4ef25bcd..0c4180c5 100644
--- a/ocml/src/nextafterF.cl
+++ b/ocml/src/nextafterF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nextafter)(float x, float y)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/nextafterH.cl b/ocml/src/nextafterH.cl
index d81028ac..517ce81a 100644
--- a/ocml/src/nextafterH.cl
+++ b/ocml/src/nextafterH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(nextafter)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nextafter)(half x, half y)
 {
     short ix = AS_SHORT(x);
diff --git a/ocml/src/opts.h b/ocml/src/opts.h
index c6bb1146..3a07cbc2 100644
--- a/ocml/src/opts.h
+++ b/ocml/src/opts.h
@@ -7,7 +7,7 @@
 
 #include "oclc.h"
 
-#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801)
+#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801 || __oclc_ISA_version() >= 900)
 #define FINITE_ONLY_OPT() __oclc_finite_only_opt()
 #define UNSAFE_MATH_OPT() __oclc_unsafe_math_opt()
 #define DAZ_OPT() __oclc_daz_opt()
diff --git a/ocml/src/pownH.cl b/ocml/src/pownH.cl
index c8c74d31..3604cae6 100644
--- a/ocml/src/pownH.cl
+++ b/ocml/src/pownH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-PUREATTR INLINEATTR half2
+PUREATTR half2
 MATH_MANGLE2(pown)(half2 x, int2 ny)
 {
     return (half2)(MATH_MANGLE(pown)(x.lo, ny.lo), MATH_MANGLE(pown)(x.hi, ny.hi));
diff --git a/ocml/src/rcbrtF.cl b/ocml/src/rcbrtF.cl
index 1fd6c9c0..0e393e68 100644
--- a/ocml/src/rcbrtF.cl
+++ b/ocml/src/rcbrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rcbrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/remainderF_base.h b/ocml/src/remainderF_base.h
index 4422f826..a79ec5d1 100644
--- a/ocml/src/remainderF_base.h
+++ b/ocml/src/remainderF_base.h
@@ -18,7 +18,7 @@
         CLO = MATH_MAD(__ta, __tb, MATH_MAD(__ta, __hb, MATH_MAD(__ha, __tb, MATH_MAD(__ha, __hb, -CHI)))); \
     } while (0)
 
-CONSTATTR static inline float
+CONSTATTR INLINEATTR static float
 fnma(float a, float b, float c)
 {
     float d;
diff --git a/ocml/src/remquoH.cl b/ocml/src/remquoH.cl
index 3893dded..18106093 100644
--- a/ocml/src/remquoH.cl
+++ b/ocml/src/remquoH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(remquo)(half2 x, half2 y, __private int2 *q7p)
 {
     int qlo, qhi;
diff --git a/ocml/src/rhypotD.cl b/ocml/src/rhypotD.cl
index 0524902b..4339b4f5 100644
--- a/ocml/src/rhypotD.cl
+++ b/ocml/src/rhypotD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rhypot)(double x, double y)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rhypotF.cl b/ocml/src/rhypotF.cl
index 56cc0d2f..cdf08f86 100644
--- a/ocml/src/rhypotF.cl
+++ b/ocml/src/rhypotF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rhypot)(float x, float y)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rhypotH.cl b/ocml/src/rhypotH.cl
index d1c571a0..97acf627 100644
--- a/ocml/src/rhypotH.cl
+++ b/ocml/src/rhypotH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(rhypot)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rhypot)(half x, half y)
 {
     float fx = (float)x;
diff --git a/ocml/src/rintD.cl b/ocml/src/rintD.cl
index a43b5ec8..7c3bb107 100644
--- a/ocml/src/rintD.cl
+++ b/ocml/src/rintD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rint)(double x)
 {
     return BUILTIN_RINT_F64(x);
diff --git a/ocml/src/rintF.cl b/ocml/src/rintF.cl
index a95c223b..17254933 100644
--- a/ocml/src/rintF.cl
+++ b/ocml/src/rintF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rint)(float x)
 {
     return BUILTIN_RINT_F32(x);
diff --git a/ocml/src/rintH.cl b/ocml/src/rintH.cl
index fa789d5b..f2ffd3c1 100644
--- a/ocml/src/rintH.cl
+++ b/ocml/src/rintH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(rint)(half2 x)
 {
     return BUILTIN_RINT_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rint)(half x)
 {
     return BUILTIN_RINT_F16(x);
diff --git a/ocml/src/rlen3D.cl b/ocml/src/rlen3D.cl
index f9442e48..a1081a2c 100644
--- a/ocml/src/rlen3D.cl
+++ b/ocml/src/rlen3D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rlen3)(double x, double y, double z)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rlen3F.cl b/ocml/src/rlen3F.cl
index bf0cab90..03f2c40a 100644
--- a/ocml/src/rlen3F.cl
+++ b/ocml/src/rlen3F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rlen3)(float x, float y, float z)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rlen3H.cl b/ocml/src/rlen3H.cl
index c12755f7..b147b44d 100644
--- a/ocml/src/rlen3H.cl
+++ b/ocml/src/rlen3H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rlen3)(half x, half y, half z)
 {
     float fx = (float)x;
diff --git a/ocml/src/rlen4D.cl b/ocml/src/rlen4D.cl
index 9c4fe9bf..4d16f943 100644
--- a/ocml/src/rlen4D.cl
+++ b/ocml/src/rlen4D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rlen4)(double x, double y, double z, double w)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rlen4F.cl b/ocml/src/rlen4F.cl
index 733f62d1..e6d7603f 100644
--- a/ocml/src/rlen4F.cl
+++ b/ocml/src/rlen4F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rlen4)(float x, float y, float z, float w)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rlen4H.cl b/ocml/src/rlen4H.cl
index 9fb13359..5abb05f5 100644
--- a/ocml/src/rlen4H.cl
+++ b/ocml/src/rlen4H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rlen4)(half x, half y, half z, half w)
 {
     float fx = (float)x;
diff --git a/ocml/src/rootnH.cl b/ocml/src/rootnH.cl
index d17abfc8..5bd94272 100644
--- a/ocml/src/rootnH.cl
+++ b/ocml/src/rootnH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-PUREATTR INLINEATTR half2
+PUREATTR half2
 MATH_MANGLE2(rootn)(half2 x, int2 ny)
 {
     return (half2)(MATH_MANGLE(rootn)(x.lo, ny.lo), MATH_MANGLE(rootn)(x.hi, ny.hi));
diff --git a/ocml/src/roundD.cl b/ocml/src/roundD.cl
index e8281f8d..0bc2aedb 100644
--- a/ocml/src/roundD.cl
+++ b/ocml/src/roundD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(round)(double x)
 {
     double t = BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/roundF.cl b/ocml/src/roundF.cl
index bbaf3e6d..2b98a223 100644
--- a/ocml/src/roundF.cl
+++ b/ocml/src/roundF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(round)(float x)
 {
     float t = BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/roundH.cl b/ocml/src/roundH.cl
index 045f5d95..d735a7fb 100644
--- a/ocml/src/roundH.cl
+++ b/ocml/src/roundH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(round)(half2 x)
 {
     half2 t = BUILTIN_TRUNC_2F16(x);
@@ -18,7 +18,7 @@ MATH_MANGLE2(round)(half2 x)
     return t + o;
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(round)(half x)
 {
     half t = BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/rsqrtD.cl b/ocml/src/rsqrtD.cl
index d67127d4..5fd5d156 100644
--- a/ocml/src/rsqrtD.cl
+++ b/ocml/src/rsqrtD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rsqrt)(double x)
 {
     double y0 = BUILTIN_RSQRT_F64(x);
diff --git a/ocml/src/rsqrtF.cl b/ocml/src/rsqrtF.cl
index dc7df5fb..8349387f 100644
--- a/ocml/src/rsqrtF.cl
+++ b/ocml/src/rsqrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-PUREATTR INLINEATTR float
+PUREATTR float
 MATH_MANGLE(rsqrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/rsqrtH.cl b/ocml/src/rsqrtH.cl
index ec5f9bed..ab42880e 100644
--- a/ocml/src/rsqrtH.cl
+++ b/ocml/src/rsqrtH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR UGEN(rsqrt)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rsqrt)(half x)
 {
     return BUILTIN_RSQRT_F16(x);
diff --git a/ocml/src/scalbD.cl b/ocml/src/scalbD.cl
index 5bfce8a7..cfe4caf3 100644
--- a/ocml/src/scalbD.cl
+++ b/ocml/src/scalbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(scalb)(double x, double y)
 {
     double t = BUILTIN_MIN_F64(BUILTIN_MAX_F64(y, -0x1.0p+20), 0x1.0p+20);
diff --git a/ocml/src/scalbF.cl b/ocml/src/scalbF.cl
index f957fb7b..05d95969 100644
--- a/ocml/src/scalbF.cl
+++ b/ocml/src/scalbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(scalb)(float x, float y)
 {
     float t = BUILTIN_CLAMP_F32(y, -0x1.0p+20f, 0x1.0p+20f);
diff --git a/ocml/src/scalbH.cl b/ocml/src/scalbH.cl
index 2d55c644..53b8cc8e 100644
--- a/ocml/src/scalbH.cl
+++ b/ocml/src/scalbH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(scalb)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(scalb)(half x, half y)
 {
     half t = BUILTIN_MIN_F16(BUILTIN_MAX_F16(y, -0x1.0p+6h), 0x1.0p+6h);
diff --git a/ocml/src/scalbnD.cl b/ocml/src/scalbnD.cl
index 350c47f9..07ecd541 100644
--- a/ocml/src/scalbnD.cl
+++ b/ocml/src/scalbnD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(scalbn)(double x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/scalbnF.cl b/ocml/src/scalbnF.cl
index 49f4e700..b0adcc1a 100644
--- a/ocml/src/scalbnF.cl
+++ b/ocml/src/scalbnF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(scalbn)(float x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/scalbnH.cl b/ocml/src/scalbnH.cl
index 5656013c..f9be702e 100644
--- a/ocml/src/scalbnH.cl
+++ b/ocml/src/scalbnH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(scalbn)(half2 x, int2 n)
 {
     return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(scalbn)(half x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/signbitD.cl b/ocml/src/signbitD.cl
index 3c93ca5b..98681e5d 100644
--- a/ocml/src/signbitD.cl
+++ b/ocml/src/signbitD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(double x)
 {
     return AS_INT2(x).hi < 0;
diff --git a/ocml/src/signbitF.cl b/ocml/src/signbitF.cl
index 3ceec89c..e944a72b 100644
--- a/ocml/src/signbitF.cl
+++ b/ocml/src/signbitF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(float x)
 {
     return AS_INT(x) < 0;
diff --git a/ocml/src/signbitH.cl b/ocml/src/signbitH.cl
index e5fb9130..b5d99170 100644
--- a/ocml/src/signbitH.cl
+++ b/ocml/src/signbitH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(signbit)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(signbit)(half2 x)
          AS_SHORT(x.hi) < 0 ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(half x)
 {
     return AS_SHORT(x) < 0;
diff --git a/ocml/src/sinD.cl b/ocml/src/sinD.cl
index 7ec233bc..8f4464c2 100644
--- a/ocml/src/sinD.cl
+++ b/ocml/src/sinD.cl
@@ -8,17 +8,14 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sin)(double x)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc);
-
-    int2 s = AS_INT2((regn & 1) == 0 ? ss : cc);
-    s.hi ^= (regn > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : s;
diff --git a/ocml/src/sinF.cl b/ocml/src/sinF.cl
index fe6a75d8..c9059771 100644
--- a/ocml/src/sinF.cl
+++ b/ocml/src/sinF.cl
@@ -8,28 +8,22 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sin)(float x)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r =  MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    float s = (regn & 1) != 0 ? cc : ss;
-    s = AS_FLOAT(AS_INT(s) ^ (regn > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
+    float s = (r.i & 1) != 0 ? sc.c : sc.s;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
 
     if (!FINITE_ONLY_OPT()) {
         s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s;
diff --git a/ocml/src/sinH.cl b/ocml/src/sinH.cl
index 1c92458d..7c018cc3 100644
--- a/ocml/src/sinH.cl
+++ b/ocml/src/sinH.cl
@@ -10,17 +10,14 @@
 
 UGEN(sin)
 
-INLINEATTR half
+half
 MATH_MANGLE(sin)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc =  MATH_PRIVATE(sincosred)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincosred)(r, &cc);
-
-    short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc);
-    s ^= (i > (short)1 ? (short)0x8000 : 0) ^ (AS_SHORT(x) & (short)0x8000);
+    short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c);
+    s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : s;
diff --git a/ocml/src/sinbD.cl b/ocml/src/sinbD.cl
index b300f031..c98a8fa2 100644
--- a/ocml/src/sinbD.cl
+++ b/ocml/src/sinbD.cl
@@ -24,31 +24,31 @@
         L = __e; \
     } while (0)
 
-INLINEATTR double
+double
 MATH_PRIVATE(sinb)(double x, int n, double p)
 {
-    double ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
     // This is a properly signed extra precise pi/4
-    ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
-    pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+    double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
+    double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+
+    double sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
 
-    int2 s = AS_INT2((i & 1) == 0 ? ss : cc);
-    s.hi ^= i > 1 ? 0x80000000 : 0;
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= r.i > 1 ? 0x80000000 : 0;
 
     return AS_DOUBLE(s);
 }
diff --git a/ocml/src/sinbF.cl b/ocml/src/sinbF.cl
index 9e26d0b6..cdc139be 100644
--- a/ocml/src/sinbF.cl
+++ b/ocml/src/sinbF.cl
@@ -24,41 +24,36 @@
         L = __e; \
     } while (0)
 
-INLINEATTR float
+float
 MATH_PRIVATE(sinb)(float x, int n, float p)
 {
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
+
 #if defined EXTRA_PRECISION
-    float ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
 
-    ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
-    pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
+    float sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc =  MATH_PRIVATE(sincosred2)(sh, sl);
 #else
-    float r;
-    int i = MATH_PRIVATE(trigred)(&r, x);
-    bool b = r < p;
-    i = (i - b - n) & 3;
-    r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc =  MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    float s = (i & 1) != 0 ? cc : ss;
-    s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0));
+    float s = (r.i & 1) != 0 ? sc.c : sc.s;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0));
     return s;
 }
 
diff --git a/ocml/src/sincosD.cl b/ocml/src/sincosD.cl
index de851c34..891d083d 100644
--- a/ocml/src/sincosD.cl
+++ b/ocml/src/sincosD.cl
@@ -8,22 +8,19 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sincos)(double x, __private double * cp)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc);
+    int flip = r.i > 1 ? (int)0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
 
-    int flip = regn > 1 ? (int)0x80000000 : 0;
-    bool odd = (regn & 1) != 0;
-
-    int2 s = AS_INT2(odd ? cc : ss);
+    int2 s = AS_INT2(odd ? sc.c : sc.s);
     s.hi ^= flip ^ (AS_INT2(x).hi &(int)0x80000000);
-    ss = -ss;
-    int2 c = AS_INT2(odd ? ss : cc);
+    sc.s = -sc.s;
+    int2 c = AS_INT2(odd ? sc.s : sc.c);
     c.hi ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincosF.cl b/ocml/src/sincosF.cl
index 1baa857f..123b4595 100644
--- a/ocml/src/sincosF.cl
+++ b/ocml/src/sincosF.cl
@@ -8,32 +8,26 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sincos)(float x, __private float *cp)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    int flip = regn > 1 ? 0x80000000 : 0;
-    bool odd = (regn & 1) != 0;
-    float s = odd ? cc : ss;
+    int flip = r.i > 1 ? 0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
+    float s = odd ? sc.c : sc.s;
     s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix));
-    ss = -ss;
-    float c = odd ? ss : cc;
+    sc.s = -sc.s;
+    float c = odd ? sc.s : sc.c;
     c = AS_FLOAT(AS_INT(c) ^ flip);
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincosH.cl b/ocml/src/sincosH.cl
index 43a35c6b..bdf62827 100644
--- a/ocml/src/sincosH.cl
+++ b/ocml/src/sincosH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(sincos)(half2 x, __private half2 *cp)
 {
     half2 s;
@@ -19,21 +19,18 @@ MATH_MANGLE2(sincos)(half2 x, __private half2 *cp)
     return s;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(sincos)(half x, __private half *cp)
 {
-    half r;
-    short regn = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincosred)(r, &cc);
-
-    short flip = regn > (short)1 ? (short)0x8000 : (short)0;
-    bool odd = (regn & 1) != 0;
-    short s = AS_SHORT(odd ? cc : ss);
+    short flip = r.i > (short)1 ? (short)0x8000 : (short)0;
+    bool odd = (r.i & (short)1) != (short)0;
+    short s = AS_SHORT(odd ? sc.c : sc.s);
     s ^= flip ^ (AS_SHORT(x) & (short)0x8000);
-    ss = -ss;
-    short c = AS_SHORT(odd ? ss : cc);
+    sc.s = -sc.s;
+    short c = AS_SHORT(odd ? sc.s : sc.c);
     c ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiD.cl b/ocml/src/sincospiD.cl
index 1b92e61a..4ede0cc7 100644
--- a/ocml/src/sincospiD.cl
+++ b/ocml/src/sincospiD.cl
@@ -8,22 +8,19 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sincospi)(double x, __private double * cp)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincospired)(t, &cc);
+    int flip = r.i > 1 ? (int)0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
 
-    int flip = i > 1 ? (int)0x80000000 : 0;
-    bool odd = (i & 1) != 0;
-
-    int2 s = AS_INT2(odd ? cc : ss);
+    int2 s = AS_INT2(odd ? sc.c : sc.s);
     s.hi ^= flip ^ (AS_INT2(x).hi & 0x80000000);
-    ss = -ss;
-    int2 c = AS_INT2(odd ? ss : cc);
+    sc.s = -sc.s;
+    int2 c = AS_INT2(odd ? sc.s : sc.c);
     c.hi ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiF.cl b/ocml/src/sincospiF.cl
index af3528ce..9585bb42 100644
--- a/ocml/src/sincospiF.cl
+++ b/ocml/src/sincospiF.cl
@@ -8,24 +8,21 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sincospi)(float x, __private float *cp)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-    float t;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    float cc;
-    float ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    int flip = i > 1 ? 0x80000000 : 0;
-    bool odd = (i & 1) != 0;
-    float s = odd ? cc : ss;
+    int flip = r.i > 1 ? 0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
+    float s = odd ? sc.c : sc.s;
     s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix));
-    ss = -ss;
-    float c = odd ? ss : cc;
+    sc.s = -sc.s;
+    float c = odd ? sc.s : sc.c;
     c = AS_FLOAT(AS_INT(c) ^ flip);
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiH.cl b/ocml/src/sincospiH.cl
index cba66af7..78249533 100644
--- a/ocml/src/sincospiH.cl
+++ b/ocml/src/sincospiH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp)
 {
     half2 s;
@@ -20,22 +20,18 @@ MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp)
     return s;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(sincospi)(half x, __private half *cp)
 {
-    half t;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    short flip = i > (short)1 ? (short)0x8000 : (short)0;
-    bool odd = (i & (short)1) != (short)0;
-
-    short s = AS_SHORT(odd ? cc : ss);
+    short flip = r.i > (short)1 ? (short)0x8000 : (short)0;
+    bool odd = (r.i & (short)1) != (short)0;
+    short s = AS_SHORT(odd ? sc.c : sc.s);
     s ^= flip ^ (AS_SHORT(x) & (short)0x8000);
-    ss = -ss;
-    short c = AS_SHORT(odd ? ss : cc);
+    sc.s = -sc.s;
+    short c = AS_SHORT(odd ? sc.s : sc.c);
     c ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiredD.cl b/ocml/src/sincospiredD.cl
index 5200346a..aae84504 100644
--- a/ocml/src/sincospiredD.cl
+++ b/ocml/src/sincospiredD.cl
@@ -6,11 +6,11 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigpiredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincospired)(double x, __private double *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincospired)(double x)
 {
-
     double t = x * x;
 
     double sx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 
@@ -26,7 +26,9 @@ MATH_PRIVATE(sincospired)(double x, __private double *cp)
                     -0x1.55d3c7e3c325bp+0), 0x1.03c1f081b5a67p+2), -0x1.3bd3cc9be45dep+2);
     cx = MATH_MAD(t, cx, 1.0);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincospiredF.cl b/ocml/src/sincospiredF.cl
index 786036a1..ac164a17 100644
--- a/ocml/src/sincospiredF.cl
+++ b/ocml/src/sincospiredF.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathF.h"
+#include "trigpiredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincospired)(float x, __private float *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincospired)(float x)
 {
 
     float t = x * x;
@@ -23,7 +24,9 @@ MATH_PRIVATE(sincospired)(float x, __private float *cp)
                    -0x1.3bd3ccp+2f);
     cx = MATH_MAD(t, cx, 1.0f);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincospiredH.cl b/ocml/src/sincospiredH.cl
index 28a0fa7a..33a13ab0 100644
--- a/ocml/src/sincospiredH.cl
+++ b/ocml/src/sincospiredH.cl
@@ -6,11 +6,11 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathH.h"
+#include "trigpiredH.h"
 
-INLINEATTR half
-MATH_PRIVATE(sincospired)(half x, __private half *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincospired)(half x)
 {
-
     half t = x * x;
 
     half sx = MATH_MAD(t, 0x1.b84p+0h, -0x1.46cp+2h);
@@ -20,7 +20,9 @@ MATH_PRIVATE(sincospired)(half x, __private half *cp)
     half cx = MATH_MAD(t, 0x1.fbp+1h, -0x1.3bcp+2h);
     cx = MATH_MAD(t, cx, 1.0h);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincosred2D.cl b/ocml/src/sincosred2D.cl
index 800c1021..3d8c487d 100644
--- a/ocml/src/sincosred2D.cl
+++ b/ocml/src/sincosred2D.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincosred2)(double x, double y)
 {
     const double S0 = -0x1.5555555555555p-3;
     const double S1 =  0x1.1111111110bb3p-7;
@@ -35,7 +36,9 @@ MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp)
     double sxy = MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1);
     sxy = x - MATH_MAD(-x3, S0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5*y), -y));
 
-    *cp = cxy;
-    return sxy;
+    struct scret ret;
+    ret.c = cxy;
+    ret.s = sxy;
+    return ret;
 }
 
diff --git a/ocml/src/sincosred2F.cl b/ocml/src/sincosred2F.cl
index 36767e53..16cd8fde 100644
--- a/ocml/src/sincosred2F.cl
+++ b/ocml/src/sincosred2F.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathF.h"
+#include "trigredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincosred2)(float x, float y)
 {
     const float c0 =  0x1.555556p-5f;
     const float c1 = -0x1.6c16b2p-10f;
@@ -32,7 +33,9 @@ MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp)
     float sxy = MATH_MAD(x2, MATH_MAD(x2, s3, s2), s1);
     sxy = x - MATH_MAD(-x3, s0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5f*y), -y));
 
-    *cp = cxy;
-    return sxy;
+    struct scret ret;
+    ret.c = cxy;
+    ret.s = sxy;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredD.cl b/ocml/src/sincosredD.cl
index ed64d24b..4418d623 100644
--- a/ocml/src/sincosredD.cl
+++ b/ocml/src/sincosredD.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincosred)(double x, __private double *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincosred)(double x)
 {
     const double S0 = -0x1.5555555555555p-3;
     const double S1 =  0x1.1111111110bb3p-7;
@@ -33,7 +34,9 @@ MATH_PRIVATE(sincosred)(double x, __private double *cp)
     double cx = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, C5, C4), C3), C2), C1), C0), v);
     double sx = MATH_MAD(x2*x, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1), S0), x);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredF.cl b/ocml/src/sincosredF.cl
index e4d2cfd5..54167c47 100644
--- a/ocml/src/sincosredF.cl
+++ b/ocml/src/sincosredF.cl
@@ -8,8 +8,8 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincosred)(float x, __private float *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincosred)(float x)
 {
     float t = x * x;
 
@@ -17,7 +17,9 @@ MATH_PRIVATE(sincosred)(float x, __private float *cp)
     float c = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                   0x1.aea668p-16f, -0x1.6c9e76p-10f), 0x1.5557eep-5f), -0x1.000008p-1f), 1.0f);
 
-    *cp = c;
-    return s;
+    struct scret ret;
+    ret.c = c;
+    ret.s = s;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredH.cl b/ocml/src/sincosredH.cl
index a3ffec57..0dd4b17d 100644
--- a/ocml/src/sincosredH.cl
+++ b/ocml/src/sincosredH.cl
@@ -8,14 +8,16 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-INLINEATTR half
-MATH_PRIVATE(sincosred)(half x, __private half *cp)
+CONSTATTR struct scret
+MATH_PRIVATE(sincosred)(half x)
 {
     half t = x * x;
     half s = MATH_MAD(x, t*MATH_MAD(t, 0x1.0bp-7h, -0x1.554p-3h), x);
     half c = MATH_MAD(t, MATH_MAD(t, 0x1.4b4p-5h, -0x1.ffcp-2h), 1.0h);
 
-    *cp = c;
-    return s;
+    struct scret ret;
+    ret.c = c;
+    ret.s = s;
+    return ret;
 }
 
diff --git a/ocml/src/sinhD.cl b/ocml/src/sinhD.cl
index 7d377385..0bab018b 100644
--- a/ocml/src/sinhD.cl
+++ b/ocml/src/sinhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sinh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/sinhF.cl b/ocml/src/sinhF.cl
index 5718e06e..9ea55fc9 100644
--- a/ocml/src/sinhF.cl
+++ b/ocml/src/sinhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(sinh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/sinhH.cl b/ocml/src/sinhH.cl
index 92954661..c3ab5ed7 100644
--- a/ocml/src/sinhH.cl
+++ b/ocml/src/sinhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(sinh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(sinh)(half hx)
 {
     float x = (float)hx * 0x1.715476p+0f;
diff --git a/ocml/src/sinpiD.cl b/ocml/src/sinpiD.cl
index 5393c792..ab208901 100644
--- a/ocml/src/sinpiD.cl
+++ b/ocml/src/sinpiD.cl
@@ -8,17 +8,14 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sinpi)(double x)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    int2 s = AS_INT2((i & 1) == 0 ? ss : cc);
-    s.hi ^= (i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : s;
diff --git a/ocml/src/sinpiF.cl b/ocml/src/sinpiF.cl
index 50fa9a44..2a50553a 100644
--- a/ocml/src/sinpiF.cl
+++ b/ocml/src/sinpiF.cl
@@ -8,20 +8,16 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sinpi)(float x)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    float r;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r);
-
-    float cc;
-    float ss = MATH_PRIVATE(sincospired)(r, &cc);
-
-    float s = (i & 1) == 0 ? ss : cc;
-    s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
+    float s = (r.i & 1) == 0 ? sc.s : sc.c;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
 
     if (!FINITE_ONLY_OPT()) {
         s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s;
diff --git a/ocml/src/sinpiH.cl b/ocml/src/sinpiH.cl
index a429ec5b..c738c222 100644
--- a/ocml/src/sinpiH.cl
+++ b/ocml/src/sinpiH.cl
@@ -10,17 +10,14 @@
 
 UGEN(sinpi)
 
-INLINEATTR half
+half
 MATH_MANGLE(sinpi)(half x)
 {
-    half t;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r =  MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc);
-    s ^= (i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
+    short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c);
+    s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : s;
diff --git a/ocml/src/sqrtD.cl b/ocml/src/sqrtD.cl
index a68f7bd0..6f484fab 100644
--- a/ocml/src/sqrtD.cl
+++ b/ocml/src/sqrtD.cl
@@ -7,27 +7,21 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sqrt)(double x)
 {
     return MATH_SQRT(x);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double x) \
+MATH_MANGLE(LN)(double x) \
 { \
-    return BUILTIN_FULL_UNARY(fsqrt, false, ROUND, x); \
+    return BUILTIN_##UN##_F64(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/sqrtF.cl b/ocml/src/sqrtF.cl
index dbf495c5..051e73b6 100644
--- a/ocml/src/sqrtF.cl
+++ b/ocml/src/sqrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(sqrt)(float x)
 {
     if (CORRECTLY_ROUNDED_SQRT32()) {
@@ -17,27 +17,15 @@ MATH_MANGLE(sqrt)(float x)
     }
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float x) \
+MATH_MANGLE(LN)(float x) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_UNARY(fsqrtf, true, ROUND, x); \
-    } else { \
-        ret = BUILTIN_FULL_UNARY(fsqrtf, false, ROUND, x); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/sqrtH.cl b/ocml/src/sqrtH.cl
index 3c663887..b4488e4e 100644
--- a/ocml/src/sqrtH.cl
+++ b/ocml/src/sqrtH.cl
@@ -9,27 +9,21 @@
 
 CONSTATTR UGEN(sqrt)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(sqrt)(half x)
 {
     return BUILTIN_SQRT_F16(x);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half x) \
+MATH_MANGLE(LN)(half x) \
 { \
-    return BUILTIN_FULL_UNARY(fsqrth, false, ROUND, x); \
+    return BUILTIN_##UN##_F16(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/subD.cl b/ocml/src/subD.cl
index beda1a10..f6c9a92b 100644
--- a/ocml/src/subD.cl
+++ b/ocml/src/subD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fsub, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 
diff --git a/ocml/src/subF.cl b/ocml/src/subF.cl
index 30664d6c..80d7d3c7 100644
--- a/ocml/src/subF.cl
+++ b/ocml/src/subF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fsubf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fsubf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 
diff --git a/ocml/src/subH.cl b/ocml/src/subH.cl
index 6ca8e24b..369792e3 100644
--- a/ocml/src/subH.cl
+++ b/ocml/src/subH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
+#define GEN(LN,UN) \
 CONSTATTR INLINEATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fsubh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 
diff --git a/ocml/src/tanD.cl b/ocml/src/tanD.cl
index 442aa20d..0a3193d3 100644
--- a/ocml/src/tanD.cl
+++ b/ocml/src/tanD.cl
@@ -8,13 +8,12 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tan)(double x)
 {
-    double r, rr;
-    int i = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
 
-    int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r, rr, i & 1));
+    int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r.hi, r.lo, r.i & 1));
     t.hi ^= AS_INT2(x).hi & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanF.cl b/ocml/src/tanF.cl
index 81698c4d..efe22a75 100644
--- a/ocml/src/tanF.cl
+++ b/ocml/src/tanF.cl
@@ -8,22 +8,18 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(tan)(float x)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float t = MATH_PRIVATE(tanred)(r0 + r1, regn & 1);
+#if defined EXTRA_PRECISION
+    float t = MATH_PRIVATE(tanred)(r.hi + r.lo, r.i & 1);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float t = MATH_PRIVATE(tanred)(r, regn & 1);
+    float t = MATH_PRIVATE(tanred)(r.hi, r.i & 1);
 #endif
 
     t = AS_FLOAT(AS_INT(t) ^ (ix ^ ax));
diff --git a/ocml/src/tanH.cl b/ocml/src/tanH.cl
index 201b2c79..36d91d3c 100644
--- a/ocml/src/tanH.cl
+++ b/ocml/src/tanH.cl
@@ -10,13 +10,11 @@
 
 UGEN(tan)
 
-INLINEATTR half
+half
 MATH_MANGLE(tan)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
-
-    short t = AS_SHORT(MATH_PRIVATE(tanred)(r, i & 1));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    short t = AS_SHORT(MATH_PRIVATE(tanred)(r.hi, r.i & (short)1));
     t ^= AS_SHORT(x) & (short)0x8000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanhD.cl b/ocml/src/tanhD.cl
index 834e397e..e0c896d9 100644
--- a/ocml/src/tanhD.cl
+++ b/ocml/src/tanhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tanh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/tanpiD.cl b/ocml/src/tanpiD.cl
index a55fff6f..90c746ef 100644
--- a/ocml/src/tanpiD.cl
+++ b/ocml/src/tanpiD.cl
@@ -8,14 +8,12 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tanpi)(double x)
 {
-    double r;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &r);
-
-    int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r, i & 1));
-    t.hi ^= (((i == 1) | (i == 2)) & (r == 0.0)) ? 0x80000000 : 0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r.hi, r.i & 1));
+    t.hi ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0)) ? 0x80000000 : 0;
     t.hi ^= AS_INT2(x).hi & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiF.cl b/ocml/src/tanpiF.cl
index fc188bc3..a13b9143 100644
--- a/ocml/src/tanpiF.cl
+++ b/ocml/src/tanpiF.cl
@@ -8,14 +8,12 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(tanpi)(float x)
 {
-    float r;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x), &r);
-
-    int t = AS_INT(MATH_PRIVATE(tanpired)(r, i & 1));
-    t ^= (((i == 1) | (i == 2)) & (r == 0.0f)) ? (int)0x80000000 : 0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x));
+    int t = AS_INT(MATH_PRIVATE(tanpired)(r.hi, r.i & 1));
+    t ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0f)) ? (int)0x80000000 : 0;
     t ^= AS_INT(x) & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiH.cl b/ocml/src/tanpiH.cl
index a36e97c0..b0571ba5 100644
--- a/ocml/src/tanpiH.cl
+++ b/ocml/src/tanpiH.cl
@@ -10,14 +10,12 @@
 
 CONSTATTR UGEN(tanpi)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(tanpi)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &r);
-
-    short t = AS_SHORT(MATH_PRIVATE(tanpired)(r, i & (short)1));
-    t ^= (((i == (short)1) | (i == (short)2)) & (r == 0.0h)) ? (short)0x8000 : (short)0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    short t = AS_SHORT(MATH_PRIVATE(tanpired)(r.hi, r.i & (short)1));
+    t ^= (((r.i == (short)1) | (r.i == (short)2)) & (r.hi == 0.0h)) ? (short)0x8000 : (short)0;
     t ^= AS_SHORT(x) & (short)0x8000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiredD.cl b/ocml/src/tanpiredD.cl
index 5d877733..ecedafd2 100644
--- a/ocml/src/tanpiredD.cl
+++ b/ocml/src/tanpiredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(tanpired)(double x, int i)
 {
     double s = x * x;
diff --git a/ocml/src/tanpiredF.cl b/ocml/src/tanpiredF.cl
index 25b2467b..96e63ad2 100644
--- a/ocml/src/tanpiredF.cl
+++ b/ocml/src/tanpiredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(tanpired)(float x, int i)
 {
     float s = x * x;
diff --git a/ocml/src/tanpiredH.cl b/ocml/src/tanpiredH.cl
index 221797f7..645f58a5 100644
--- a/ocml/src/tanpiredH.cl
+++ b/ocml/src/tanpiredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(tanpired)(half x, short i)
 {
     half s = x * x;
diff --git a/ocml/src/tanred2D.cl b/ocml/src/tanred2D.cl
index ae5d49c6..18dd4bf8 100644
--- a/ocml/src/tanred2D.cl
+++ b/ocml/src/tanred2D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(tanred2)(double x, double xx, int sel)
 {
     const double piby4_lead = 0x1.921fb54442d18p-1;
diff --git a/ocml/src/tanredF.cl b/ocml/src/tanredF.cl
index 0bb6744c..b1a196cc 100644
--- a/ocml/src/tanredF.cl
+++ b/ocml/src/tanredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(tanred)(float x, int i)
 {
     float s = x * x;
diff --git a/ocml/src/tanredH.cl b/ocml/src/tanredH.cl
index bade03c2..b11844f2 100644
--- a/ocml/src/tanredH.cl
+++ b/ocml/src/tanredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(tanred)(half x, short i)
 {
     half s = x * x;
diff --git a/ocml/src/tgammaH.cl b/ocml/src/tgammaH.cl
index 07a72ef2..8ae01c2c 100644
--- a/ocml/src/tgammaH.cl
+++ b/ocml/src/tgammaH.cl
@@ -9,7 +9,7 @@
 
 UGEN(tgamma)
 
-INLINEATTR half
+half
 MATH_MANGLE(tgamma)(half x)
 {
     return (half)MATH_UPMANGLE(tgamma)((float)x);
diff --git a/ocml/src/trigpiredD.cl b/ocml/src/trigpiredD.cl
index fddfef06..7bea3077 100644
--- a/ocml/src/trigpiredD.cl
+++ b/ocml/src/trigpiredD.cl
@@ -8,13 +8,16 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigpired)(double x, __private double *r)
+CONSTATTR struct redret
+MATH_PRIVATE(trigpired)(double x)
 {
     double t = 2.0 * BUILTIN_FRACTION_F64(0.5 * x);
     x = x > 1.0 ? t : x;
     t = BUILTIN_RINT_F64(2.0 * x);
-    *r = MATH_MAD(t, -0.5, x);
-    return (int)t & 0x3;
+    // Bundle MATH_MAD(t, -0.5, x) with the low two bits of t into one result.
+    struct redret res;
+    res.i = (int)t & 0x3;
+    res.hi = MATH_MAD(t, -0.5, x);
+    return res;
 }
 
diff --git a/ocml/src/trigpiredD.h b/ocml/src/trigpiredD.h
index 1a464150..3d82c947 100644
--- a/ocml/src/trigpiredD.h
+++ b/ocml/src/trigpiredD.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigpired)(double x, __private double *r);
-extern double MATH_PRIVATE(sincospired)(double x, __private double *cp);
+struct redret {
+    double hi;
+    int i;
+};
+
+struct scret {
+    double c;
+    double s;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(double x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(double x);
 extern CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i);
 
diff --git a/ocml/src/trigpiredF.cl b/ocml/src/trigpiredF.cl
index ab2fa371..bcdc5727 100644
--- a/ocml/src/trigpiredF.cl
+++ b/ocml/src/trigpiredF.cl
@@ -8,13 +8,16 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigpired)(float x, __private float *r)
+CONSTATTR struct redret
+MATH_PRIVATE(trigpired)(float x)
 {
     float t = 2.0f * BUILTIN_FRACTION_F32(0.5f * x);
     x = x > 1.0f ? t : x;
     t = BUILTIN_RINT_F32(2.0f * x);
-    *r = MATH_MAD(t, -0.5f, x);
-    return (int)t & 0x3;
+    // Bundle MATH_MAD(t, -0.5f, x) with the low two bits of t into one result.
+    struct redret res;
+    res.i = (int)t & 0x3;
+    res.hi = MATH_MAD(t, -0.5f, x);
+    return res;
 }
 
diff --git a/ocml/src/trigpiredF.h b/ocml/src/trigpiredF.h
index 162544ec..f6727b5b 100644
--- a/ocml/src/trigpiredF.h
+++ b/ocml/src/trigpiredF.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigpired)(float x, __private float *r);
-extern float MATH_PRIVATE(sincospired)(float x, __private float *cp);
+struct redret {
+    float hi;
+    int i;
+};
+
+struct scret {
+    float s;
+    float c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(float x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(float x);
 extern CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i);
 
diff --git a/ocml/src/trigpiredH.cl b/ocml/src/trigpiredH.cl
index b68d43e5..7615528f 100644
--- a/ocml/src/trigpiredH.cl
+++ b/ocml/src/trigpiredH.cl
@@ -8,13 +8,16 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-INLINEATTR short
-MATH_PRIVATE(trigpired)(half x, __private half *r)
+CONSTATTR struct redret
+MATH_PRIVATE(trigpired)(half x)
 {
     half t = 2.0h * BUILTIN_FRACTION_F16(0.5h * x);
     x = x > 1.0h ? t : x;
     t = BUILTIN_RINT_F16(2.0h * x);
-    *r = MATH_MAD(t, -0.5h, x);
-    return (short)t & (short)0x3;
+    // Bundle MATH_MAD(t, -0.5h, x) with the low two bits of t into one result.
+    struct redret res;
+    res.i = (short)t & (short)0x3;
+    res.hi = MATH_MAD(t, -0.5h, x);
+    return res;
 }
 
diff --git a/ocml/src/trigpiredH.h b/ocml/src/trigpiredH.h
index 1294ebea..b2d240f5 100644
--- a/ocml/src/trigpiredH.h
+++ b/ocml/src/trigpiredH.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern short MATH_PRIVATE(trigpired)(half x, __private half *r);
-extern half MATH_PRIVATE(sincospired)(half x, __private half *cp);
+struct redret {
+    half hi;
+    short i;
+};
+
+struct scret {
+    half s;
+    half c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(half x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(half x);
 extern CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i);
 
diff --git a/ocml/src/trigredD.cl b/ocml/src/trigredD.cl
index 60fc8b3f..c9700fd8 100644
--- a/ocml/src/trigredD.cl
+++ b/ocml/src/trigredD.cl
@@ -8,12 +8,12 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x)
+CONSTATTR struct redret
+MATH_PRIVATE(trigred)(double x)
 {
     if (x < 0x1.0p+21)
-        return MATH_PRIVATE(trigredsmall)(r, rr, x);
+        return MATH_PRIVATE(trigredsmall)(x);
     else
-        return MATH_PRIVATE(trigredlarge)(r, rr, x);
+        return MATH_PRIVATE(trigredlarge)(x);
 }
 
diff --git a/ocml/src/trigredD.h b/ocml/src/trigredD.h
index 6dd96f67..26a9599d 100644
--- a/ocml/src/trigredD.h
+++ b/ocml/src/trigredD.h
@@ -5,12 +5,23 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x);
-extern int MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x);
-extern int MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x);
+struct redret {
+    double lo;
+    double hi;
+    int i;
+};
 
-extern double MATH_PRIVATE(sincosred)(double x, __private double *cp);
-extern double MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp);
+struct scret {
+    double s;
+    double c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(double x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(double x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigred)(double x);
+
+extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(double x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(double x, double y);
 
 extern CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel);
 
diff --git a/ocml/src/trigredF.cl b/ocml/src/trigredF.cl
index c73a0fb7..20cbd39b 100644
--- a/ocml/src/trigredF.cl
+++ b/ocml/src/trigredF.cl
@@ -8,24 +8,12 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigred)(__private float *r, float x)
-#endif
+CONSTATTR struct redret
+MATH_PRIVATE(trigred)(float x)
 {
     if (x < SMALL_BOUND)
-#if defined EXTRA_PRECISION
-        return MATH_PRIVATE(trigredsmall)(r, rr, x);
-#else
-        return MATH_PRIVATE(trigredsmall)(r, x);
-#endif
+        return MATH_PRIVATE(trigredsmall)(x);
     else
-#if defined EXTRA_PRECISION
-        return MATH_PRIVATE(trigredlarge)(r, rr, x);
-#else
-        return MATH_PRIVATE(trigredlarge)(r, x);
-#endif
+        return MATH_PRIVATE(trigredlarge)(x);
 }
 
diff --git a/ocml/src/trigredF.h b/ocml/src/trigredF.h
index d7a81a1b..e0e50c93 100644
--- a/ocml/src/trigredF.h
+++ b/ocml/src/trigredF.h
@@ -8,18 +8,33 @@
 #define SMALL_BOUND 0x1.0p+17f
 
 #if defined EXTRA_PRECISION
-extern int MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x);
-extern int MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x);
-extern int MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x);
+struct redret {
+    float hi;
+    float lo;
+    int i;
+};
 #else
-extern int MATH_PRIVATE(trigredsmall)(__private float *r, float x);
-extern int MATH_PRIVATE(trigredlarge)(__private float *r, float x);
-extern int MATH_PRIVATE(trigred)(__private float *r, float x);
+struct redret {
+    float hi;
+    int i;
+};
 #endif
 
-extern float MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp);
+struct scret {
+    float s;
+    float c;
+};
 
-extern float MATH_PRIVATE(sincosred)(float x, __private float *cp);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(float x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigred)(float x);
+
+
+#if defined EXTRA_PRECISION
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred2)(float x, float y);
+#else
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred)(float x);
+#endif
 
 extern CONSTATTR float MATH_PRIVATE(tanred)(float x, int regn);
 
diff --git a/ocml/src/trigredH.cl b/ocml/src/trigredH.cl
index 5fcf39b1..ac75d51a 100644
--- a/ocml/src/trigredH.cl
+++ b/ocml/src/trigredH.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathH.h"
+#include "trigredH.h"
 
-__attribute__((always_inline)) short
-MATH_PRIVATE(trigred)(__private half *r, half hx)
+CONSTATTR struct redret
+MATH_PRIVATE(trigred)(half hx)
 {
     const float twobypi = 0x1.45f306p-1f;
     const float pb2_a = 0x1.92p+0f;
@@ -18,8 +19,9 @@ MATH_PRIVATE(trigred)(__private half *r, half hx)
     float x = (float)hx;
     float fn = BUILTIN_RINT_F32(x * twobypi);
 
-    *r = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x)));
-
-    return (int)fn & 0x3;
+    struct redret res; // carries the half-precision remainder and the low two bits of fn
+    res.i = (int)fn & 0x3;
+    res.hi = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x)));
+    return res;
 }
 
diff --git a/ocml/src/trigredH.h b/ocml/src/trigredH.h
index 97764561..2f02b42a 100644
--- a/ocml/src/trigredH.h
+++ b/ocml/src/trigredH.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern short MATH_PRIVATE(trigred)(__private half *r, half x);
-extern half MATH_PRIVATE(sincosred)(half x, __private half *cp);
-extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short regn);
+struct redret {
+    half hi;
+    short i;
+};
+
+struct scret {
+    half s;
+    half c;
+};
+
+extern CONSTATTR struct redret  MATH_PRIVATE(trigred)(half x);
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred)(half x);
+extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short i);
 
diff --git a/ocml/src/trigredlargeD.cl b/ocml/src/trigredlargeD.cl
index caea8352..02804e0b 100644
--- a/ocml/src/trigredlargeD.cl
+++ b/ocml/src/trigredlargeD.cl
@@ -63,8 +63,8 @@
         C3 += C2; \
     } while (0)
 
-int
-MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x)
+CONSTATTR struct redret
+MATH_PRIVATE(trigredlarge)(double x)
 {
     // Scale x by relevant part of 2/pi
     double p2 = BUILTIN_TRIG_PREOP_F64(x, 0);
@@ -106,9 +106,11 @@ MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x)
     double rt = BUILTIN_FMA_F64(f1, pio2h, BUILTIN_FMA_F64(f2, pio2t, BUILTIN_FMA_F64(f2, pio2h, -rh)));
 
     FSUM2(rh, rt, rh, rt);
-    *r = rh;
-    *rr = rt;
 
-    return i & 0x3;
+    struct redret res; // rh/rt hold the values left by the FSUM2 step above
+    res.i = i & 0x3;
+    res.lo = rt;
+    res.hi = rh;
+    return res;
 }
 
diff --git a/ocml/src/trigredlargeF.cl b/ocml/src/trigredlargeF.cl
index dcb2057c..94ea8ae5 100644
--- a/ocml/src/trigredlargeF.cl
+++ b/ocml/src/trigredlargeF.cl
@@ -17,12 +17,8 @@
     HI = BUILTIN_MULHI_U32(A, B); \
     HI += LO < C
 
-int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigredlarge)(__private float *r, float x)
-#endif
+CONSTATTR struct redret
+MATH_PRIVATE(trigredlarge)(float x)
 {
     int xe = (int)(AS_UINT(x) >> 23) - 127;
     uint xm = 0x00800000U | (AS_UINT(x) & 0x7fffffU);
@@ -152,16 +148,18 @@ MATH_PRIVATE(trigredlarge)(__private float *r, float x)
              MATH_MAD(q0, pio2h, q1*pio2t);
     }
 
+    struct redret ret;
 #if defined EXTRA_PRECISION
     float t = rh + rt;
     rt = rt - (t - rh);
 
-    *r = t;
-    *rr = rt;
+    ret.hi = t;
+    ret.lo = rt;
 #else
-    *r = rh + rt;
+    ret.hi  = rh + rt;
 #endif
 
-    return ((i >> 1) + (i & 1)) & 0x3;
+    ret.i = ((i >> 1) + (i & 1)) & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigredsmallD.cl b/ocml/src/trigredsmallD.cl
index 59f74c87..0cac73ef 100644
--- a/ocml/src/trigredsmallD.cl
+++ b/ocml/src/trigredsmallD.cl
@@ -8,8 +8,8 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x)
+CONSTATTR struct redret
+MATH_PRIVATE(trigredsmall)(double x)
 {
     const double twobypi = 0x1.45f306dc9c883p-1;
     const double piby2_h = 0x1.921fb54442d18p+0;
@@ -27,9 +27,10 @@ MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x)
     double rh = yh + yt;
     double rt = yt - (rh - yh);
 
-    *r = rh;
-    *rr = rt;
-
-    return (int)dn & 0x3;
+    struct redret res; // rh = yh + yt, and rt = yt - (rh - yh) is kept as the low part
+    res.i = (int)dn & 0x3;
+    res.lo = rt;
+    res.hi = rh;
+    return res;
 }
 
diff --git a/ocml/src/trigredsmallF.cl b/ocml/src/trigredsmallF.cl
index eaf2bc08..c93a2761 100644
--- a/ocml/src/trigredsmallF.cl
+++ b/ocml/src/trigredsmallF.cl
@@ -22,12 +22,8 @@
         D = __t + (((C - __t) - __ph) - __pt); \
     } while(0)
 
-static inline int
-#if defined EXTRA_PRECISION
-mad_reduce(__private float *hi, __private float *lo, float x)
-#else
-mad_reduce(__private float *hi, float x)
-#endif
+static inline struct redret
+mad_reduce(float x)
 {
 #if defined EXTRA_PRECISION
 #error Not implemented
@@ -54,17 +50,16 @@ mad_reduce(__private float *hi, float x)
     float r;
     FNMA(fn, fnh, fnl, piby2_h, piby2_hh, piby2_hl, x, r);
     FNMA(fn, fnh, fnl, piby2_m, piby2_mh, piby2_ml, r, r);
-    *hi = MATH_MAD(-piby2_l, fn, r);
-    return (int)fn & 0x3;
+
+    struct redret ret;
+    ret.hi = MATH_MAD(-piby2_l, fn, r);
+    ret.i = (int)fn & 0x3;
+    return ret;
 #endif
 }
 
-static inline int
-#if defined EXTRA_PRECISION
-fma_reduce(__private float *hi, __private float *lo, float x)
-#else
-fma_reduce(__private float *hi, float x)
-#endif
+static inline struct redret
+fma_reduce(float x)
 {
     const float twobypi = 0x1.45f306p-1f;
     const float piby2_h = 0x1.921fb4p+0f;
@@ -72,6 +67,9 @@ fma_reduce(__private float *hi, float x)
     const float piby2_l = 0x1.846988p-48f;
 
     float fn = BUILTIN_RINT_F32(x * twobypi);
+
+    struct redret ret;
+
 #if defined EXTRA_PRECISION
     float xt = BUILTIN_FMA_F32(fn, -piby2_h, x);
     float yh = BUILTIN_FMA_F32(fn, -piby2_m, xt);
@@ -82,34 +80,24 @@ fma_reduce(__private float *hi, float x)
     float yt = BUILTIN_FMA_F32(fn, -piby2_l, ((th - yh) + tt) - pt);
     float rh = yh + yt;
     float rt = yt - (rh - yh);
-    *hi = rh;
-    *lo = rt;
+    ret.hi = rh;
+    ret.lo = rt;
 #else
     float r = BUILTIN_FMA_F32(fn, -piby2_l, BUILTIN_FMA_F32(fn, -piby2_m, BUILTIN_FMA_F32(fn, -piby2_h, x)));
-    *hi = r;
+    ret.hi = r;
 #endif
-    return (int)fn & 0x3;
+
+    ret.i =(int)fn & 0x3;
+    return ret;
 }
 
-INLINEATTR int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigredsmall)(__private float *r, float x)
-#endif
+CONSTATTR struct redret
+MATH_PRIVATE(trigredsmall)(float x)
 {
     if (HAVE_FAST_FMA32()) {
-#if defined EXTRA_PRECISION
-	return fma_reduce(r, rr, x);
-#else
-	return fma_reduce(r, x);
-#endif
+	return fma_reduce(x);
     } else {
-#if defined EXTRA_PRECISION
-        return mad_reduce(r, rr, x);
-#else
-	return mad_reduce(r, x);
-#endif
+	return mad_reduce(x);
     }
 }
 
diff --git a/ocml/src/truncD.cl b/ocml/src/truncD.cl
index 91810829..b1ae0417 100644
--- a/ocml/src/truncD.cl
+++ b/ocml/src/truncD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(trunc)(double x)
 {
     return BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/truncF.cl b/ocml/src/truncF.cl
index 88ee87da..3d279363 100644
--- a/ocml/src/truncF.cl
+++ b/ocml/src/truncF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(trunc)(float x)
 {
     return BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/truncH.cl b/ocml/src/truncH.cl
index 77292247..6787af80 100644
--- a/ocml/src/truncH.cl
+++ b/ocml/src/truncH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(trunc)(half2 x)
 {
     return BUILTIN_TRUNC_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(trunc)(half x)
 {
     return BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/y0H.cl b/ocml/src/y0H.cl
index b2a81454..c187f45a 100644
--- a/ocml/src/y0H.cl
+++ b/ocml/src/y0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(y0)
 
-INLINEATTR half
+half
 MATH_MANGLE(y0)(half x)
 {
     return (half)MATH_UPMANGLE(y0)((float)x);
diff --git a/ocml/src/y1H.cl b/ocml/src/y1H.cl
index 0c4197f0..a09ad9ef 100644
--- a/ocml/src/y1H.cl
+++ b/ocml/src/y1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(y1)
 
-INLINEATTR half
+half
 MATH_MANGLE(y1)(half x)
 {
     return (half)MATH_UPMANGLE(y1)((float)x);
diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt
index 8ac5f76f..8da642aa 100644
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@@ -14,6 +14,7 @@ file(GLOB cl_sources
   ${CMAKE_CURRENT_SOURCE_DIR}/src/math/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/media/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/misc/*.cl
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/pipes/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/relational/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/subgroup/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/vldst/*.cl
@@ -36,7 +37,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs
diff --git a/opencl/src/integer/clz.cl b/opencl/src/integer/clz.cl
index f24a648a..c3f4b6af 100644
--- a/opencl/src/integer/clz.cl
+++ b/opencl/src/integer/clz.cl
@@ -20,33 +20,25 @@ UEXP(ulong,clz)
 UEXPATTR char
 clz(char x)
 {
-    uint y = (uint)(uchar)x;
-    uint z = __ockl_clz_u32(y);
-    return (char)(z - 24u);
+    return (char)__ockl_clz_u8((uchar)x);
 }
 
 UEXPATTR uchar
 clz(uchar x)
 {
-    uint y = (uint)x;
-    uint z = __ockl_clz_u32(y);
-    return (uchar)(z - 24u);
+    return __ockl_clz_u8(x);
 }
 
 UEXPATTR short
 clz(short x)
 {
-    uint y = (uint)(ushort)x;
-    uint z = __ockl_clz_u32(y);
-    return (short)(z - 16u);
+    return (short)__ockl_clz_u16((ushort)x);
 }
 
 UEXPATTR ushort
 clz(ushort x)
 {
-    uint y = (uint)x;
-    uint z = __ockl_clz_u32(y);
-    return (ushort)(z - 16u);
+    return __ockl_clz_u16(x);
 }
 
 UEXPATTR int
@@ -61,16 +53,15 @@ clz(uint x)
     return __ockl_clz_u32(x);
 }
 
-__attribute__((always_inline, const)) static ulong
-clz_u64(ulong x)
+UEXPATTR long
+clz(long x)
 {
-    uint xlo = (uint)x;
-    uint xhi = (uint)(x >> 32);
-    uint zlo = __ockl_clz_u32(xlo) + 32u;
-    uint zhi = __ockl_clz_u32(xhi);
-    return (ulong)(xhi == 0 ? zlo : zhi);
+    return (long)__ockl_clz_u64((ulong)x);
 }
 
-extern __attribute__((overloadable, always_inline, const, alias("clz_u64"))) ulong clz(ulong);
-extern __attribute__((overloadable, always_inline, const, alias("clz_u64")))  long clz(long);
+UEXPATTR ulong
+clz(ulong x)
+{
+    return __ockl_clz_u64(x);
+}
 
diff --git a/opencl/src/integer/ctz.cl b/opencl/src/integer/ctz.cl
index d75fc386..b583bf52 100644
--- a/opencl/src/integer/ctz.cl
+++ b/opencl/src/integer/ctz.cl
@@ -20,29 +20,25 @@ UEXP(ulong,ctz)
 UEXPATTR char
 ctz(char x)
 {
-    uint y = (uint)(uchar)x;
-    return (char)min(__ockl_ctz_u32(y), 8u);
+    return (char)__ockl_ctz_u8((uchar)x);
 }
 
 UEXPATTR uchar
 ctz(uchar x)
 {
-    uint y = (uint)x;
-    return (uchar)min(__ockl_ctz_u32(y), 8u);
+    return __ockl_ctz_u8(x);
 }
 
 UEXPATTR short
 ctz(short x)
 {
-    uint y = (uint)(ushort)x;
-    return (short)min(__ockl_ctz_u32(y), 16u);
+    return (short)__ockl_ctz_u16((ushort)x);
 }
 
 UEXPATTR ushort
 ctz(ushort x)
 {
-    uint y = (uint)x;
-    return (ushort)min(__ockl_ctz_u32(y), 16u);
+    return __ockl_ctz_u16(x);
 }
 
 UEXPATTR int
@@ -57,16 +53,15 @@ ctz(uint x)
     return __ockl_ctz_u32(x);
 }
 
-__attribute__((always_inline, const)) static ulong
-ctz_u64(ulong x)
+UEXPATTR long
+ctz(long x)
 {
-    uint xlo = (uint)x;
-    uint xhi = (uint)(x >> 32);
-    uint zlo = __ockl_ctz_u32(xlo);
-    uint zhi = __ockl_ctz_u32(xhi) + 32u;
-    return (ulong)(xlo == 0 ? zhi : zlo);
+    return (long)__ockl_ctz_u64((ulong)x);
 }
 
-extern __attribute__((overloadable, always_inline, const, alias("ctz_u64"))) ulong ctz(ulong);
-extern __attribute__((overloadable, always_inline, const, alias("ctz_u64")))  long ctz(long);
+UEXPATTR ulong
+ctz(ulong x)
+{
+    return __ockl_ctz_u64(x);
+}
 
diff --git a/opencl/src/integer/popcount.cl b/opencl/src/integer/popcount.cl
index f40f32b1..53c525ad 100644
--- a/opencl/src/integer/popcount.cl
+++ b/opencl/src/integer/popcount.cl
@@ -57,14 +57,12 @@ popcount(uint x)
 UEXPATTR long
 popcount(long x)
 {
-    uint2 y = as_uint2(x);
-    return (long)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi));
+    return (long)__ockl_popcount_u64((ulong)x);
 }
 
 UEXPATTR ulong
 popcount(ulong x)
 {
-    uint2 y = as_uint2(x);
-    return (ulong)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi));
+    return __ockl_popcount_u64(x);
 }
 
diff --git a/opencl/src/misc/asqf.cl b/opencl/src/misc/asqf.cl
index 84ff26ed..d6a05968 100644
--- a/opencl/src/misc/asqf.cl
+++ b/opencl/src/misc/asqf.cl
@@ -1,3 +1,9 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
 
 #include "ockl.h"
 
diff --git a/opencl/src/misc/atom.cl b/opencl/src/misc/atom.cl
new file mode 100644
index 00000000..d9f9ab38
--- /dev/null
+++ b/opencl/src/misc/atom.cl
@@ -0,0 +1,390 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+#define ATTR __attribute__((overloadable, always_inline))
+// AC_<T>(X): conversion applied to a value argument of type T before it is handed to a builtin
+#define AC_int(X) X
+#define AC_uint(X) X
+#define AC_long(X) X
+#define AC_ulong(X) X
+#define AC_intptr_t(X) X
+#define AC_uintptr_t(X) X
+#define AC_size_t(X) X
+#define AC_ptrdiff_t(X) X
+#define AC_float(X) as_int(X)
+#define AC_double(X) as_long(X)
+// RC_<T>(X): conversion applied to a builtin's result to turn it back into type T
+#define RC_int(X) X
+#define RC_uint(X) X
+#define RC_long(X) X
+#define RC_ulong(X) X
+#define RC_intptr_t(X) X
+#define RC_uintptr_t(X) X
+#define RC_size_t(X) X
+#define RC_ptrdiff_t(X) X
+#define RC_float(X) as_float(X)
+#define RC_double(X) as_double(X)
+// PC_<T>: cast placed in front of the atomic-object pointer; float/double are routed to integer atomics
+#define PC_int
+#define PC_uint
+#define PC_long
+#define PC_ulong
+#define PC_intptr_t
+#define PC_uintptr_t
+#define PC_size_t
+#define PC_ptrdiff_t
+#define PC_float (volatile atomic_int *)
+#define PC_double (volatile atomic_long *)
+// EC_<T>: cast placed in front of the "expected" pointer passed to compare-exchange
+#define EC_int
+#define EC_uint
+#define EC_long
+#define EC_ulong
+#define EC_intptr_t
+#define EC_uintptr_t
+#define EC_size_t
+#define EC_ptrdiff_t
+#define EC_float (int *)
+#define EC_double (long *)
+// Memory order and scope used by the extension / OpenCL 1.2 style entry points (atom_* and atomic_*) below
+#define OCL12_MEMORY_ORDER memory_order_relaxed
+#define OCL12_MEMORY_SCOPE memory_scope_device
+// inc and dec are built from fetch_add / fetch_sub; the generators pass the constant 1
+#define F_inc __opencl_atomic_fetch_add
+#define F_dec __opencl_atomic_fetch_sub
+
+// extension and 1.2 functions: fetch-and-op entry points using OCL12_MEMORY_ORDER / OCL12_MEMORY_SCOPE
+#define GEN1(T,A,O) /* atom_<O>: T = value type, A = qualifier inserted before T (always empty here), O = operation */ \
+ATTR T \
+atom_##O(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+// GEN2 generates the same body as GEN1 under the atomic_<O> name
+#define GEN2(T,A,O) \
+ATTR T \
+atomic_##O(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+// OPSA(F,T,A): apply generator F to each of the seven fetch operations for type T and qualifier A
+#define OPSA(F,T,A) \
+    F(T,A,add) \
+    F(T,A,sub) \
+    F(T,A,max) \
+    F(T,A,min) \
+    F(T,A,and) \
+    F(T,A,or) \
+    F(T,A,xor)
+// OPS(F,T): OPSA with an empty qualifier argument
+#define OPS(F,T) \
+    OPSA(F,T,)
+// ALL(): atom_* for int, uint, long and ulong; atomic_* for int and uint only
+#define ALL() \
+    OPS(GEN1,int) \
+    OPS(GEN2,int) \
+    OPS(GEN1,uint) \
+    OPS(GEN2,uint) \
+    OPS(GEN1,long) \
+    OPS(GEN1,ulong)
+// ALL() is expanded again further down after GEN1/GEN2/OPSA/OPS are redefined for other operation families
+ALL()
+
+// Handle inc and dec
+#undef GEN1
+#undef GEN2
+#undef OPSA
+// OPSA now lists the two unary operations; OPS and ALL from above are reused unchanged
+#define OPSA(F,T,A) \
+    F(T,A,inc) \
+    F(T,A,dec)
+
+// The generated functions take no value argument: F_inc / F_dec are called with the constant (T)1
+#define GEN1(T,A,O) \
+ATTR T \
+atom_##O(volatile A T *p) \
+{ \
+    return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define GEN2(T,A,O) \
+ATTR T \
+atomic_##O(volatile A T *p) \
+{ \
+    return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+ALL()
+
+// Handle xchg
+#undef GEN1
+#undef GEN2
+#undef OPSA
+#undef OPS
+// The generators now take only (T,A): there is a single operation, so no operation name is pasted
+#define GEN1(T,A) \
+ATTR T \
+atom_xchg(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define GEN2(T,A) \
+ATTR T \
+atomic_xchg(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+// OPS(F,T) calls the generator directly with an empty qualifier, so ALL() keeps the same type list
+#define OPS(F,T) \
+    F(T,)
+
+ALL()
+
+ATTR float
+atomic_xchg(volatile float *p, float v)
+{   // float exchange performed on the 32-bit pattern through the int atomic builtin
+    return as_float(__opencl_atomic_exchange((volatile atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE));
+}
+
+// Handle cmpxchg
+#undef GEN1
+#undef GEN2
+// The builtin's boolean result is ignored; e is returned, which the builtin overwrites with the stored value on failure
+#define GEN1(T,A) \
+ATTR T \
+atom_cmpxchg(volatile A T *p, T e, T d) \
+{ \
+    __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d,  OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+    return e; \
+}
+
+#define GEN2(T,A) \
+ATTR T \
+atomic_cmpxchg(volatile A T *p, T e, T d) \
+{ \
+    __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d,  OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+    return e; \
+}
+
+ALL()
+#undef GEN1
+#undef GEN2
+#undef ALL
+
+// 2.0 functions
+
+#define GENI(T) /* atomic_init overload for atomic_<T>: forwards to __opencl_atomic_init */ \
+ATTR void \
+atomic_init(volatile atomic_##T *p, T v) \
+{ \
+    __opencl_atomic_init(p, v); \
+}
+
+#define GENS(T) /* three store overloads for atomic_<T>; first one: seq_cst order, device scope */ \
+ATTR void \
+atomic_store(volatile atomic_##T *p, T v) \
+{ \
+    __opencl_atomic_store(p, v, memory_order_seq_cst, memory_scope_device); \
+} \
+ /* caller-supplied order, device scope */ \
+ATTR void \
+atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    __opencl_atomic_store(p, v, o, memory_scope_device); \
+} \
+ /* caller-supplied order and scope */ \
+ATTR void \
+atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    __opencl_atomic_store(p, v, o, s); \
+}
+
+#define GENL(T) /* three load overloads for atomic_<T>; first one: seq_cst order, device scope */ \
+ATTR T \
+atomic_load(volatile atomic_##T *p) \
+{ \
+    return __opencl_atomic_load(p, memory_order_seq_cst, memory_scope_device); \
+} \
+ /* caller-supplied order, device scope */ \
+ATTR T \
+atomic_load_explicit(volatile atomic_##T *p, memory_order o) \
+{ \
+    return __opencl_atomic_load(p, o, memory_scope_device); \
+} \
+ /* caller-supplied order and scope */ \
+ATTR T \
+atomic_load_explicit(volatile atomic_##T *p, memory_order o, memory_scope s) \
+{ \
+    return __opencl_atomic_load(p, o, s); \
+}
+
+#define GENX(T) /* three exchange overloads for atomic_<T>; PC_/AC_/RC_ route float and double through integer atomics */ \
+ATTR T \
+atomic_exchange(volatile atomic_##T *p, T v) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
+} \
+ /* caller-supplied order, device scope */ \
+ATTR T \
+atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, memory_scope_device)); \
+} \
+ /* caller-supplied order and scope */ \
+ATTR T \
+atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, s)); \
+}
+
+#define GENCX(T,K) /* compare-exchange overloads for atomic_<T>; K is pasted into the name (strong or weak) */ \
+ATTR bool \
+atomic_compare_exchange_##K(volatile atomic_##T *p, T *e, T d) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
+} \
+ /* os and of are forwarded as the builtin's two memory-order arguments; device scope */ \
+ATTR bool \
+atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, memory_scope_device); \
+} \
+ /* caller-supplied orders and scope */ \
+ATTR bool \
+atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of, memory_scope s) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, s); \
+}
+
+#define GENFO(T,O) /* three atomic_fetch_<O> overloads for atomic_<T>; first one: seq_cst order, device scope */ \
+ATTR T \
+atomic_fetch_##O(volatile atomic_##T *p, T v) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
+} \
+ /* caller-supplied order, device scope */ \
+ATTR T \
+atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, memory_scope_device)); \
+} \
+ /* caller-supplied order and scope */ \
+ATTR T \
+atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, s)); \
+}
+
+#define CX(T) /* strong and weak compare-exchange families for type T */ \
+    GENCX(T,strong) \
+    GENCX(T,weak)
+
+#define FO(T) /* every fetch-and-op family (GENFO) for type T */ \
+    GENFO(T,add) \
+    GENFO(T,sub) \
+    GENFO(T,or) \
+    GENFO(T,xor) \
+    GENFO(T,and) \
+    GENFO(T,min) \
+    GENFO(T,max)
+
+#define ALLI(F) /* apply F to the four integer types */ \
+    F(int) \
+    F(uint) \
+    F(long) \
+    F(ulong)
+// ALL(F): the integer types plus float and double (ALL was #undef'd above, so this is a fresh definition)
+#define ALL(F) \
+    ALLI(F) \
+    F(float) \
+    F(double)
+// init/load/store/exchange/compare-exchange cover all six types; fetch-ops (FO) cover the integer types only
+ALL(GENI)
+ALL(GENL)
+ALL(GENS)
+ALL(GENX)
+ALL(CX)
+ALLI(FO)
+
+// These are needed for uintptr_t: atomic_ulong overloads that accept a signed (long) operand
+ATTR ulong
+atomic_fetch_add(volatile atomic_ulong *p, long v)
+{   // seq_cst order, device scope; v is converted to ulong before the add
+    return __opencl_atomic_fetch_add(p, (ulong)v, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o)
+{   // caller-supplied order, device scope
+    return __opencl_atomic_fetch_add(p, (ulong)v, o, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s)
+{   // caller-supplied order and scope
+    return __opencl_atomic_fetch_add(p, (ulong)v, o, s);
+}
+
+ATTR ulong
+atomic_fetch_sub(volatile atomic_ulong *p, long v)
+{   // atomic_ulong object with a signed operand: v is converted to ulong; seq_cst order, device scope
+    return __opencl_atomic_fetch_sub(p, (ulong)v, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o)
+{   // caller-supplied order, device scope
+    return __opencl_atomic_fetch_sub(p, (ulong)v, o, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s)
+{   // caller-supplied order and scope
+    return __opencl_atomic_fetch_sub(p, (ulong)v, o, s);
+}
+
+// flag functions: the flag is accessed through an atomic_int pointer (assumes atomic_flag shares that layout - TODO confirm)
+ATTR bool
+atomic_flag_test_and_set(volatile atomic_flag *p)
+{   // stores 1 and returns the previous value converted to bool; seq_cst order, device scope
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o)
+{   // caller-supplied order, device scope
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, memory_scope_device);
+}
+
+ATTR bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o, memory_scope s)
+{   // caller-supplied order and scope
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, s);
+}
+
+ATTR void
+atomic_flag_clear(volatile atomic_flag *p)
+{   // stores 0 through an atomic_int view of the flag; seq_cst order, device scope
+    __opencl_atomic_store((volatile atomic_int *)p, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR void
+atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o)
+{   // caller-supplied order, device scope
+    __opencl_atomic_store((volatile atomic_int *)p, 0, o, memory_scope_device);
+}
+
+ATTR void
+atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o, memory_scope s)
+{   // caller-supplied order and scope
+    __opencl_atomic_store((volatile atomic_int *)p, 0, o, s);
+}
+
diff --git a/opencl/src/misc/printf.cl b/opencl/src/misc/printf.cl
index 815f96ea..f80ebf1c 100644
--- a/opencl/src/misc/printf.cl
+++ b/opencl/src/misc/printf.cl
@@ -1,3 +1,9 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
 
 #include "irif.h"
 
diff --git a/opencl/src/pipes/commitp.cl b/opencl/src/pipes/commitp.cl
new file mode 100644
index 00000000..51528cb8
--- /dev/null
+++ b/opencl/src/pipes/commitp.cl
@@ -0,0 +1,93 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define COMMIT_READ_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
+#define COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
+// Work group functions
+
+#define WORK_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__work_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__work_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
+#define WORK_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__work_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__work_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
+// sub group functions
+
+#define SUB_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__sub_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__sub_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
+#define SUB_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR void \
+__sub_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__sub_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{   // empty body: no work is performed and none of the arguments is used
+}
+
diff --git a/opencl/src/pipes/getp.cl b/opencl/src/pipes/getp.cl
new file mode 100644
index 00000000..d5531996
--- /dev/null
+++ b/opencl/src/pipes/getp.cl
@@ -0,0 +1,45 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline, pure))
+
+#define GET_PIPE_NUM_PACKETS_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR uint \
+__get_pipe_num_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    return (uint)(wi - ri); \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_NUM_PACKETS_SIZE)
+
+ATTR uint
+__get_pipe_num_packets(__global struct pipeimp* p, uint size, uint align)
+{   // write index minus read index, truncated to uint; size and align are not used
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); // separate relaxed load: the two indices are not read as one snapshot
+    return (uint)(wi - ri);
+}
+
+#define GET_PIPE_MAX_PACKETS_SIZE(SIZE, STYPE) /* template for per-size variants; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR uint \
+__get_pipe_max_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    return (uint)p->end_idx; \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_MAX_PACKETS_SIZE)
+
+ATTR uint
+__get_pipe_max_packets(__global struct pipeimp* p, uint size, uint align)
+{   // reports the pipe's end_idx field truncated to uint; size and align are not used
+    return (uint)p->end_idx;
+}
+
diff --git a/opencl/src/pipes/memcpyia.cl b/opencl/src/pipes/memcpyia.cl
new file mode 100644
index 00000000..f536d044
--- /dev/null
+++ b/opencl/src/pipes/memcpyia.cl
@@ -0,0 +1,55 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+__attribute__((always_inline, weak)) void
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{   // Copy size bytes from s to d in units selected by align; the source stays const-qualified throughout.
+    if (align == 2) {
+        short *d2 = (short *)d;
+        const short *s2 = (const short *)s;
+        const short *e2 = s2 + size/2;
+        // size/N truncates: the wide branches copy whole N-byte elements only
+        while (s2 < e2)
+            *d2++ = *s2++;
+    } else if (align == 4) {
+        int *d4 = (int *)d;
+        const int *s4 = (const int *)s;
+        const int *e4 = s4 + size/4;
+
+        while (s4 < e4)
+            *d4++ = *s4++;
+    } else if (align == 8) {
+        long *d8 = (long *)d;
+        const long *s8 = (const long *)s;
+        const long *e8 = s8 + size/8;
+
+        while (s8 < e8)
+            *d8++ = *s8++;
+    } else if (align == 16) {
+        long2 *d16 = (long2 *)d;
+        const long2 *s16 = (const long2 *)s;
+        const long2 *e16 = s16 + size/16;
+
+        while (s16 < e16)
+            *d16++ = *s16++;
+    } else if (align == 32 || align == 64 || align == 128) {
+        long4 *d32 = (long4 *)d;
+        const long4 *s32 = (const long4 *)s;
+        const long4 *e32 = s32 + size/32;
+        // 32-byte elements are used for each of the three largest alignments
+        while (s32 < e32)
+            *d32++ = *s32++;
+    } else {
+        char *d1 = (char *)d;
+        const char *s1 = (const char *)s;
+        const char *e1 = s1 + size;
+        // any other alignment value: plain byte copy
+        while (s1 < e1)
+            *d1++ = *s1++;
+    }
+}
+
diff --git a/opencl/src/pipes/pipes.h b/opencl/src/pipes/pipes.h
new file mode 100644
index 00000000..16ab22fd
--- /dev/null
+++ b/opencl/src/pipes/pipes.h
@@ -0,0 +1,109 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "irif.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+ 
+extern size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n);
+
+#define DO_PIPE_SIZE(F) /* expand F(size, type) once per supported packet size: 1, 2, 4, ..., 128 */ \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+
+struct pipeimp {
+    atomic_size_t read_idx;   // advanced by the read-side reservation helpers
+    atomic_size_t write_idx;  // advanced by the write-side reservation helpers
+    size_t end_idx;           // modulus for slot numbers (see wrap) and the value reported as maximum packet count
+    uchar pad[128 - 3*sizeof(size_t)];  // fills the rest of 128 bytes, counting the three fields above as sizeof(size_t) each
+    uchar packets[1];         // start of packet storage; accessors index well past element 0
+};
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+
+static __attribute__((always_inline)) size_t
+reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n)
+{
+    // Claim n consecutive indices starting at *pi without going past lim.
+    // Returns the first claimed index, or ~(size_t)0 when the claim does not fit.
+    size_t cur = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+    while (cur + n <= lim) {
+        // A failed compare-exchange reloads cur with the stored value, so the
+        // fit test in the loop condition is redone against fresh data.
+        if (__opencl_atomic_compare_exchange_strong(pi, &cur, cur + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+            return cur;
+    }
+    return ~(size_t)0;
+}
+
+static inline size_t
+wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim)
+{   // Reserve one index per active lane; one lane tries to claim the whole batch, with a per-lane fallback.
+    size_t n = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) + // n: number of bits set in the exec mask
+                        __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
+    uint l = __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(), // l: mbcnt over the exec mask, used below as this lane's offset
+               __llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u));
+    size_t i = 0;
+    // Only the lane with l == 0 touches memory: it tries to claim n indices below lim
+    if (l == 0) {
+        i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+        for (;;) {
+            if (i + n > lim) {
+                i = ~(size_t)0;
+                break;
+            }
+
+            if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+                break;
+        }
+    }
+
+    __llvm_amdgcn_wave_barrier();
+
+    // Broadcast the result; the ctz tells us which lane has active lane id 0
+    uint k = (uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec());
+    i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) | // readlane moves 32 bits at a time, so the halves are sent separately
+        (size_t)__llvm_amdgcn_readlane((uint)i, k);
+
+    __llvm_amdgcn_wave_barrier();
+    // Success: each lane takes base + l. Failure (all ones): every lane retries for a single index.
+    if (i != ~(size_t)0)
+        i += l;
+    else {
+        // The entire group didn't fit, have to handle one by one
+        i = reserve(pi, lim, (size_t)1);
+    }
+
+    return i;
+}
+
+static inline size_t
+wrap(size_t i, size_t n)
+{
+    // Reduce index i modulo n; n is assumed to be below 2^32.
+    if (as_uint2(i).y != 0U) {
+        // Upper 32 bits are set: a full-width remainder is required.
+        return i % n;
+    }
+
+    // Upper 32 bits are clear: compare and reduce in 32 bits.
+    uint lo = (uint)i;
+    uint lim = (uint)n;
+    if (lo < lim)
+        return i;
+    return (size_t)(lo % lim);
+}
+
diff --git a/opencl/src/pipes/readp.cl b/opencl/src/pipes/readp.cl
new file mode 100644
index 00000000..1808ad3a
--- /dev/null
+++ b/opencl/src/pipes/readp.cl
@@ -0,0 +1,75 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define READ_PIPE_SIZE(SIZE, STYPE) /* typed single-packet read; returns 0 on success, -1 when no index could be reserved */ \
+ATTR int \
+__read_pipe_2_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ri = wave_reserve_1(&p->read_idx, wi); \
+    if (ri == ~(size_t)0) \
+        return -1; \
+ /* ri is this lane's reserved index; reduce it modulo end_idx and load the packet as one STYPE element */ \
+    size_t pi = wrap(ri, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ /* the lane whose index is wi-1 stores 0 to both indices */ \
+    if (ri == wi-1) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    }\
+\
+    return 0; \
+}
+// Instantiate __read_pipe_2_<SIZE> for every entry of DO_PIPE_SIZE
+DO_PIPE_SIZE(READ_PIPE_SIZE)
+
+ATTR int
+__read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align)
+{   // Generic-size single-packet read; returns 0 on success, -1 when no index could be reserved.
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t ri = wave_reserve_1(&p->read_idx, wi);
+    if (ri == ~(size_t)0)
+        return -1;
+    // ri is this lane's reserved index; the packet lives at byte offset pi*size inside packets[]
+    size_t pi = wrap(ri, p->end_idx);
+    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
+    // the lane whose index is wi-1 stores 0 to both indices
+    if (ri == wi-1) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return 0;
+}
+
+#define READ_PIPE_RESERVED_SIZE(SIZE, STYPE) /* typed read of packet i within reservation rid; always returns 0 */ \
+ATTR int \
+__read_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr)  \
+{ \
+    rid += i; \
+    size_t pi = wrap(rid, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ /* rid is not checked against the invalid id ~(size_t)0 here */ \
+    return 0; \
+}
+// Instantiate __read_pipe_4_<SIZE> for every entry of DO_PIPE_SIZE
+DO_PIPE_SIZE(READ_PIPE_RESERVED_SIZE)
+
+ATTR int
+__read_pipe_4(__global struct pipeimp* p, size_t rid, uint i, void *ptr, uint size, uint align)
+{
+    // Packet i of reservation rid, reduced to a slot number below end_idx.
+    size_t slot = wrap(rid + i, p->end_idx);
+    // The slot starts at byte offset slot*size; copy one packet out to ptr.
+    __memcpy_internal_aligned(ptr, p->packets + slot*size, size, align);
+    return 0;
+}
+
diff --git a/opencl/src/pipes/reservep.cl b/opencl/src/pipes/reservep.cl
new file mode 100644
index 00000000..18e073be
--- /dev/null
+++ b/opencl/src/pipes/reservep.cl
@@ -0,0 +1,219 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define RESERVE_READ_PIPE_SIZE(SIZE, STYPE) /* per-size template: reserve num_packets for reading; yields ~(size_t)0 on failure */ \
+ATTR size_t \
+__reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); \
+ /* skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets */ \
+    if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    } \
+ \
+    return rid; \
+}
+
+// DO_PIPE_SIZE(RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t
+__reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{   // Reserve num_packets packets for reading; returns the reservation id, or ~(size_t)0 on failure.
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets);
+    // Skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets.
+    if (rid != ~(size_t)0 && rid + num_packets == wi) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return rid;
+}
+
+#define RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) /* per-size template; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR size_t \
+__reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets); \
+}
+
+// DO_PIPE_SIZE(RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{   // Reserve num_packets packets for writing; size and align are not used.
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets); // limit: write_idx may not pass read_idx + end_idx
+}
+
+// Work group functions
+
+#define WORK_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) /* per-size template: one read reservation shared by the whole work-group */ \
+ATTR size_t \
+__work_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ /* work-item 0 reserves and publishes the id through *t */ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ /* skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets */ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+ \
+        *t = rid; \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t
+__work_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+    // Work-item 0 makes one read reservation for the whole group and publishes the id through *t.
+    if ((int)get_local_linear_id() == 0) {
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        size_t rid = reserve(&p->read_idx, wi, num_packets);
+        // Skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets.
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+
+        *t = rid;
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); // every work-item reads *t only after work-item 0 has written it
+
+    return *t;
+}
+
+#define WORK_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) /* per-size template; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR size_t \
+__work_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ /* work-item 0 reserves and publishes the id through *t */ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        *t = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__work_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+    // Work-item 0 makes one write reservation for the whole group and publishes the id through *t.
+    if ((int)get_local_linear_id() == 0) {
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        *t = reserve(&p->write_idx, ri + ei, num_packets); // limit: write_idx may not pass read_idx + end_idx
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); // every work-item reads *t only after work-item 0 has written it
+
+    return *t;
+}
+
+// sub group functions
+
+#define SUB_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) /* per-size template: one read reservation shared by the whole sub-group */ \
+ATTR size_t \
+__sub_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ /* sub-group lane 0 reserves; the other lanes keep the invalid id until the broadcast */ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        rid = reserve(&p->read_idx, wi, num_packets); \
+ /* skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets */ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t
+__sub_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t rid = ~(size_t)0;
+    // Sub-group lane 0 makes one read reservation; the other lanes keep the invalid id until the broadcast.
+    if (get_sub_group_local_id() == 0) {
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        rid = reserve(&p->read_idx, wi, num_packets);
+        // Skip the reset for a failed id: ~(size_t)0 + num_packets wraps and can equal wi, which would drop unread packets.
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
+#define SUB_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) /* per-size template; its DO_PIPE_SIZE instantiation below is commented out */ \
+ATTR size_t \
+__sub_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ /* sub-group lane 0 reserves; the other lanes keep the invalid id until the broadcast */ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        rid = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__sub_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    // Lanes other than sub-group lane 0 hold the invalid id until the broadcast below.
+    size_t id = ~(size_t)0;
+    if (get_sub_group_local_id() == 0) {
+        size_t head = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t cap = p->end_idx;
+        // write_idx may not pass read_idx + end_idx
+        id = reserve(&p->write_idx, head + cap, num_packets);
+    }
+    return sub_group_broadcast(id, 0);
+}
+
diff --git a/opencl/src/pipes/validp.cl b/opencl/src/pipes/validp.cl
new file mode 100644
index 00000000..5397dfce
--- /dev/null
+++ b/opencl/src/pipes/validp.cl
@@ -0,0 +1,14 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+
+__attribute__((overloadable, always_inline)) bool
+is_valid_reserve_id(reserve_id_t rid)
+{   // valid unless the id's bit pattern is all ones, the value the reserve helpers return on failure
+    return as_ulong(rid) != ~(size_t)0;
+}
+
diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl
new file mode 100644
index 00000000..2b4f2fa4
--- /dev/null
+++ b/opencl/src/pipes/wresvnp.cl
@@ -0,0 +1,148 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+// Reserve n slots for every active lane using one reserve() call per wavefront.
+// An inclusive add scan of n over the active lanes gives each lane its offset; the highest
+// active lane reserves the total and the base index is broadcast to the other lanes.
+// If that combined reservation yields all-ones, each lane retries with its own reserve().
+size_t
+__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
+{
+    uint alc = (uint)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
+                      __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi()));
+    uint l = __llvm_amdgcn_mbcnt_hi(-1, __llvm_amdgcn_mbcnt_lo(-1, 0u));
+    size_t rid;
+    // OpenCL C takes shift counts modulo 64, so (1UL << 64) - 1UL would be 0: handle alc == 64 explicitly
+    if (__llvm_amdgcn_read_exec() == (alc < 64u ? (1UL << alc) - 1UL : ~0UL)) {
+        // Handle fully active subgroup (the alc lowest lanes are exactly the active ones)
+        uint sum = sub_group_scan_inclusive_add((uint)n);
+        size_t idx = 0;
+        if (l == alc-1) {
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        idx = sub_group_broadcast(idx, alc-1);
+        rid = idx + (size_t)(sum - (uint)n);
+        rid = idx != ~(size_t)0 ? rid : idx;
+    } else {
+        // Inclusive add scan with not all lanes active
+        const ulong nomsb = 0x7fffffffffffffffUL; // clears bit 63, which 0x1UL << slid yields when slid is -1
+
+        // Step 1: add n from the nearest lower active lane (slid < 0 means there is none)
+        ulong smask = __llvm_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
+        int slid = 63 - (int)clz(smask);
+        uint t = __llvm_amdgcn_ds_bpermute(slid << 2, n);
+        uint sum = n + (slid < 0 ? 0 : t);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 2: add the partial sum held 2 active lanes below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 3: add the partial sum held 4 active lanes below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 4: add the partial sum held 8 active lanes below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 5: add the partial sum held 16 active lanes below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 6: add the partial sum held 32 active lanes below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        __llvm_amdgcn_wave_barrier();
+
+        size_t idx = 0;
+        if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) {
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        __llvm_amdgcn_wave_barrier();
+
+        // Broadcast the 64-bit base index from the highest active lane, 32 bits at a time
+        uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec());
+        idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
+              (size_t)__llvm_amdgcn_readlane((uint)idx, k);
+        __llvm_amdgcn_wave_barrier();
+
+        rid = idx + (size_t)(sum - (uint)n);
+        rid = idx != ~(size_t)0 ? rid : idx;
+    }
+
+    if (rid == ~(size_t)0) {
+        // Try again one at a time
+        rid = reserve(pidx, lim, n);
+    }
+
+    return rid;
+}
+
diff --git a/opencl/src/pipes/writep.cl b/opencl/src/pipes/writep.cl
new file mode 100644
index 00000000..e07026cd
--- /dev/null
+++ b/opencl/src/pipes/writep.cl
@@ -0,0 +1,65 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR int \
+__write_pipe_2_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+    const size_t rd = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    const size_t end = p->end_idx; \
+    const size_t slot = wave_reserve_1(&p->write_idx, rd + end); \
+    if (slot != ~(size_t)0) { \
+        const size_t pos = wrap(slot, end); \
+        ((__global STYPE *)p->packets)[pos] = *ptr; /* store one STYPE packet */ \
+        return 0; \
+    } \
+    return -1; /* wave_reserve_1 returned all-ones: nothing written */ \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_SIZE)
+
+ATTR int
+__write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint align)
+{
+    // Reserve one packet position, then copy size bytes from ptr into it; -1 if none was reserved
+    const size_t rd = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    const size_t end = p->end_idx;
+    const size_t slot = wave_reserve_1(&p->write_idx, rd + end);
+    if (slot != ~(size_t)0) {
+        const size_t pos = wrap(slot, end);
+        __memcpy_internal_aligned(p->packets + pos*size, ptr, size, align);
+        return 0;
+    }
+    return -1;
+}
+
+#define WRITE_PIPE_RESERVED_SIZE(SIZE, STYPE) \
+ATTR int \
+__write_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr) \
+{ \
+    const size_t at = rid + i; /* packet i of the reservation starting at rid */ \
+    const size_t pos = wrap(at, p->end_idx); \
+    ((__global STYPE *)p->packets)[pos] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_RESERVED_SIZE)
+
+ATTR int
+__write_pipe_4(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, uint size, uint align)
+{
+    // Copy size bytes from ptr into packet i of the reservation starting at rid; always returns 0
+    const size_t at = rid + i;
+    const size_t pos = wrap(at, p->end_idx);
+    __memcpy_internal_aligned(p->packets + pos*size, ptr, size, align);
+    return 0;
+}
+
diff --git a/utils/add_amdgiz.sed b/utils/add_amdgiz.sed
index 995f6af9..ee495d3c 100755
--- a/utils/add_amdgiz.sed
+++ b/utils/add_amdgiz.sed
@@ -6,7 +6,8 @@
 #######################
 
 # amdgcn--amdhsa-amd -> amdgcn--amdhsa-amdgiz
-/target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/
+# This is now done directly by change-addr-space.sh
+# /target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/
 
 #####################
 # change data layout
diff --git a/utils/change-addr-space.sh b/utils/change-addr-space.sh
index 02d82af5..cfdea769 100755
--- a/utils/change-addr-space.sh
+++ b/utils/change-addr-space.sh
@@ -6,8 +6,14 @@
 # utils/change-addr-space.sh src x : apply utils/remove_amdgiz.sed
 #                                adopt generic address space is address space 4
 
-if [ $# -lt 2 ]; then
-  find . -name "*.ll" | xargs sed -i -f "$1/add_amdgiz.sed"
+tmpfile=$(mktemp "${TMPDIR:-/tmp}/cas.XXXXXX") || exit 1
+if [ $# -lt 3 ]; then
+  echo "/target triple/s/\\\"amdgcn--amdhsa\\\"/\\\"${1}\\\"/" >"$tmpfile"
+  cat "$2/add_amdgiz.sed" >>"$tmpfile"
 else
-  find . -name "*.ll" | xargs sed -i -f "$1/remove_amdgiz.sed"
+  echo "/target triple/s/\\\"${1}\\\"/\\\"amdgcn--amdhsa\\\"/" >"$tmpfile"
+  cat "$2/remove_amdgiz.sed" >>"$tmpfile"
 fi
+# Apply the generated script in place to every .ll file; -exec copes with odd names and no matches
+find . -name "*.ll" -exec sed -i -f "$tmpfile" {} +
+rm -f "$tmpfile"
diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp
index b1145363..ce3596fe 100644
--- a/utils/prepare-builtins/prepare-builtins.cpp
+++ b/utils/prepare-builtins/prepare-builtins.cpp
@@ -114,8 +114,8 @@ int main(int argc, char **argv) {
   }
 
   std::error_code EC;
-  std::unique_ptr<tool_output_file> Out
-  (new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  std::unique_ptr<ToolOutputFile> Out
+  (new ToolOutputFile(OutputFilename, EC, sys::fs::F_None));
   if (EC) {
     errs() << EC.message() << '\n';
     exit(1);
diff --git a/utils/remove_amdgiz.sed b/utils/remove_amdgiz.sed
index d10630c6..7c76dd78 100755
--- a/utils/remove_amdgiz.sed
+++ b/utils/remove_amdgiz.sed
@@ -6,7 +6,8 @@
 #######################
 
 # amdgcn--amdhsa-amdgiz -> amdgcn--amdhsa
-/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/
+# This is now done directly by change-addr-space.sh
+#/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/
 
 #####################
 # change data layout