From 570aabe080288e2bde0e2d5b6570fd7144346661 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <Ashwin.Aji@amd.com>
Date: Fri, 12 May 2017 12:12:45 -0500
Subject: [PATCH 01/25] installing other required headers for device enqueue

Change-Id: I4cb4f1e4b780bbfc3465a90f846098068157073d
---
 ockl/CMakeLists.txt | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/ockl/CMakeLists.txt b/ockl/CMakeLists.txt
index cb9bb25a..687d2684 100644
--- a/ockl/CMakeLists.txt
+++ b/ockl/CMakeLists.txt
@@ -15,4 +15,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc)
 
 opencl_bc_lib(ockl ${sources})
 
-install(FILES inc/ockl.h DESTINATION include COMPONENT OpenCL)
+install(FILES 
+        inc/amd_hsa_common.h
+        inc/amd_hsa_elf.h
+        inc/amd_hsa_kernel_code.h
+        inc/amd_hsa_queue.h
+        inc/amd_hsa_signal.h
+        inc/device_amd_hsa.h
+        inc/hsa.h
+        inc/ockl_hsa.h
+        inc/ockl.h 
+        DESTINATION include COMPONENT OpenCL)

From e77df287e9def5d6a8ad69276ab3f70c00e36514 Mon Sep 17 00:00:00 2001
From: Ashwin Aji <Ashwin.Aji@amd.com>
Date: Wed, 17 May 2017 19:15:46 -0500
Subject: [PATCH 02/25] removed couple of trailing spaces

Change-Id: Id4b1064d90959b1010c9a440d336437e0ae824db
---
 ockl/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ockl/CMakeLists.txt b/ockl/CMakeLists.txt
index 687d2684..7fa87ef4 100644
--- a/ockl/CMakeLists.txt
+++ b/ockl/CMakeLists.txt
@@ -15,7 +15,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc)
 
 opencl_bc_lib(ockl ${sources})
 
-install(FILES 
+install(FILES
         inc/amd_hsa_common.h
         inc/amd_hsa_elf.h
         inc/amd_hsa_kernel_code.h
@@ -24,5 +24,5 @@ install(FILES
         inc/device_amd_hsa.h
         inc/hsa.h
         inc/ockl_hsa.h
-        inc/ockl.h 
+        inc/ockl.h
         DESTINATION include COMPONENT OpenCL)

From 07b961ea7cb614997d2ef3b36c588e7a65400ee0 Mon Sep 17 00:00:00 2001
From: Guansong Zhang <guansong.zhang@amd.com>
Date: Tue, 11 Jul 2017 15:06:04 -0400
Subject: [PATCH 03/25] Add cuda wrapper functions from
 remove-promote-change-addr-space branch

Change-Id: I6340cb4605ba37e84aeada9d8fe407be118cf126
---
 CMakeLists.txt              |   1 +
 cuda2gcn/CMakeLists.txt     |  17 ++
 cuda2gcn/src/bitsbytes.cl   |  50 +++++
 cuda2gcn/src/convert.cl     | 150 +++++++++++++++
 cuda2gcn/src/float.cl       |  33 ++++
 cuda2gcn/src/generic.cl     |  54 ++++++
 cuda2gcn/src/half.cl        |  23 +++
 cuda2gcn/src/integer.cl     |  29 +++
 cuda2gcn/src/math.cl        | 354 ++++++++++++++++++++++++++++++++++++
 cuda2gcn/src/precision.cl   |  56 ++++++
 cuda2gcn/src/reinterpret.cl |  63 +++++++
 cuda2gcn/src/rounding.cl    |  23 +++
 12 files changed, 853 insertions(+)
 create mode 100644 cuda2gcn/CMakeLists.txt
 create mode 100644 cuda2gcn/src/bitsbytes.cl
 create mode 100644 cuda2gcn/src/convert.cl
 create mode 100644 cuda2gcn/src/float.cl
 create mode 100644 cuda2gcn/src/generic.cl
 create mode 100644 cuda2gcn/src/half.cl
 create mode 100644 cuda2gcn/src/integer.cl
 create mode 100644 cuda2gcn/src/math.cl
 create mode 100644 cuda2gcn/src/precision.cl
 create mode 100644 cuda2gcn/src/reinterpret.cl
 create mode 100644 cuda2gcn/src/rounding.cl

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f5b85f7d..f46fba2e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -52,6 +52,7 @@ add_subdirectory(oclc)
 add_subdirectory(ocml)
 add_subdirectory(ockl)
 add_subdirectory(opencl)
+add_subdirectory(cuda2gcn)
 
 if(BUILD_HC_LIB)
   add_subdirectory(hc)
diff --git a/cuda2gcn/CMakeLists.txt b/cuda2gcn/CMakeLists.txt
new file mode 100644
index 00000000..c2ed32fe
--- /dev/null
+++ b/cuda2gcn/CMakeLists.txt
@@ -0,0 +1,17 @@
+##===--------------------------------------------------------------------------
+##                   ROCm Device Libraries
+##
+## This file is distributed under the University of Illinois Open Source
+## License. See LICENSE.TXT for details.
+##===--------------------------------------------------------------------------
+
+# Gather every OpenCL wrapper source in src/ (globbed at configure time).
+file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl)
+set(sources ${cl_sources})
+
+# The wrappers include ocml.h and ockl.h; irif/inc is added as well.
+include_directories(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc
+  ${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc
+  ${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc)
+opencl_bc_lib(cuda2gcn ${sources})
diff --git a/cuda2gcn/src/bitsbytes.cl b/cuda2gcn/src/bitsbytes.cl
new file mode 100644
index 00000000..03cb20cb
--- /dev/null
+++ b/cuda2gcn/src/bitsbytes.cl
@@ -0,0 +1,50 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+extern __attribute__((const)) int __llvm_bitreverse_i32(int);
+extern __attribute__((const)) long __llvm_bitreverse_i64(long);
+extern __attribute__((const)) int __llvm_ctpop_i32(int);
+extern __attribute__((const)) long __llvm_ctpop_i64(long);
+
+#include "ockl.h"
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_brev
+ATTR int __nv_brev(int x) { return __llvm_bitreverse_i32(x); }
+
+//-------- T __nv_brevll
+ATTR long __nv_brevll(long x) { return __llvm_bitreverse_i64(x); }
+
+//-------- T __nv_clz
+ATTR int __nv_clz(int x)
+{
+    return (int)__ockl_clz_u32((uint)x);
+}
+
+//-------- T __nv_clzll
+ATTR int __nv_clzll(long x)
+{
+    // Count on the upper word; only a zero upper word defers to the lower one.
+    const uint upper = (uint)(x >> 32);
+    if (upper != 0)
+        return (int)__ockl_clz_u32(upper);
+    return (int)(__ockl_clz_u32((uint)x) + 32u);
+}
+
+//-------- T __nv_ffs
+ATTR int __nv_ffs(int x) { return (32 - __nv_clz(x&(-x))); }
+
+//-------- T __nv_ffsll
+ATTR int __nv_ffsll(long x) { return (int)(64 - __nv_clzll(x&(-x))); }
+
+//-------- T __nv_popc
+ATTR int __nv_popc(int x) { return __llvm_ctpop_i32(x); }
+
+//-------- T __nv_popcll
+ATTR int __nv_popcll(long x) { return (int)__llvm_ctpop_i64(x); }
+
diff --git a/cuda2gcn/src/convert.cl b/cuda2gcn/src/convert.cl
new file mode 100644
index 00000000..43113915
--- /dev/null
+++ b/cuda2gcn/src/convert.cl
@@ -0,0 +1,150 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+
+#define CONVERTM(A,B,m,n) ATTR B __nv_##A##2##B##_##m(A x) \
+    { return convert_##B##_##n(x); }
+
+#define CONVERT(A,B) \
+    CONVERTM(A, B, rd, rtn) \
+    CONVERTM(A, B, rn, rte) \
+    CONVERTM(A, B, ru, rtp) \
+    CONVERTM(A, B, rz, rtz)
+
+//-------- T __nv_double2float_rd
+//-------- T __nv_double2float_rn
+//-------- T __nv_double2float_ru
+//-------- T __nv_double2float_rz
+CONVERT(double, float)
+
+//-------- T __nv_double2int_rd
+//-------- T __nv_double2int_rn
+//-------- T __nv_double2int_ru
+//-------- T __nv_double2int_rz
+CONVERT(double, int)
+
+//-------- T __nv_float2int_rd
+//-------- T __nv_float2int_rn
+//-------- T __nv_float2int_ru
+//-------- T __nv_float2int_rz
+CONVERT(float, int)
+
+//-------- T __nv_int2float_rd
+//-------- T __nv_int2float_rn
+//-------- T __nv_int2float_ru
+//-------- T __nv_int2float_rz
+CONVERT(int, float)
+
+//-------- T __nv_double2uint_rd
+//-------- T __nv_double2uint_rn
+//-------- T __nv_double2uint_ru
+//-------- T __nv_double2uint_rz
+CONVERT(double, uint)
+
+//-------- T __nv_float2uint_rd
+//-------- T __nv_float2uint_rn
+//-------- T __nv_float2uint_ru
+//-------- T __nv_float2uint_rz
+CONVERT(float, uint)
+
+//-------- T __nv_uint2double_rd
+//-------- T __nv_uint2double_rn
+//-------- T __nv_uint2double_ru
+//-------- T __nv_uint2double_rz
+CONVERT(uint, double)
+
+//-------- T __nv_uint2float_rd
+//-------- T __nv_uint2float_rn
+//-------- T __nv_uint2float_ru
+//-------- T __nv_uint2float_rz
+CONVERT(uint, float)
+
+#define CONVERT2LLM(A,B,m,n) ATTR long __nv_##A##2ll_##m(A x) \
+    { return convert_long_##n(x); }
+
+#define CONVERT2LL(A) \
+    CONVERT2LLM(A, long, rd, rtn) \
+    CONVERT2LLM(A, long, rn, rte) \
+    CONVERT2LLM(A, long, ru, rtp) \
+    CONVERT2LLM(A, long, rz, rtz)
+
+//-------- T __nv_double2ll_rd
+//-------- T __nv_double2ll_rn
+//-------- T __nv_double2ll_ru
+//-------- T __nv_double2ll_rz
+CONVERT2LL(double)
+
+//-------- T __nv_float2ll_rd
+//-------- T __nv_float2ll_rn
+//-------- T __nv_float2ll_ru
+//-------- T __nv_float2ll_rz
+CONVERT2LL(float)
+
+#define CONVERT2ULLM(A,B,m,n) ATTR ulong __nv_##A##2ull_##m(A x) \
+    { return convert_ulong_##n(x); }
+
+#define CONVERT2ULL(A) \
+    CONVERT2ULLM(A, ulong, rd, rtn) \
+    CONVERT2ULLM(A, ulong, rn, rte) \
+    CONVERT2ULLM(A, ulong, ru, rtp) \
+    CONVERT2ULLM(A, ulong, rz, rtz)
+
+//-------- T __nv_double2ull_rd
+//-------- T __nv_double2ull_rn
+//-------- T __nv_double2ull_ru
+//-------- T __nv_double2ull_rz
+CONVERT2ULL(double)
+
+//-------- T __nv_float2ull_rd
+//-------- T __nv_float2ull_rn
+//-------- T __nv_float2ull_ru
+//-------- T __nv_float2ull_rz
+CONVERT2ULL(float)
+
+#define CONVERT4LLM(A,B,m,n) ATTR B __nv_ll2##B##_##m(long x) \
+    { return convert_##B##_##n(x); }
+
+#define CONVERT4LL(B) \
+    CONVERT4LLM(long, B, rd, rtn) \
+    CONVERT4LLM(long, B, rn, rte) \
+    CONVERT4LLM(long, B, ru, rtp) \
+    CONVERT4LLM(long, B, rz, rtz)
+
+//-------- T __nv_ll2double_rd
+//-------- T __nv_ll2double_rn
+//-------- T __nv_ll2double_ru
+//-------- T __nv_ll2double_rz
+CONVERT4LL(double)
+
+//-------- T __nv_ll2float_rd
+//-------- T __nv_ll2float_rn
+//-------- T __nv_ll2float_ru
+//-------- T __nv_ll2float_rz
+CONVERT4LL(float)
+
+#define CONVERT4ULLM(A,B,m,n) ATTR B __nv_ull2##B##_##m(ulong x) \
+    { return convert_##B##_##n(x); }
+
+#define CONVERT4ULL(B) \
+    CONVERT4ULLM(ulong, B, rd, rtn) \
+    CONVERT4ULLM(ulong, B, rn, rte) \
+    CONVERT4ULLM(ulong, B, ru, rtp) \
+    CONVERT4ULLM(ulong, B, rz, rtz)
+
+//-------- T __nv_ull2double_rd
+//-------- T __nv_ull2double_rn
+//-------- T __nv_ull2double_ru
+//-------- T __nv_ull2double_rz
+CONVERT4ULL(double)
+
+//-------- T __nv_ull2float_rd
+//-------- T __nv_ull2float_rn
+//-------- T __nv_ull2float_ru
+//-------- T __nv_ull2float_rz
+CONVERT4ULL(float)
+
diff --git a/cuda2gcn/src/float.cl b/cuda2gcn/src/float.cl
new file mode 100644
index 00000000..58c8a00b
--- /dev/null
+++ b/cuda2gcn/src/float.cl
@@ -0,0 +1,33 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_finitef
+ATTR int __nv_finitef(float x) { return isfinite(x); }
+
+//-------- T __nv_isfinited
+ATTR int __nv_isfinited(double x) { return isfinite(x); }
+
+//-------- T __nv_isinfd
+ATTR int __nv_isinfd(double x) { return isinf(x); }
+
+//-------- T __nv_isinff
+ATTR int __nv_isinff(float x) { return isinf(x); }
+
+//-------- T __nv_isnand
+ATTR int __nv_isnand(double x) { return isnan(x); }
+
+//-------- T __nv_isnanf
+ATTR int __nv_isnanf(float x) { return isnan(x); }
+
+//-------- T __nv_nan
+ATTR double __nv_nan(char *tagp) { return __builtin_nan(tagp); }
+
+//-------- T __nv_nanf (float variant: build the NaN with the float builtin, not the double one)
+ATTR float __nv_nanf(char *tagp) { return __builtin_nanf(tagp); }
+
diff --git a/cuda2gcn/src/generic.cl b/cuda2gcn/src/generic.cl
new file mode 100644
index 00000000..c2a232c9
--- /dev/null
+++ b/cuda2gcn/src/generic.cl
@@ -0,0 +1,54 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+//-------- T __nv_abs
+ATTR int __nv_abs(int x) { return abs(x); }
+
+//-------- T __nv_llabs
+ATTR long __nv_llabs(long x) { return abs(x); }
+
+//-------- T __nv_max
+ATTR int __nv_max(int a, int b) { return MAX(a,b); }
+
+//-------- T __nv_llmax
+ATTR long __nv_llmax(long a, long b) { return MAX(a,b); }
+
+//-------- T __nv_ullmax
+ATTR ulong __nv_ullmax(ulong a, ulong b) { return MAX(a,b); }
+
+//-------- T __nv_umax
+ATTR uint __nv_umax(uint a, uint b) { return MAX(a,b); }
+
+//-------- T __nv_min
+ATTR int __nv_min(int a, int b) { return MIN(a,b); }
+
+//-------- T __nv_llmin
+ATTR long __nv_llmin(long a, long b) { return MIN(a,b); }
+
+//-------- T __nv_ullmin
+ATTR ulong __nv_ullmin(ulong a, ulong b) { return MIN(a,b); }
+
+//-------- T __nv_umin
+ATTR uint __nv_umin(uint a, uint b) { return MIN(a,b); }
+
+//-------- T __nv_sad
+ATTR uint __nv_sad(int x, int y, uint z)
+{
+    return z + abs_diff(x, y); // |x - y| as uint; avoids signed overflow in x - y
+}
+
+//-------- T __nv_usad
+ATTR uint __nv_usad(uint x, uint y, uint z)
+{
+    return z + abs_diff(x, y); // abs() of an unsigned x - y is a no-op, so x < y used to wrap
+}
+
diff --git a/cuda2gcn/src/half.cl b/cuda2gcn/src/half.cl
new file mode 100644
index 00000000..02a26529
--- /dev/null
+++ b/cuda2gcn/src/half.cl
@@ -0,0 +1,23 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_float2half_rn
+half __nv_float2half_rn(float x)
+{
+    return (half)x;
+}
+
+//-------- T __nv_half2float
+float __nv_half2float(half x)
+{
+    return (float)x;
+}
+
diff --git a/cuda2gcn/src/integer.cl b/cuda2gcn/src/integer.cl
new file mode 100644
index 00000000..58b8bf5a
--- /dev/null
+++ b/cuda2gcn/src/integer.cl
@@ -0,0 +1,29 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ockl.h"
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_mul24
+ATTR int __nv_mul24(int x, int y) { return __ockl_mul24_i32(x, y); }
+
+//-------- T __nv_umul24
+ATTR uint __nv_umul24(uint x, uint y) { return __ockl_mul24_u32(x, y); }
+
+//-------- T __nv_mul64hi
+ATTR long __nv_mul64hi(long x, long y) { return __ockl_mul_hi_i64(x,y); }
+
+//-------- T __nv_mulhi
+ATTR int __nv_mulhi(int x, int y) { return __ockl_mul_hi_i32(x,y); }
+
+//-------- T __nv_umul64hi
+ATTR ulong __nv_umul64hi(ulong x, ulong y) { return __ockl_mul_hi_u64(x,y); }
+
+//-------- T __nv_umulhi
+ATTR uint __nv_umulhi(uint x, uint y) { return __ockl_mul_hi_u32(x,y); }
+
diff --git a/cuda2gcn/src/math.cl b/cuda2gcn/src/math.cl
new file mode 100644
index 00000000..2c4eaf55
--- /dev/null
+++ b/cuda2gcn/src/math.cl
@@ -0,0 +1,354 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define FUNC1D(root) \
+  ATTR double __nv_##root(double x) { return __ocml_##root##_f64(x); }
+#define FUNC1F(root) \
+  ATTR float __nv_##root##f(float x) { return __ocml_##root##_f32(x); }
+#define FUNC1(root) FUNC1D(root) FUNC1F(root)
+
+#define FUNC2D(root) \
+  ATTR double __nv_##root(double x, double y) { return __ocml_##root##_f64(x, y); }
+#define FUNC2F(root) \
+  ATTR float __nv_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); }
+#define FUNC2(root) FUNC2D(root) FUNC2F(root)
+
+#define FUNC3D(root) \
+  ATTR double __nv_##root(double x, double y, double z) { return __ocml_##root##_f64(x, y, z); }
+#define FUNC3F(root) \
+  ATTR float __nv_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); }
+#define FUNC3(root) FUNC3D(root) FUNC3F(root)
+
+//-------- T __nv_acos
+//-------- T __nv_acosf
+FUNC1(acos)
+
+//-------- T __nv_acosh
+//-------- T __nv_acoshf
+FUNC1(acosh)
+
+//-------- T __nv_asin
+//-------- T __nv_asinf
+FUNC1(asin)
+
+//-------- T __nv_asinh
+//-------- T __nv_asinhf
+FUNC1(asinh)
+
+//-------- T __nv_atan
+//-------- T __nv_atanf
+FUNC1(atan)
+
+//-------- T __nv_atan2
+//-------- T __nv_atan2f
+FUNC2(atan2)
+
+//-------- T __nv_atanh
+//-------- T __nv_atanhf
+FUNC1(atanh)
+
+//-------- T __nv_cbrt
+//-------- T __nv_cbrtf
+FUNC1(cbrt)
+
+//-------- T __nv_ceil
+//-------- T __nv_ceilf
+FUNC1(ceil)
+
+//-------- T __nv_copysign
+//-------- T __nv_copysignf
+FUNC2(copysign)
+
+//-------- T __nv_cos
+//-------- T __nv_cosf
+FUNC1(cos)
+
+//-------- T __nv_cosh
+//-------- T __nv_coshf
+FUNC1(cosh)
+
+//-------- T __nv_cospi
+//-------- T __nv_cospif
+FUNC1(cospi)
+
+//-------- T __nv_erf
+//-------- T __nv_erff
+FUNC1(erf)
+
+//-------- T __nv_erfc
+//-------- T __nv_erfcf
+FUNC1(erfc)
+
+//-------- T __nv_erfcinv
+//-------- T __nv_erfcinvf
+FUNC1(erfcinv)
+
+//-------- T __nv_erfcx
+//-------- T __nv_erfcxf
+FUNC1(erfcx)
+
+//-------- T __nv_erfinv
+//-------- T __nv_erfinvf
+FUNC1(erfinv)
+
+//-------- T __nv_exp
+//-------- T __nv_expf
+FUNC1(exp)
+
+//-------- T __nv_exp10
+//-------- T __nv_exp10f
+FUNC1(exp10)
+
+//-------- T __nv_exp2
+//-------- T __nv_exp2f
+FUNC1(exp2)
+
+//-------- T __nv_expm1
+//-------- T __nv_expm1f
+FUNC1(expm1)
+
+//-------- T __nv_fabs
+//-------- T __nv_fabsf
+FUNC1(fabs)
+
+//-------- T __nv_fdim
+//-------- T __nv_fdimf
+FUNC2(fdim)
+
+//-------- T __nv_floor
+//-------- T __nv_floorf
+FUNC1(floor)
+
+//-------- T __nv_fma
+//-------- T __nv_fmaf
+FUNC3(fma)
+
+//-------- T __nv_fmax
+//-------- T __nv_fmaxf
+FUNC2(fmax)
+
+//-------- T __nv_fmin
+//-------- T __nv_fminf
+FUNC2(fmin)
+
+//-------- T __nv_fmod
+//-------- T __nv_fmodf
+FUNC2(fmod)
+
+//-------- T __nv_hypot
+//-------- T __nv_hypotf
+FUNC2(hypot)
+
+//-------- T __nv_j0
+//-------- T __nv_j0f
+FUNC1(j0)
+
+//-------- T __nv_j1
+//-------- T __nv_j1f
+FUNC1(j1)
+
+//-------- T __nv_lgamma
+//-------- T __nv_lgammaf
+FUNC1(lgamma)
+
+//-------- T __nv_log
+//-------- T __nv_logf
+FUNC1(log)
+
+//-------- T __nv_log10
+//-------- T __nv_log10f
+FUNC1(log10)
+
+//-------- T __nv_log1p
+//-------- T __nv_log1pf
+FUNC1(log1p)
+
+//-------- T __nv_log2
+//-------- T __nv_log2f
+FUNC1(log2)
+
+//-------- T __nv_logb
+//-------- T __nv_logbf
+FUNC1(logb)
+
+//-------- T __nv_pow
+//-------- T __nv_powf
+FUNC2(pow)
+
+//-------- T __nv_rcbrt
+//-------- T __nv_rcbrtf
+FUNC1(rcbrt)
+
+//-------- T __nv_remainder
+//-------- T __nv_remainderf
+FUNC2(remainder)
+
+//-------- T __nv_rhypot
+//-------- T __nv_rhypotf
+FUNC2(rhypot)
+
+//-------- T __nv_nearbyint
+//-------- T __nv_nearbyintf
+FUNC1(nearbyint)
+
+//-------- T __nv_nextafter
+//-------- T __nv_nextafterf
+FUNC2(nextafter)
+
+//-------- T __nv_rint
+//-------- T __nv_rintf
+FUNC1(rint)
+
+//-------- T __nv_round
+//-------- T __nv_roundf
+FUNC1(round)
+
+//-------- T __nv_rsqrt
+//-------- T __nv_rsqrtf
+FUNC1(rsqrt)
+
+//-------- T __nv_scalbn / __nv_scalbnf (the exponent is an int, so FUNC2 does not fit)
+ATTR double __nv_scalbn(double x, int n) { return __ocml_scalbn_f64(x, n); }
+ATTR float __nv_scalbnf(float x, int n) { return __ocml_scalbn_f32(x, n); }
+
+//-------- T __nv_sin
+//-------- T __nv_sinf
+FUNC1(sin)
+
+//-------- T __nv_sinh
+//-------- T __nv_sinhf
+FUNC1(sinh)
+
+//-------- T __nv_sinpi
+//-------- T __nv_sinpif
+FUNC1(sinpi)
+
+//-------- T __nv_sqrt
+//-------- T __nv_sqrtf
+FUNC1(sqrt)
+
+//-------- T __nv_tan
+//-------- T __nv_tanf
+FUNC1(tan)
+
+//-------- T __nv_tanh
+//-------- T __nv_tanhf
+FUNC1(tanh)
+
+//-------- T __nv_tgamma
+//-------- T __nv_tgammaf
+FUNC1(tgamma)
+
+//-------- T __nv_trunc
+//-------- T __nv_truncf
+FUNC1(trunc)
+
+//-------- T __nv_y0
+//-------- T __nv_y0f
+FUNC1(y0)
+
+//-------- T __nv_y1
+//-------- T __nv_y1f
+FUNC1(y1)
+
+//-------- T __nv_cyl_bessel_i0
+ATTR double __nv_cyl_bessel_i0(double x) { return __ocml_i0_f64(x); }
+
+//-------- T __nv_cyl_bessel_i0f
+ATTR float __nv_cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); }
+
+//-------- T __nv_cyl_bessel_i1
+ATTR double __nv_cyl_bessel_i1(double x) { return __ocml_i1_f64(x); }
+
+//-------- T __nv_cyl_bessel_i1f
+ATTR float __nv_cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); }
+
+//-------- T __nv_frexp
+ATTR double __nv_frexp(double x, __private int *ptr) { return __ocml_frexp_f64(x, ptr); }
+
+//-------- T __nv_frexpf
+ATTR float __nv_frexpf(float x, __private int *ptr) { return __ocml_frexp_f32(x, ptr); }
+
+//-------- T __nv_ilogb
+ATTR int __nv_ilogb(double x) { return __ocml_ilogb_f64(x); }
+
+//-------- T __nv_ilogbf
+ATTR int __nv_ilogbf(float x) { return __ocml_ilogb_f32(x); }
+
+//-------- T __nv_ldexp
+ATTR double __nv_ldexp(double x, int i) { return __ocml_ldexp_f64(x, i); }
+
+//-------- T __nv_ldexpf
+ATTR float __nv_ldexpf(float x, int i) { return __ocml_ldexp_f32(x, i); }
+
+//-------- T __nv_modf
+ATTR double __nv_modf(double x, __private double *ptr) { return __ocml_modf_f64(x, ptr); }
+
+//-------- T __nv_modff
+ATTR float __nv_modff(float x, __private float *ptr) { return __ocml_modf_f32(x, ptr); }
+
+//-------- T __nv_norm3d
+ATTR double __nv_norm3d(double x, double y, double z) { return __ocml_len3_f64(x,y,z); }
+
+//-------- T __nv_norm3df
+ATTR float __nv_norm3df(float x, float y, float z) { return __ocml_len3_f32(x,y,z); }
+
+//-------- T __nv_norm4d
+ATTR double __nv_norm4d(double a, double b, double c, double d) { return __ocml_len4_f64(a,b,c,d); }
+
+//-------- T __nv_norm4df
+ATTR float __nv_norm4df(float a, float b, float c, float d) { return __ocml_len4_f32(a,b,c,d); }
+
+//-------- T __nv_normcdf
+ATTR double __nv_normcdf(double x) { return __ocml_ncdf_f64(x); }
+
+//-------- T __nv_normcdff
+ATTR float __nv_normcdff(float x) { return __ocml_ncdf_f32(x); }
+
+//-------- T __nv_normcdfinv
+ATTR double __nv_normcdfinv(double x) { return __ocml_ncdfinv_f64(x); }
+
+//-------- T __nv_normcdfinvf
+ATTR float __nv_normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); }
+
+//-------- T __nv_powi
+ATTR double __nv_powi(double x, int n) { return __ocml_pown_f64(x, n); }
+
+//-------- T __nv_powif
+ATTR float __nv_powif(float x, int n) { return __ocml_pown_f32(x, n); }
+
+//-------- T __nv_remquo
+ATTR double __nv_remquo(double x, double y, __private int *ptr) { return __ocml_remquo_f64(x, y, ptr); }
+
+//-------- T __nv_remquof
+ATTR float __nv_remquof(float x, float y, __private int *ptr) { return __ocml_remquo_f32(x, y, ptr); }
+
+//-------- T __nv_saturatef
+ATTR float __nv_saturatef(float x) { return __ocml_min_f32(__ocml_max_f32(x, 0.0f), 1.0f); }
+
+//-------- T __nv_signbitd
+ATTR int __nv_signbitd(double x) { return __ocml_signbit_f64(x); }
+
+//-------- T __nv_signbitf
+ATTR int __nv_signbitf(float x) { return __ocml_signbit_f32(x); }
+
+//-------- T __nv_sincos
+ATTR void __nv_sincos(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincos_f64(x, cptr); }
+
+//-------- T __nv_sincosf
+ATTR void __nv_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); }
+
+//-------- T __nv_sincospi
+ATTR void __nv_sincospi(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincospi_f64(x, cptr); }
+
+//-------- T __nv_sincospif (stores sin(pi*x) through sptr; __ocml_sincospi_f32 writes the cosine through cptr)
+ATTR void __nv_sincospif(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincospi_f32(x, cptr); }
+
diff --git a/cuda2gcn/src/precision.cl b/cuda2gcn/src/precision.cl
new file mode 100644
index 00000000..21a13d6e
--- /dev/null
+++ b/cuda2gcn/src/precision.cl
@@ -0,0 +1,56 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define FUNC1F(root) \
+  ATTR float __nv_fast_##root##f(float x) { return __ocml_##root##_f32(x); }
+#define FUNC1(root) FUNC1F(root)
+
+#define FUNC2F(root) \
+  ATTR float __nv_fast_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); }
+#define FUNC2(root) FUNC2F(root)
+
+#define FUNC3F(root) \
+  ATTR float __nv_fast_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); }
+#define FUNC3(root) FUNC3F(root)
+
+//-------- T __nv_fast_cosf
+FUNC1(cos)
+
+//-------- T __nv_fast_exp10f
+FUNC1(exp10)
+
+//-------- T __nv_fast_expf
+FUNC1(exp)
+
+//-------- T __nv_fast_log10f
+FUNC1(log10)
+
+//-------- T __nv_fast_log2f
+FUNC1(log2)
+
+//-------- T __nv_fast_logf
+FUNC1(log)
+
+//-------- T __nv_fast_powf
+FUNC2(pow)
+
+//-------- T __nv_fast_sinf
+FUNC1(sin)
+
+//-------- T __nv_fast_tanf
+FUNC1(tan)
+
+//-------- T __nv_fast_fdividef
+ATTR float __nv_fast_fdividef(float x, float y) { return native_divide(x, y); }
+
+//-------- T __nv_fast_sincosf
+ATTR void __nv_fast_sincosf(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincos_f32(x, cptr); }
+
diff --git a/cuda2gcn/src/reinterpret.cl b/cuda2gcn/src/reinterpret.cl
new file mode 100644
index 00000000..0d55cded
--- /dev/null
+++ b/cuda2gcn/src/reinterpret.cl
@@ -0,0 +1,63 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_double_as_longlong
+ATTR long __nv_double_as_longlong(double x)
+{
+  return as_long(x);
+}
+
+//-------- T __nv_float_as_int
+ATTR int __nv_float_as_int(float x)
+{
+  return as_int(x);
+}
+
+//-------- T __nv_float_as_uint
+ATTR unsigned int __nv_float_as_uint(float x)
+{
+  return as_uint(x);
+}
+
+//-------- T __nv_int_as_float
+ATTR float __nv_int_as_float(int x)
+{
+  return as_float(x);
+}
+
+//-------- T __nv_longlong_as_double
+ATTR double __nv_longlong_as_double(long x)
+{
+  return as_double(x);
+}
+
+//-------- T __nv_uint_as_float
+ATTR float __nv_uint_as_float(unsigned int x)
+{
+  return as_float(x);
+}
+
+//-------- T __nv_double2hiint
+int __nv_double2hiint(double x)
+{
+    return (int)(as_long(x) >> 32); // shift the 64-bit pattern first, then narrow to the high word
+}
+
+//-------- T __nv_double2loint
+int __nv_double2loint(double x)
+{
+    return (int) as_long(x);
+}
+
+//-------- T __nv_hiloint2double
+double __nv_hiloint2double(int x, int y)
+{
+    return as_double(((ulong)(uint)x << 32) | (ulong)(uint)y); // zero-extend y so a negative low word cannot overwrite the high word
+}
+
diff --git a/cuda2gcn/src/rounding.cl b/cuda2gcn/src/rounding.cl
new file mode 100644
index 00000000..a377e39d
--- /dev/null
+++ b/cuda2gcn/src/rounding.cl
@@ -0,0 +1,23 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml.h"
+
+#define ATTR __attribute__((always_inline, const))
+
+//-------- T __nv_llrint
+ATTR long __nv_llrint(double x) { return (long)__ocml_rint_f64(x); }
+
+//-------- T __nv_llrintf
+ATTR long __nv_llrintf(float x) { return (long)__ocml_rint_f32(x); }
+
+//-------- T __nv_llround
+ATTR long __nv_llround(double x) { return (long)__ocml_round_f64(x); }
+
+//-------- T __nv_llroundf
+ATTR long __nv_llroundf(float x) { return (long)__ocml_round_f32(x); }
+

From 21000a22bd0839ea102e9332ab5843c796c2c14a Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Wed, 12 Jul 2017 19:10:25 -0400
Subject: [PATCH 04/25] Update syncscope usage based on  
 https://reviews.llvm.org/rL307722

Change-Id: Iaf3d356d753b4665fc2ceb108952976e55705904
---
 irif/src/fence.ll | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/irif/src/fence.ll b/irif/src/fence.ll
index 14f04b03..0bcaaaa9 100644
--- a/irif/src/fence.ll
+++ b/irif/src/fence.ll
@@ -1,27 +1,23 @@
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 target triple = "amdgcn--amdhsa"
 
-;;
-;; syncscope number mapping is in llvm/target/AMDGPU/AMDGPU.h class AMDGPUSynchronizationScope
-;;
-
 define void @__llvm_fence_acq_wi() local_unnamed_addr #0 {
-  fence syncscope(5) acquire
+  fence syncscope("singlethread") acquire
   ret void
 }
 
 define void @__llvm_fence_acq_sg() local_unnamed_addr #0 {
-  fence syncscope(4) acquire
+  fence syncscope("wavefront") acquire
   ret void
 }
 
 define void @__llvm_fence_acq_wg() local_unnamed_addr #0 {
-  fence syncscope(3) acquire
+  fence syncscope("workgroup") acquire
   ret void
 }
 
 define void @__llvm_fence_acq_dev() local_unnamed_addr #0 {
-  fence syncscope(2) acquire
+  fence syncscope("agent") acquire
   ret void
 }
 
@@ -31,22 +27,22 @@ define void @__llvm_fence_acq_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_rel_wi() local_unnamed_addr #0 {
-  fence syncscope(5) release
+  fence syncscope("singlethread") release
   ret void
 }
 
 define void @__llvm_fence_rel_sg() local_unnamed_addr #0 {
-  fence syncscope(4) release
+  fence syncscope("wavefront") release
   ret void
 }
 
 define void @__llvm_fence_rel_wg() local_unnamed_addr #0 {
-  fence syncscope(3) release
+  fence syncscope("workgroup") release
   ret void
 }
 
 define void @__llvm_fence_rel_dev() local_unnamed_addr #0 {
-  fence syncscope(2) release
+  fence syncscope("agent") release
   ret void
 }
 
@@ -56,22 +52,22 @@ define void @__llvm_fence_rel_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_ar_wi() local_unnamed_addr #0 {
-  fence syncscope(5) acq_rel
+  fence syncscope("singlethread") acq_rel
   ret void
 }
 
 define void @__llvm_fence_ar_sg() local_unnamed_addr #0 {
-  fence syncscope(4) acq_rel
+  fence syncscope("wavefront") acq_rel
   ret void
 }
 
 define void @__llvm_fence_ar_wg() local_unnamed_addr #0 {
-  fence syncscope(3) acq_rel
+  fence syncscope("workgroup") acq_rel
   ret void
 }
 
 define void @__llvm_fence_ar_dev() local_unnamed_addr #0 {
-  fence syncscope(2) acq_rel
+  fence syncscope("agent") acq_rel
   ret void
 }
 
@@ -81,22 +77,22 @@ define void @__llvm_fence_ar_sys() local_unnamed_addr #0 {
 }
 
 define void @__llvm_fence_sc_wi() local_unnamed_addr #0 {
-  fence syncscope(5) seq_cst
+  fence syncscope("singlethread") seq_cst
   ret void
 }
 
 define void @__llvm_fence_sc_sg() local_unnamed_addr #0 {
-  fence syncscope(4) seq_cst
+  fence syncscope("wavefront") seq_cst
   ret void
 }
 
 define void @__llvm_fence_sc_wg() local_unnamed_addr #0 {
-  fence syncscope(3) seq_cst
+  fence syncscope("workgroup") seq_cst
   ret void
 }
 
 define void @__llvm_fence_sc_dev() local_unnamed_addr #0 {
-  fence syncscope(2) seq_cst
+  fence syncscope("agent") seq_cst
   ret void
 }
 

From 757a62df50fec586925bfaa50b663604738b273d Mon Sep 17 00:00:00 2001
From: Guansong Zhang <guansong.zhang@amd.com>
Date: Thu, 13 Jul 2017 23:08:46 -0400
Subject: [PATCH 05/25] Add amdgcn--cuda as an option of AMDGPU_TARGET_TRIPLE

Change-Id: I4423aaab86cce06eb750e422fe855e21312406fa
---
 CMakeLists.txt             |  7 ++++++-
 hc/CMakeLists.txt          |  2 +-
 irif/CMakeLists.txt        |  2 +-
 opencl/CMakeLists.txt      |  2 +-
 utils/add_amdgiz.sed       |  3 ++-
 utils/change-addr-space.sh | 12 +++++++++---
 utils/remove_amdgiz.sed    |  3 ++-
 7 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f46fba2e..1008f1a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,9 @@ if (GENERIC_IS_ZERO)
   set(AMDGPU_TARGET_TRIPLE "amdgcn--amdhsa-amdgizcl")
   # HCC will execute utils/change-addr-space.sh
   # and apply utils/add_amdgiz.sed on all .ll files in subdirectory hc/, irif/, opencl/
+  if (CUDA_TRIPLE)
+    set(AMDGPU_TARGET_TRIPLE "amdgcn--cuda")
+  endif (CUDA_TRIPLE)
 
 endif (GENERIC_IS_ZERO)
 
@@ -52,7 +55,9 @@ add_subdirectory(oclc)
 add_subdirectory(ocml)
 add_subdirectory(ockl)
 add_subdirectory(opencl)
-add_subdirectory(cuda2gcn)
+if (CUDA_TRIPLE)
+  add_subdirectory(cuda2gcn)
+endif (CUDA_TRIPLE)
 
 if(BUILD_HC_LIB)
   add_subdirectory(hc)
diff --git a/hc/CMakeLists.txt b/hc/CMakeLists.txt
index 6c4eb4e7..d8d0c0d3 100644
--- a/hc/CMakeLists.txt
+++ b/hc/CMakeLists.txt
@@ -24,7 +24,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs
diff --git a/irif/CMakeLists.txt b/irif/CMakeLists.txt
index 37e89dca..12b0397d 100644
--- a/irif/CMakeLists.txt
+++ b/irif/CMakeLists.txt
@@ -20,7 +20,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB srcs
diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt
index 8ac5f76f..ed78ec85 100644
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@@ -36,7 +36,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs
diff --git a/utils/add_amdgiz.sed b/utils/add_amdgiz.sed
index 995f6af9..ee495d3c 100755
--- a/utils/add_amdgiz.sed
+++ b/utils/add_amdgiz.sed
@@ -6,7 +6,8 @@
 #######################
 
 # amdgcn--amdhsa-amd -> amdgcn--amdhsa-amdgiz
-/target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/
+# This is now done directly by change-addr-space.sh
+# /target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/
 
 #####################
 # change data layout
diff --git a/utils/change-addr-space.sh b/utils/change-addr-space.sh
index 02d82af5..cfdea769 100755
--- a/utils/change-addr-space.sh
+++ b/utils/change-addr-space.sh
@@ -6,8 +6,14 @@
 # utils/change-addr-space.sh src x : apply utils/remove_amdgiz.sed
 #                                adopt generic address space is address space 4
 
-if [ $# -lt 2 ]; then
-  find . -name "*.ll" | xargs sed -i -f "$1/add_amdgiz.sed"
+tmpfile=$(mktemp "${TMPDIR:-/tmp}/cas.XXXXXX") || exit 1  # private, unpredictable scratch sed script
+if [ $# -lt 3 ]; then
+  echo "/target triple/s/\\\"amdgcn--amdhsa\\\"/\\\"${1}\\\"/" >"$tmpfile"
+  cat "$2/add_amdgiz.sed" >>"$tmpfile"
 else
-  find . -name "*.ll" | xargs sed -i -f "$1/remove_amdgiz.sed"
+  echo "/target triple/s/\\\"${1}\\\"/\\\"amdgcn--amdhsa\\\"/" >"$tmpfile"
+  cat "$2/remove_amdgiz.sed" >>"$tmpfile"
 fi
+# $1 = target triple, $2 = directory holding the .sed files; rewrite every .ll file under the cwd.
+find . -name "*.ll" -exec sed -i -f "$tmpfile" {} +
+rm -f "$tmpfile"
diff --git a/utils/remove_amdgiz.sed b/utils/remove_amdgiz.sed
index d10630c6..7c76dd78 100755
--- a/utils/remove_amdgiz.sed
+++ b/utils/remove_amdgiz.sed
@@ -6,7 +6,8 @@
 #######################
 
 # amdgcn--amdhsa-amdgiz -> amdgcn--amdhsa
-/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/
+# This is now done directly by change-addr-space.sh
+#/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/
 
 #####################
 # change data layout

From 835f0785984276b7e25520536d27864193dcaf37 Mon Sep 17 00:00:00 2001
From: Guansong Zhang <guansong.zhang@amd.com>
Date: Tue, 18 Jul 2017 10:12:03 -0400
Subject: [PATCH 06/25] Use irif.h header to include functions using inline asm

Change-Id: I4658fd709d808529d25fa3d524c895ade73a9103
---
 cuda2gcn/src/bitsbytes.cl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/cuda2gcn/src/bitsbytes.cl b/cuda2gcn/src/bitsbytes.cl
index 03cb20cb..2df61c5a 100644
--- a/cuda2gcn/src/bitsbytes.cl
+++ b/cuda2gcn/src/bitsbytes.cl
@@ -5,12 +5,8 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern __attribute__((const)) int __llvm_bitreverse_i32(int);
-extern __attribute__((const)) long __llvm_bitreverse_i64(long);
-extern __attribute__((const)) int __llvm_ctpop_i32(int);
-extern __attribute__((const)) long __llvm_ctpop_i64(long);
-
 #include "ockl.h"
+#include "irif.h"
 
 #define ATTR __attribute__((always_inline, const))
 

From 8e108c46e77cbab82d945b0af989fe145d87ed54 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Wed, 26 Jul 2017 14:23:05 -0700
Subject: [PATCH 07/25] Eliminate internal use of out arguments

Change-Id: Ic89087ee823bb6ea2fb571f11ce1bcf6f582b921
---
 ocml/src/cosD.cl          | 12 ++++-----
 ocml/src/cosF.cl          | 19 +++++--------
 ocml/src/cosH.cl          | 12 ++++-----
 ocml/src/cosbD.cl         | 24 ++++++++---------
 ocml/src/cosbF.cl         | 33 ++++++++++-------------
 ocml/src/cospiD.cl        | 12 ++++-----
 ocml/src/cospiF.cl        | 15 +++++------
 ocml/src/cospiH.cl        | 12 ++++-----
 ocml/src/sinD.cl          | 13 ++++-----
 ocml/src/sinF.cl          | 18 +++++--------
 ocml/src/sinH.cl          | 11 +++-----
 ocml/src/sinbD.cl         | 24 ++++++++---------
 ocml/src/sinbF.cl         | 33 ++++++++++-------------
 ocml/src/sincosD.cl       | 17 +++++-------
 ocml/src/sincosF.cl       | 24 +++++++----------
 ocml/src/sincosH.cl       | 19 ++++++-------
 ocml/src/sincospiD.cl     | 17 +++++-------
 ocml/src/sincospiF.cl     | 17 +++++-------
 ocml/src/sincospiH.cl     | 18 +++++--------
 ocml/src/sincospiredD.cl  | 12 +++++----
 ocml/src/sincospiredF.cl  | 11 +++++---
 ocml/src/sincospiredH.cl  | 12 +++++----
 ocml/src/sincosred2D.cl   | 11 +++++---
 ocml/src/sincosred2F.cl   | 11 +++++---
 ocml/src/sincosredD.cl    | 11 +++++---
 ocml/src/sincosredF.cl    | 10 ++++---
 ocml/src/sincosredH.cl    | 10 ++++---
 ocml/src/sinpiD.cl        | 11 +++-----
 ocml/src/sinpiF.cl        | 12 +++------
 ocml/src/sinpiH.cl        | 11 +++-----
 ocml/src/tanD.cl          |  7 +++--
 ocml/src/tanF.cl          | 12 +++------
 ocml/src/tanH.cl          |  6 ++---
 ocml/src/tanpiD.cl        |  8 +++---
 ocml/src/tanpiF.cl        |  8 +++---
 ocml/src/tanpiH.cl        |  8 +++---
 ocml/src/trigpiredD.cl    | 11 +++++---
 ocml/src/trigpiredD.h     | 14 ++++++++--
 ocml/src/trigpiredF.cl    | 11 +++++---
 ocml/src/trigpiredF.h     | 14 ++++++++--
 ocml/src/trigpiredH.cl    | 11 +++++---
 ocml/src/trigpiredH.h     | 14 ++++++++--
 ocml/src/trigredD.cl      |  8 +++---
 ocml/src/trigredD.h       | 21 +++++++++++----
 ocml/src/trigredF.cl      | 20 +++-----------
 ocml/src/trigredF.h       | 31 ++++++++++++++++------
 ocml/src/trigredH.cl      | 12 +++++----
 ocml/src/trigredH.h       | 16 ++++++++---
 ocml/src/trigredlargeD.cl | 12 +++++----
 ocml/src/trigredlargeF.cl | 18 ++++++-------
 ocml/src/trigredsmallD.cl | 13 ++++-----
 ocml/src/trigredsmallF.cl | 56 +++++++++++++++------------------------
 52 files changed, 397 insertions(+), 406 deletions(-)

diff --git a/ocml/src/cosD.cl b/ocml/src/cosD.cl
index fcb55925..12a43884 100644
--- a/ocml/src/cosD.cl
+++ b/ocml/src/cosD.cl
@@ -11,14 +11,12 @@
 INLINEATTR double
 MATH_MANGLE(cos)(double x)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
+    sc.s = -sc.s;
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincosred2)(r, rr, &cc);
-
-    int2 c = AS_INT2((regn & 1) != 0 ? ss : cc);
-    c.hi ^= regn > 1 ? (int)0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
+    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : c;
diff --git a/ocml/src/cosF.cl b/ocml/src/cosF.cl
index 63da099e..a0768dd2 100644
--- a/ocml/src/cosF.cl
+++ b/ocml/src/cosF.cl
@@ -14,22 +14,17 @@ MATH_MANGLE(cos)(float x)
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
+    sc.s = -sc.s;
 
-    float c =  (regn & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (regn > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
 
     if (!FINITE_ONLY_OPT()) {
         c = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : c;
diff --git a/ocml/src/cosH.cl b/ocml/src/cosH.cl
index 00df3a62..3aa1295b 100644
--- a/ocml/src/cosH.cl
+++ b/ocml/src/cosH.cl
@@ -13,14 +13,12 @@ UGEN(cos)
 INLINEATTR half
 MATH_MANGLE(cos)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
+    sc.s = -sc.s;
 
-    half cc;
-    half ss = -MATH_PRIVATE(sincosred)(r, &cc);
-
-    short c =  AS_SHORT((i & 1) == 0 ? cc : ss);
-    c ^= i > 1 ? (short)0x8000 : (short)0;
+    short c =  AS_SHORT((r.i & 1) == (short)0 ? sc.c : sc.s);
+    c ^= r.i > 1 ? (short)0x8000 : (short)0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : c;
diff --git a/ocml/src/cosbD.cl b/ocml/src/cosbD.cl
index 36b4f178..49ca4e7f 100644
--- a/ocml/src/cosbD.cl
+++ b/ocml/src/cosbD.cl
@@ -27,28 +27,28 @@
 INLINEATTR double
 MATH_PRIVATE(cosb)(double x, int n, double p)
 {
-    double ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
     // This is a properly signed extra precise pi/4
-    ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
-    pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+    double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
+    double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
 
+    double sh, sl;
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
+    sc.s = -sc.s;
 
-    int2 c = AS_INT2((i & 1) != 0 ? ss : cc);
-    c.hi ^= i > 1 ? 0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c);
+    c.hi ^= r.i > 1 ? 0x80000000 : 0;
 
     return AS_DOUBLE(c);
 }
diff --git a/ocml/src/cosbF.cl b/ocml/src/cosbF.cl
index 10aab950..34e5d857 100644
--- a/ocml/src/cosbF.cl
+++ b/ocml/src/cosbF.cl
@@ -27,39 +27,34 @@
 INLINEATTR float
 MATH_PRIVATE(cosb)(float x, int n, float p)
 {
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
 #if defined EXTRA_PRECISION
-    float ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
 
-    ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
-    pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
+    float sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
 #else
-    float r;
-    int i = MATH_PRIVATE(trigred)(&r, x);
-    bool b = r < p;
-    i = (i - b - n) & 3;
-    r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
 
-    float cc;
-    float ss = -MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
+    sc.s = -sc.s;
 
-    float c =  (i & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
     return c;
 }
 
diff --git a/ocml/src/cospiD.cl b/ocml/src/cospiD.cl
index 57686b03..ef407179 100644
--- a/ocml/src/cospiD.cl
+++ b/ocml/src/cospiD.cl
@@ -11,14 +11,12 @@
 INLINEATTR double
 MATH_MANGLE(cospi)(double x)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    double cc;
-    double ss = -MATH_PRIVATE(sincospired)(t, &cc);
-
-    int2 c = AS_INT2((i & 1) == 0 ? cc : ss);
-    c.hi ^= i > 1 ? (int)0x80000000 : 0;
+    int2 c = AS_INT2((r.i & 1) == 0 ? sc.c : sc.s);
+    c.hi ^= r.i > 1 ? (int)0x80000000 : 0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : c;
diff --git a/ocml/src/cospiF.cl b/ocml/src/cospiF.cl
index 1d9ed3ee..6891ba87 100644
--- a/ocml/src/cospiF.cl
+++ b/ocml/src/cospiF.cl
@@ -8,19 +8,16 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+CONSTATTR INLINEATTR float
 MATH_MANGLE(cospi)(float x)
 {
     int ax = AS_INT(x) & 0x7fffffff;
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    float r;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r);
-
-    float cc;
-    float ss = -MATH_PRIVATE(sincospired)(r, &cc);
-
-    float c =  (i & 1) != 0 ? ss : cc;
-    c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0));
+    float c =  (r.i & 1) != 0 ? sc.s : sc.c;
+    c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0));
 
     if (!FINITE_ONLY_OPT()) {
         c = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : c;
diff --git a/ocml/src/cospiH.cl b/ocml/src/cospiH.cl
index 830bc239..e4b74521 100644
--- a/ocml/src/cospiH.cl
+++ b/ocml/src/cospiH.cl
@@ -13,14 +13,12 @@ UGEN(cospi)
 INLINEATTR half
 MATH_MANGLE(cospi)(half x)
 {
-    half t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
+    sc.s = -sc.s;
 
-    half cc;
-    half ss = -MATH_PRIVATE(sincospired)(t, &cc);
-
-    short c =  AS_SHORT((i & (short)1) == (short)0 ? cc : ss);
-    c ^= i > (short)1 ? (short)0x8000 : (short)0;
+    short c =  AS_SHORT((r.i & (short)1) == (short)0 ? sc.c : sc.s);
+    c ^= r.i > (short)1 ? (short)0x8000 : (short)0;
 
     if (!FINITE_ONLY_OPT()) {
         c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : c;
diff --git a/ocml/src/sinD.cl b/ocml/src/sinD.cl
index 7ec233bc..c44c1101 100644
--- a/ocml/src/sinD.cl
+++ b/ocml/src/sinD.cl
@@ -8,17 +8,14 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+CONSTATTR INLINEATTR double
 MATH_MANGLE(sin)(double x)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc);
-
-    int2 s = AS_INT2((regn & 1) == 0 ? ss : cc);
-    s.hi ^= (regn > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : s;
diff --git a/ocml/src/sinF.cl b/ocml/src/sinF.cl
index fe6a75d8..c42c05e4 100644
--- a/ocml/src/sinF.cl
+++ b/ocml/src/sinF.cl
@@ -14,22 +14,16 @@ MATH_MANGLE(sin)(float x)
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r =  MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    float s = (regn & 1) != 0 ? cc : ss;
-    s = AS_FLOAT(AS_INT(s) ^ (regn > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
+    float s = (r.i & 1) != 0 ? sc.c : sc.s;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
 
     if (!FINITE_ONLY_OPT()) {
         s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s;
diff --git a/ocml/src/sinH.cl b/ocml/src/sinH.cl
index 1c92458d..7cd9aae7 100644
--- a/ocml/src/sinH.cl
+++ b/ocml/src/sinH.cl
@@ -13,14 +13,11 @@ UGEN(sin)
 INLINEATTR half
 MATH_MANGLE(sin)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc =  MATH_PRIVATE(sincosred)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincosred)(r, &cc);
-
-    short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc);
-    s ^= (i > (short)1 ? (short)0x8000 : 0) ^ (AS_SHORT(x) & (short)0x8000);
+    short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c);
+    s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : s;
diff --git a/ocml/src/sinbD.cl b/ocml/src/sinbD.cl
index b300f031..c8db0800 100644
--- a/ocml/src/sinbD.cl
+++ b/ocml/src/sinbD.cl
@@ -27,28 +27,28 @@
 INLINEATTR double
 MATH_PRIVATE(sinb)(double x, int n, double p)
 {
-    double ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
 
     // This is a properly signed extra precise pi/4
-    ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
-    pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+    double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0)));
+    double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0)));
+
+    double sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl);
 
-    int2 s = AS_INT2((i & 1) == 0 ? ss : cc);
-    s.hi ^= i > 1 ? 0x80000000 : 0;
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= r.i > 1 ? 0x80000000 : 0;
 
     return AS_DOUBLE(s);
 }
diff --git a/ocml/src/sinbF.cl b/ocml/src/sinbF.cl
index 9e26d0b6..0bd14e13 100644
--- a/ocml/src/sinbF.cl
+++ b/ocml/src/sinbF.cl
@@ -27,38 +27,33 @@
 INLINEATTR float
 MATH_PRIVATE(sinb)(float x, int n, float p)
 {
+    struct redret r = MATH_PRIVATE(trigred)(x);
+    bool b = r.hi < p;
+    r.i = (r.i - b - n) & 3;
+
 #if defined EXTRA_PRECISION
-    float ph, pl, rh, rl, sh, sl;
-    int i = MATH_PRIVATE(trigred)(&rh, &rl, x);
-    bool b = rh < p;
-    i = (i - b - n) & 3;
+    float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
 
-    ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
-    pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0));
+    float sh, sl;
 
     FDIF2(ph, p, ph, sl);
     pl += sl;
     FSUM2(ph, pl, ph, pl);
 
-    FSUM2(ph, rh, sh, sl);
-    sl += pl + rl;
+    FSUM2(ph, r.hi, sh, sl);
+    sl += pl + r.lo;
     FSUM2(sh, sl, sh, sl);
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc);
+    struct scret sc =  MATH_PRIVATE(sincosred2)(sh, sl);
 #else
-    float r;
-    int i = MATH_PRIVATE(trigred)(&r, x);
-    bool b = r < p;
-    i = (i - b - n) & 3;
-    r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
+    r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc =  MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    float s = (i & 1) != 0 ? cc : ss;
-    s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0));
+    float s = (r.i & 1) != 0 ? sc.c : sc.s;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0));
     return s;
 }
 
diff --git a/ocml/src/sincosD.cl b/ocml/src/sincosD.cl
index de851c34..ba74767d 100644
--- a/ocml/src/sincosD.cl
+++ b/ocml/src/sincosD.cl
@@ -11,19 +11,16 @@
 INLINEATTR double
 MATH_MANGLE(sincos)(double x, __private double * cp)
 {
-    double r, rr;
-    int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc);
+    int flip = r.i > 1 ? (int)0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
 
-    int flip = regn > 1 ? (int)0x80000000 : 0;
-    bool odd = (regn & 1) != 0;
-
-    int2 s = AS_INT2(odd ? cc : ss);
+    int2 s = AS_INT2(odd ? sc.c : sc.s);
     s.hi ^= flip ^ (AS_INT2(x).hi &(int)0x80000000);
-    ss = -ss;
-    int2 c = AS_INT2(odd ? ss : cc);
+    sc.s = -sc.s;
+    int2 c = AS_INT2(odd ? sc.s : sc.c);
     c.hi ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincosF.cl b/ocml/src/sincosF.cl
index 1baa857f..a1286cc5 100644
--- a/ocml/src/sincosF.cl
+++ b/ocml/src/sincosF.cl
@@ -14,26 +14,20 @@ MATH_MANGLE(sincos)(float x, __private float *cp)
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float cc;
-    float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc);
+#if defined EXTRA_PRECISION
+    struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float cc;
-    float ss = MATH_PRIVATE(sincosred)(r, &cc);
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 #endif
 
-    int flip = regn > 1 ? 0x80000000 : 0;
-    bool odd = (regn & 1) != 0;
-    float s = odd ? cc : ss;
+    int flip = r.i > 1 ? 0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
+    float s = odd ? sc.c : sc.s;
     s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix));
-    ss = -ss;
-    float c = odd ? ss : cc;
+    sc.s = -sc.s;
+    float c = odd ? sc.s : sc.c;
     c = AS_FLOAT(AS_INT(c) ^ flip);
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincosH.cl b/ocml/src/sincosH.cl
index 43a35c6b..8c314f12 100644
--- a/ocml/src/sincosH.cl
+++ b/ocml/src/sincosH.cl
@@ -19,21 +19,18 @@ MATH_MANGLE2(sincos)(half2 x, __private half2 *cp)
     return s;
 }
 
-INLINEATTR half
+INLINEATTR half // no CONSTATTR: this function takes the writable pointer cp
 MATH_MANGLE(sincos)(half x, __private half *cp)
 {
-    half r;
-    short regn = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincosred)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincosred)(r, &cc);
-
-    short flip = regn > (short)1 ? (short)0x8000 : (short)0;
-    bool odd = (regn & 1) != 0;
-    short s = AS_SHORT(odd ? cc : ss);
+    short flip = r.i > (short)1 ? (short)0x8000 : (short)0;
+    bool odd = (r.i & (short)1) != (short)0;
+    short s = AS_SHORT(odd ? sc.c : sc.s);
     s ^= flip ^ (AS_SHORT(x) & (short)0x8000);
-    ss = -ss;
-    short c = AS_SHORT(odd ? ss : cc);
+    sc.s = -sc.s;
+    short c = AS_SHORT(odd ? sc.s : sc.c);
     c ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiD.cl b/ocml/src/sincospiD.cl
index 1b92e61a..41e1438d 100644
--- a/ocml/src/sincospiD.cl
+++ b/ocml/src/sincospiD.cl
@@ -11,19 +11,16 @@
 INLINEATTR double
 MATH_MANGLE(sincospi)(double x, __private double * cp)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincospired)(t, &cc);
+    int flip = r.i > 1 ? (int)0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
 
-    int flip = i > 1 ? (int)0x80000000 : 0;
-    bool odd = (i & 1) != 0;
-
-    int2 s = AS_INT2(odd ? cc : ss);
+    int2 s = AS_INT2(odd ? sc.c : sc.s);
     s.hi ^= flip ^ (AS_INT2(x).hi & 0x80000000);
-    ss = -ss;
-    int2 c = AS_INT2(odd ? ss : cc);
+    sc.s = -sc.s;
+    int2 c = AS_INT2(odd ? sc.s : sc.c);
     c.hi ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiF.cl b/ocml/src/sincospiF.cl
index af3528ce..91b61dc4 100644
--- a/ocml/src/sincospiF.cl
+++ b/ocml/src/sincospiF.cl
@@ -14,18 +14,15 @@ MATH_MANGLE(sincospi)(float x, __private float *cp)
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-    float t;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    float cc;
-    float ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    int flip = i > 1 ? 0x80000000 : 0;
-    bool odd = (i & 1) != 0;
-    float s = odd ? cc : ss;
+    int flip = r.i > 1 ? 0x80000000 : 0;
+    bool odd = (r.i & 1) != 0;
+    float s = odd ? sc.c : sc.s;
     s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix));
-    ss = -ss;
-    float c = odd ? ss : cc;
+    sc.s = -sc.s;
+    float c = odd ? sc.s : sc.c;
     c = AS_FLOAT(AS_INT(c) ^ flip);
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiH.cl b/ocml/src/sincospiH.cl
index cba66af7..495bac5c 100644
--- a/ocml/src/sincospiH.cl
+++ b/ocml/src/sincospiH.cl
@@ -23,19 +23,15 @@ MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp)
 INLINEATTR half
 MATH_MANGLE(sincospi)(half x, __private half *cp)
 {
-    half t;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    short flip = i > (short)1 ? (short)0x8000 : (short)0;
-    bool odd = (i & (short)1) != (short)0;
-
-    short s = AS_SHORT(odd ? cc : ss);
+    short flip = r.i > (short)1 ? (short)0x8000 : (short)0;
+    bool odd = (r.i & (short)1) != (short)0;
+    short s = AS_SHORT(odd ? sc.c : sc.s);
     s ^= flip ^ (AS_SHORT(x) & (short)0x8000);
-    ss = -ss;
-    short c = AS_SHORT(odd ? ss : cc);
+    sc.s = -sc.s;
+    short c = AS_SHORT(odd ? sc.s : sc.c);
     c ^= flip;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/sincospiredD.cl b/ocml/src/sincospiredD.cl
index 5200346a..4e750f8f 100644
--- a/ocml/src/sincospiredD.cl
+++ b/ocml/src/sincospiredD.cl
@@ -6,11 +6,11 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigpiredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincospired)(double x, __private double *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincospired)(double x)
 {
-
     double t = x * x;
 
     double sx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 
@@ -26,7 +26,9 @@ MATH_PRIVATE(sincospired)(double x, __private double *cp)
                     -0x1.55d3c7e3c325bp+0), 0x1.03c1f081b5a67p+2), -0x1.3bd3cc9be45dep+2);
     cx = MATH_MAD(t, cx, 1.0);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincospiredF.cl b/ocml/src/sincospiredF.cl
index 786036a1..1a528847 100644
--- a/ocml/src/sincospiredF.cl
+++ b/ocml/src/sincospiredF.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathF.h"
+#include "trigpiredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincospired)(float x, __private float *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincospired)(float x)
 {
 
     float t = x * x;
@@ -23,7 +24,9 @@ MATH_PRIVATE(sincospired)(float x, __private float *cp)
                    -0x1.3bd3ccp+2f);
     cx = MATH_MAD(t, cx, 1.0f);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincospiredH.cl b/ocml/src/sincospiredH.cl
index 28a0fa7a..a7aa0f3f 100644
--- a/ocml/src/sincospiredH.cl
+++ b/ocml/src/sincospiredH.cl
@@ -6,11 +6,11 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathH.h"
+#include "trigpiredH.h"
 
-INLINEATTR half
-MATH_PRIVATE(sincospired)(half x, __private half *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincospired)(half x)
 {
-
     half t = x * x;
 
     half sx = MATH_MAD(t, 0x1.b84p+0h, -0x1.46cp+2h);
@@ -20,7 +20,9 @@ MATH_PRIVATE(sincospired)(half x, __private half *cp)
     half cx = MATH_MAD(t, 0x1.fbp+1h, -0x1.3bcp+2h);
     cx = MATH_MAD(t, cx, 1.0h);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincosred2D.cl b/ocml/src/sincosred2D.cl
index 800c1021..e63b71a3 100644
--- a/ocml/src/sincosred2D.cl
+++ b/ocml/src/sincosred2D.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincosred2)(double x, double y)
 {
     const double S0 = -0x1.5555555555555p-3;
     const double S1 =  0x1.1111111110bb3p-7;
@@ -35,7 +36,9 @@ MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp)
     double sxy = MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1);
     sxy = x - MATH_MAD(-x3, S0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5*y), -y));
 
-    *cp = cxy;
-    return sxy;
+    struct scret ret;
+    ret.c = cxy;
+    ret.s = sxy;
+    return ret;
 }
 
diff --git a/ocml/src/sincosred2F.cl b/ocml/src/sincosred2F.cl
index 36767e53..96eb3c18 100644
--- a/ocml/src/sincosred2F.cl
+++ b/ocml/src/sincosred2F.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathF.h"
+#include "trigredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincosred2)(float x, float y)
 {
     const float c0 =  0x1.555556p-5f;
     const float c1 = -0x1.6c16b2p-10f;
@@ -32,7 +33,9 @@ MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp)
     float sxy = MATH_MAD(x2, MATH_MAD(x2, s3, s2), s1);
     sxy = x - MATH_MAD(-x3, s0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5f*y), -y));
 
-    *cp = cxy;
-    return sxy;
+    struct scret ret;
+    ret.c = cxy;
+    ret.s = sxy;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredD.cl b/ocml/src/sincosredD.cl
index ed64d24b..3d549195 100644
--- a/ocml/src/sincosredD.cl
+++ b/ocml/src/sincosredD.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathD.h"
+#include "trigredD.h"
 
-INLINEATTR double
-MATH_PRIVATE(sincosred)(double x, __private double *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincosred)(double x)
 {
     const double S0 = -0x1.5555555555555p-3;
     const double S1 =  0x1.1111111110bb3p-7;
@@ -33,7 +34,9 @@ MATH_PRIVATE(sincosred)(double x, __private double *cp)
     double cx = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, C5, C4), C3), C2), C1), C0), v);
     double sx = MATH_MAD(x2*x, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1), S0), x);
 
-    *cp = cx;
-    return sx;
+    struct scret ret;
+    ret.c = cx;
+    ret.s = sx;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredF.cl b/ocml/src/sincosredF.cl
index e4d2cfd5..8e21dfad 100644
--- a/ocml/src/sincosredF.cl
+++ b/ocml/src/sincosredF.cl
@@ -8,8 +8,8 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
-MATH_PRIVATE(sincosred)(float x, __private float *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincosred)(float x)
 {
     float t = x * x;
 
@@ -17,7 +17,9 @@ MATH_PRIVATE(sincosred)(float x, __private float *cp)
     float c = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
                   0x1.aea668p-16f, -0x1.6c9e76p-10f), 0x1.5557eep-5f), -0x1.000008p-1f), 1.0f);
 
-    *cp = c;
-    return s;
+    struct scret ret;
+    ret.c = c;
+    ret.s = s;
+    return ret;
 }
 
diff --git a/ocml/src/sincosredH.cl b/ocml/src/sincosredH.cl
index a3ffec57..202732d9 100644
--- a/ocml/src/sincosredH.cl
+++ b/ocml/src/sincosredH.cl
@@ -8,14 +8,16 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-INLINEATTR half
-MATH_PRIVATE(sincosred)(half x, __private half *cp)
+CONSTATTR INLINEATTR struct scret
+MATH_PRIVATE(sincosred)(half x)
 {
     half t = x * x;
     half s = MATH_MAD(x, t*MATH_MAD(t, 0x1.0bp-7h, -0x1.554p-3h), x);
     half c = MATH_MAD(t, MATH_MAD(t, 0x1.4b4p-5h, -0x1.ffcp-2h), 1.0h);
 
-    *cp = c;
-    return s;
+    struct scret ret;
+    ret.c = c;
+    ret.s = s;
+    return ret;
 }
 
diff --git a/ocml/src/sinpiD.cl b/ocml/src/sinpiD.cl
index 5393c792..84039c97 100644
--- a/ocml/src/sinpiD.cl
+++ b/ocml/src/sinpiD.cl
@@ -11,14 +11,11 @@
 INLINEATTR double
 MATH_MANGLE(sinpi)(double x)
 {
-    double t;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t);
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    double cc;
-    double ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    int2 s = AS_INT2((i & 1) == 0 ? ss : cc);
-    s.hi ^= (i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
+    int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c);
+    s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : s;
diff --git a/ocml/src/sinpiF.cl b/ocml/src/sinpiF.cl
index 50fa9a44..07f1e97d 100644
--- a/ocml/src/sinpiF.cl
+++ b/ocml/src/sinpiF.cl
@@ -13,15 +13,11 @@ MATH_MANGLE(sinpi)(float x)
 {
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
+    struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    float r;
-    int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r);
-
-    float cc;
-    float ss = MATH_PRIVATE(sincospired)(r, &cc);
-
-    float s = (i & 1) == 0 ? ss : cc;
-    s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
+    float s = (r.i & 1) == 0 ? sc.s : sc.c;
+    s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax));
 
     if (!FINITE_ONLY_OPT()) {
         s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s;
diff --git a/ocml/src/sinpiH.cl b/ocml/src/sinpiH.cl
index a429ec5b..2848978c 100644
--- a/ocml/src/sinpiH.cl
+++ b/ocml/src/sinpiH.cl
@@ -13,14 +13,11 @@ UGEN(sinpi)
 INLINEATTR half
 MATH_MANGLE(sinpi)(half x)
 {
-    half t;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t);
+    struct redret r =  MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    struct scret sc = MATH_PRIVATE(sincospired)(r.hi);
 
-    half cc;
-    half ss = MATH_PRIVATE(sincospired)(t, &cc);
-
-    short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc);
-    s ^= (i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
+    short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c);
+    s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000);
 
     if (!FINITE_ONLY_OPT()) {
         s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : s;
diff --git a/ocml/src/tanD.cl b/ocml/src/tanD.cl
index 442aa20d..e004e1be 100644
--- a/ocml/src/tanD.cl
+++ b/ocml/src/tanD.cl
@@ -8,13 +8,12 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+CONSTATTR INLINEATTR double
 MATH_MANGLE(tan)(double x)
 {
-    double r, rr;
-    int i = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
 
-    int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r, rr, i & 1));
+    int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r.hi, r.lo, r.i & 1));
     t.hi ^= AS_INT2(x).hi & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanF.cl b/ocml/src/tanF.cl
index 81698c4d..8bd8a7c5 100644
--- a/ocml/src/tanF.cl
+++ b/ocml/src/tanF.cl
@@ -14,16 +14,12 @@ MATH_MANGLE(tan)(float x)
     int ix = AS_INT(x);
     int ax = ix & 0x7fffffff;
 
-#if defined EXTRA_PRECISION
-    float r0, r1;
-    int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax));
+    struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax));
 
-    float t = MATH_PRIVATE(tanred)(r0 + r1, regn & 1);
+#if defined EXTRA_PRECISION
+    float t = MATH_PRIVATE(tanred)(r.hi + r.lo, r.i & 1);
 #else
-    float r;
-    int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax));
-
-    float t = MATH_PRIVATE(tanred)(r, regn & 1);
+    float t = MATH_PRIVATE(tanred)(r.hi, r.i & 1);
 #endif
 
     t = AS_FLOAT(AS_INT(t) ^ (ix ^ ax));
diff --git a/ocml/src/tanH.cl b/ocml/src/tanH.cl
index 201b2c79..87fbceb6 100644
--- a/ocml/src/tanH.cl
+++ b/ocml/src/tanH.cl
@@ -13,10 +13,8 @@ UGEN(tan)
 INLINEATTR half
 MATH_MANGLE(tan)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x));
-
-    short t = AS_SHORT(MATH_PRIVATE(tanred)(r, i & 1));
+    struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
+    short t = AS_SHORT(MATH_PRIVATE(tanred)(r.hi, r.i & (short)1));
     t ^= AS_SHORT(x) & (short)0x8000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiD.cl b/ocml/src/tanpiD.cl
index a55fff6f..d6e5a27a 100644
--- a/ocml/src/tanpiD.cl
+++ b/ocml/src/tanpiD.cl
@@ -11,11 +11,9 @@
 CONSTATTR INLINEATTR double
 MATH_MANGLE(tanpi)(double x)
 {
-    double r;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &r);
-
-    int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r, i & 1));
-    t.hi ^= (((i == 1) | (i == 2)) & (r == 0.0)) ? 0x80000000 : 0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
+    int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r.hi, r.i & 1));
+    t.hi ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0)) ? 0x80000000 : 0;
     t.hi ^= AS_INT2(x).hi & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiF.cl b/ocml/src/tanpiF.cl
index fc188bc3..9c951e55 100644
--- a/ocml/src/tanpiF.cl
+++ b/ocml/src/tanpiF.cl
@@ -11,11 +11,9 @@
 CONSTATTR INLINEATTR float
 MATH_MANGLE(tanpi)(float x)
 {
-    float r;
-    int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x), &r);
-
-    int t = AS_INT(MATH_PRIVATE(tanpired)(r, i & 1));
-    t ^= (((i == 1) | (i == 2)) & (r == 0.0f)) ? (int)0x80000000 : 0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x));
+    int t = AS_INT(MATH_PRIVATE(tanpired)(r.hi, r.i & 1));
+    t ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0f)) ? (int)0x80000000 : 0;
     t ^= AS_INT(x) & (int)0x80000000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/tanpiH.cl b/ocml/src/tanpiH.cl
index a36e97c0..25a6fa90 100644
--- a/ocml/src/tanpiH.cl
+++ b/ocml/src/tanpiH.cl
@@ -13,11 +13,9 @@ CONSTATTR UGEN(tanpi)
 CONSTATTR INLINEATTR half
 MATH_MANGLE(tanpi)(half x)
 {
-    half r;
-    short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &r);
-
-    short t = AS_SHORT(MATH_PRIVATE(tanpired)(r, i & (short)1));
-    t ^= (((i == (short)1) | (i == (short)2)) & (r == 0.0h)) ? (short)0x8000 : (short)0;
+    struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
+    short t = AS_SHORT(MATH_PRIVATE(tanpired)(r.hi, r.i & (short)1));
+    t ^= (((r.i == (short)1) | (r.i == (short)2)) & (r.hi == 0.0h)) ? (short)0x8000 : (short)0;
     t ^= AS_SHORT(x) & (short)0x8000;
 
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/trigpiredD.cl b/ocml/src/trigpiredD.cl
index fddfef06..8411e57d 100644
--- a/ocml/src/trigpiredD.cl
+++ b/ocml/src/trigpiredD.cl
@@ -8,13 +8,16 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigpired)(double x, __private double *r)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigpired)(double x)
 {
     double t = 2.0 * BUILTIN_FRACTION_F64(0.5 * x);
     x = x > 1.0 ? t : x;
     t = BUILTIN_RINT_F64(2.0 * x);
-    *r = MATH_MAD(t, -0.5, x);
-    return (int)t & 0x3;
+
+    struct redret ret;
+    ret.hi = MATH_MAD(t, -0.5, x);
+    ret.i = (int)t & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigpiredD.h b/ocml/src/trigpiredD.h
index 1a464150..3d82c947 100644
--- a/ocml/src/trigpiredD.h
+++ b/ocml/src/trigpiredD.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigpired)(double x, __private double *r);
-extern double MATH_PRIVATE(sincospired)(double x, __private double *cp);
+struct redret {
+    double hi;
+    int i;
+};
+
+struct scret {
+    double c;
+    double s;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(double x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(double x);
 extern CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i);
 
diff --git a/ocml/src/trigpiredF.cl b/ocml/src/trigpiredF.cl
index ab2fa371..2f93312b 100644
--- a/ocml/src/trigpiredF.cl
+++ b/ocml/src/trigpiredF.cl
@@ -8,13 +8,16 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigpired)(float x, __private float *r)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigpired)(float x)
 {
     float t = 2.0f * BUILTIN_FRACTION_F32(0.5f * x);
     x = x > 1.0f ? t : x;
     t = BUILTIN_RINT_F32(2.0f * x);
-    *r = MATH_MAD(t, -0.5f, x);
-    return (int)t & 0x3;
+
+    struct redret ret;
+    ret.hi = MATH_MAD(t, -0.5f, x);
+    ret.i = (int)t & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigpiredF.h b/ocml/src/trigpiredF.h
index 162544ec..f6727b5b 100644
--- a/ocml/src/trigpiredF.h
+++ b/ocml/src/trigpiredF.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigpired)(float x, __private float *r);
-extern float MATH_PRIVATE(sincospired)(float x, __private float *cp);
+struct redret {
+    float hi;
+    int i;
+};
+
+struct scret {
+    float s;
+    float c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(float x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(float x);
 extern CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i);
 
diff --git a/ocml/src/trigpiredH.cl b/ocml/src/trigpiredH.cl
index b68d43e5..7023567d 100644
--- a/ocml/src/trigpiredH.cl
+++ b/ocml/src/trigpiredH.cl
@@ -8,13 +8,16 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-INLINEATTR short
-MATH_PRIVATE(trigpired)(half x, __private half *r)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigpired)(half x)
 {
     half t = 2.0h * BUILTIN_FRACTION_F16(0.5h * x);
     x = x > 1.0h ? t : x;
     t = BUILTIN_RINT_F16(2.0h * x);
-    *r = MATH_MAD(t, -0.5h, x);
-    return (short)t & (short)0x3;
+
+    struct redret ret;
+    ret.hi = MATH_MAD(t, -0.5h, x);
+    ret.i = (short)t & (short)0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigpiredH.h b/ocml/src/trigpiredH.h
index 1294ebea..b2d240f5 100644
--- a/ocml/src/trigpiredH.h
+++ b/ocml/src/trigpiredH.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern short MATH_PRIVATE(trigpired)(half x, __private half *r);
-extern half MATH_PRIVATE(sincospired)(half x, __private half *cp);
+struct redret {
+    half hi;
+    short i;
+};
+
+struct scret {
+    half s;
+    half c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(half x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(half x);
 extern CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i);
 
diff --git a/ocml/src/trigredD.cl b/ocml/src/trigredD.cl
index 60fc8b3f..76c78d4a 100644
--- a/ocml/src/trigredD.cl
+++ b/ocml/src/trigredD.cl
@@ -8,12 +8,12 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigred)(double x)
 {
     if (x < 0x1.0p+21)
-        return MATH_PRIVATE(trigredsmall)(r, rr, x);
+        return MATH_PRIVATE(trigredsmall)(x);
     else
-        return MATH_PRIVATE(trigredlarge)(r, rr, x);
+        return MATH_PRIVATE(trigredlarge)(x);
 }
 
diff --git a/ocml/src/trigredD.h b/ocml/src/trigredD.h
index 6dd96f67..26a9599d 100644
--- a/ocml/src/trigredD.h
+++ b/ocml/src/trigredD.h
@@ -5,12 +5,23 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern int MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x);
-extern int MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x);
-extern int MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x);
+struct redret {
+    double lo;
+    double hi;
+    int i;
+};
 
-extern double MATH_PRIVATE(sincosred)(double x, __private double *cp);
-extern double MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp);
+struct scret {
+    double s;
+    double c;
+};
+
+extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(double x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(double x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigred)(double x);
+
+extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(double x);
+extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(double x, double y);
 
 extern CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel);
 
diff --git a/ocml/src/trigredF.cl b/ocml/src/trigredF.cl
index c73a0fb7..240eee20 100644
--- a/ocml/src/trigredF.cl
+++ b/ocml/src/trigredF.cl
@@ -8,24 +8,12 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigred)(__private float *r, float x)
-#endif
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigred)(float x)
 {
     if (x < SMALL_BOUND)
-#if defined EXTRA_PRECISION
-        return MATH_PRIVATE(trigredsmall)(r, rr, x);
-#else
-        return MATH_PRIVATE(trigredsmall)(r, x);
-#endif
+        return MATH_PRIVATE(trigredsmall)(x);
     else
-#if defined EXTRA_PRECISION
-        return MATH_PRIVATE(trigredlarge)(r, rr, x);
-#else
-        return MATH_PRIVATE(trigredlarge)(r, x);
-#endif
+        return MATH_PRIVATE(trigredlarge)(x);
 }
 
diff --git a/ocml/src/trigredF.h b/ocml/src/trigredF.h
index d7a81a1b..e0e50c93 100644
--- a/ocml/src/trigredF.h
+++ b/ocml/src/trigredF.h
@@ -8,18 +8,33 @@
 #define SMALL_BOUND 0x1.0p+17f
 
 #if defined EXTRA_PRECISION
-extern int MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x);
-extern int MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x);
-extern int MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x);
+struct redret {
+    float hi;
+    float lo;
+    int i;
+};
 #else
-extern int MATH_PRIVATE(trigredsmall)(__private float *r, float x);
-extern int MATH_PRIVATE(trigredlarge)(__private float *r, float x);
-extern int MATH_PRIVATE(trigred)(__private float *r, float x);
+struct redret {
+    float hi;
+    int i;
+};
 #endif
 
-extern float MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp);
+struct scret {
+    float s;
+    float c;
+};
 
-extern float MATH_PRIVATE(sincosred)(float x, __private float *cp);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(float x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x);
+extern CONSTATTR struct redret MATH_PRIVATE(trigred)(float x);
+
+
+#if defined EXTRA_PRECISION
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred2)(float x, float y);
+#else
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred)(float x);
+#endif
 
 extern CONSTATTR float MATH_PRIVATE(tanred)(float x, int regn);
 
diff --git a/ocml/src/trigredH.cl b/ocml/src/trigredH.cl
index 5fcf39b1..b69d38f9 100644
--- a/ocml/src/trigredH.cl
+++ b/ocml/src/trigredH.cl
@@ -6,9 +6,10 @@
  *===------------------------------------------------------------------------*/
 
 #include "mathH.h"
+#include "trigredH.h"
 
-__attribute__((always_inline)) short
-MATH_PRIVATE(trigred)(__private half *r, half hx)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigred)(half hx)
 {
     const float twobypi = 0x1.45f306p-1f;
     const float pb2_a = 0x1.92p+0f;
@@ -18,8 +19,9 @@ MATH_PRIVATE(trigred)(__private half *r, half hx)
     float x = (float)hx;
     float fn = BUILTIN_RINT_F32(x * twobypi);
 
-    *r = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x)));
-
-    return (int)fn & 0x3;
+    struct redret ret;
+    ret.hi = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x)));
+    ret.i =  (int)fn & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigredH.h b/ocml/src/trigredH.h
index 97764561..2f02b42a 100644
--- a/ocml/src/trigredH.h
+++ b/ocml/src/trigredH.h
@@ -5,7 +5,17 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
-extern short MATH_PRIVATE(trigred)(__private half *r, half x);
-extern half MATH_PRIVATE(sincosred)(half x, __private half *cp);
-extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short regn);
+struct redret {
+    half hi;
+    short i;
+};
+
+struct scret {
+    half s;
+    half c;
+};
+
+extern CONSTATTR struct redret  MATH_PRIVATE(trigred)(half x);
+extern CONSTATTR struct scret  MATH_PRIVATE(sincosred)(half x);
+extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short i);
 
diff --git a/ocml/src/trigredlargeD.cl b/ocml/src/trigredlargeD.cl
index caea8352..02804e0b 100644
--- a/ocml/src/trigredlargeD.cl
+++ b/ocml/src/trigredlargeD.cl
@@ -63,8 +63,8 @@
         C3 += C2; \
     } while (0)
 
-int
-MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x)
+CONSTATTR struct redret
+MATH_PRIVATE(trigredlarge)(double x)
 {
     // Scale x by relevant part of 2/pi
     double p2 = BUILTIN_TRIG_PREOP_F64(x, 0);
@@ -106,9 +106,11 @@ MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x)
     double rt = BUILTIN_FMA_F64(f1, pio2h, BUILTIN_FMA_F64(f2, pio2t, BUILTIN_FMA_F64(f2, pio2h, -rh)));
 
     FSUM2(rh, rt, rh, rt);
-    *r = rh;
-    *rr = rt;
 
-    return i & 0x3;
+    struct redret ret;
+    ret.hi = rh;
+    ret.lo = rt;
+    ret.i = i & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigredlargeF.cl b/ocml/src/trigredlargeF.cl
index dcb2057c..94ea8ae5 100644
--- a/ocml/src/trigredlargeF.cl
+++ b/ocml/src/trigredlargeF.cl
@@ -17,12 +17,8 @@
     HI = BUILTIN_MULHI_U32(A, B); \
     HI += LO < C
 
-int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigredlarge)(__private float *r, float x)
-#endif
+CONSTATTR struct redret
+MATH_PRIVATE(trigredlarge)(float x)
 {
     int xe = (int)(AS_UINT(x) >> 23) - 127;
     uint xm = 0x00800000U | (AS_UINT(x) & 0x7fffffU);
@@ -152,16 +148,18 @@ MATH_PRIVATE(trigredlarge)(__private float *r, float x)
              MATH_MAD(q0, pio2h, q1*pio2t);
     }
 
+    struct redret ret;
 #if defined EXTRA_PRECISION
     float t = rh + rt;
     rt = rt - (t - rh);
 
-    *r = t;
-    *rr = rt;
+    ret.hi = t;
+    ret.lo = rt;
 #else
-    *r = rh + rt;
+    ret.hi  = rh + rt;
 #endif
 
-    return ((i >> 1) + (i & 1)) & 0x3;
+    ret.i = ((i >> 1) + (i & 1)) & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigredsmallD.cl b/ocml/src/trigredsmallD.cl
index 59f74c87..fafe0fd8 100644
--- a/ocml/src/trigredsmallD.cl
+++ b/ocml/src/trigredsmallD.cl
@@ -8,8 +8,8 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR int
-MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x)
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigredsmall)(double x)
 {
     const double twobypi = 0x1.45f306dc9c883p-1;
     const double piby2_h = 0x1.921fb54442d18p+0;
@@ -27,9 +27,10 @@ MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x)
     double rh = yh + yt;
     double rt = yt - (rh - yh);
 
-    *r = rh;
-    *rr = rt;
-
-    return (int)dn & 0x3;
+    struct redret ret;
+    ret.hi = rh;
+    ret.lo = rt;
+    ret.i = (int)dn & 0x3;
+    return ret;
 }
 
diff --git a/ocml/src/trigredsmallF.cl b/ocml/src/trigredsmallF.cl
index eaf2bc08..c56841b3 100644
--- a/ocml/src/trigredsmallF.cl
+++ b/ocml/src/trigredsmallF.cl
@@ -22,12 +22,8 @@
         D = __t + (((C - __t) - __ph) - __pt); \
     } while(0)
 
-static inline int
-#if defined EXTRA_PRECISION
-mad_reduce(__private float *hi, __private float *lo, float x)
-#else
-mad_reduce(__private float *hi, float x)
-#endif
+static inline struct redret
+mad_reduce(float x)
 {
 #if defined EXTRA_PRECISION
 #error Not implemented
@@ -54,17 +50,16 @@ mad_reduce(__private float *hi, float x)
     float r;
     FNMA(fn, fnh, fnl, piby2_h, piby2_hh, piby2_hl, x, r);
     FNMA(fn, fnh, fnl, piby2_m, piby2_mh, piby2_ml, r, r);
-    *hi = MATH_MAD(-piby2_l, fn, r);
-    return (int)fn & 0x3;
+
+    struct redret ret;
+    ret.hi = MATH_MAD(-piby2_l, fn, r);
+    ret.i = (int)fn & 0x3;
+    return ret;
 #endif
 }
 
-static inline int
-#if defined EXTRA_PRECISION
-fma_reduce(__private float *hi, __private float *lo, float x)
-#else
-fma_reduce(__private float *hi, float x)
-#endif
+static inline struct redret
+fma_reduce(float x)
 {
     const float twobypi = 0x1.45f306p-1f;
     const float piby2_h = 0x1.921fb4p+0f;
@@ -72,6 +67,9 @@ fma_reduce(__private float *hi, float x)
     const float piby2_l = 0x1.846988p-48f;
 
     float fn = BUILTIN_RINT_F32(x * twobypi);
+
+    struct redret ret;
+
 #if defined EXTRA_PRECISION
     float xt = BUILTIN_FMA_F32(fn, -piby2_h, x);
     float yh = BUILTIN_FMA_F32(fn, -piby2_m, xt);
@@ -82,34 +80,24 @@ fma_reduce(__private float *hi, float x)
     float yt = BUILTIN_FMA_F32(fn, -piby2_l, ((th - yh) + tt) - pt);
     float rh = yh + yt;
     float rt = yt - (rh - yh);
-    *hi = rh;
-    *lo = rt;
+    ret.hi = rh;
+    ret.lo = rt;
 #else
     float r = BUILTIN_FMA_F32(fn, -piby2_l, BUILTIN_FMA_F32(fn, -piby2_m, BUILTIN_FMA_F32(fn, -piby2_h, x)));
-    *hi = r;
+    ret.hi = r;
 #endif
-    return (int)fn & 0x3;
+
+    ret.i =(int)fn & 0x3;
+    return ret;
 }
 
-INLINEATTR int
-#if defined EXTRA_PRECISION
-MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x)
-#else
-MATH_PRIVATE(trigredsmall)(__private float *r, float x)
-#endif
+CONSTATTR INLINEATTR struct redret
+MATH_PRIVATE(trigredsmall)(float x)
 {
     if (HAVE_FAST_FMA32()) {
-#if defined EXTRA_PRECISION
-	return fma_reduce(r, rr, x);
-#else
-	return fma_reduce(r, x);
-#endif
+	return fma_reduce(x);
     } else {
-#if defined EXTRA_PRECISION
-        return mad_reduce(r, rr, x);
-#else
-	return mad_reduce(r, x);
-#endif
+	return mad_reduce(x);
     }
 }
 

From d4b5f4d1d25598cd078d5030fdf2c32c08f1e746 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Fri, 28 Jul 2017 09:54:32 -0700
Subject: [PATCH 08/25] Drop always_inline attribute

Change-Id: I624de6a34980e1dd905a4865e3eb3db80f62770f
---
 ocml/src/acoshD.cl         | 2 +-
 ocml/src/acoshF.cl         | 2 +-
 ocml/src/acoshH.cl         | 2 +-
 ocml/src/addD.cl           | 2 +-
 ocml/src/addF.cl           | 2 +-
 ocml/src/addH.cl           | 2 +-
 ocml/src/asinhD.cl         | 2 +-
 ocml/src/asinhF.cl         | 2 +-
 ocml/src/asinhH.cl         | 2 +-
 ocml/src/atanF.cl          | 2 +-
 ocml/src/atanH.cl          | 2 +-
 ocml/src/atanhD.cl         | 2 +-
 ocml/src/atanhF.cl         | 2 +-
 ocml/src/atanhH.cl         | 2 +-
 ocml/src/atanpiF.cl        | 2 +-
 ocml/src/atanpiH.cl        | 2 +-
 ocml/src/atanpiredF.cl     | 2 +-
 ocml/src/atanpiredH.cl     | 2 +-
 ocml/src/atanredF.cl       | 2 +-
 ocml/src/atanredH.cl       | 2 +-
 ocml/src/ba0D.cl           | 2 +-
 ocml/src/ba0F.cl           | 2 +-
 ocml/src/ba1D.cl           | 2 +-
 ocml/src/ba1F.cl           | 2 +-
 ocml/src/bp0D.cl           | 2 +-
 ocml/src/bp0F.cl           | 2 +-
 ocml/src/bp1D.cl           | 2 +-
 ocml/src/bp1F.cl           | 2 +-
 ocml/src/cbrtD.cl          | 2 +-
 ocml/src/cbrtF.cl          | 2 +-
 ocml/src/ceilD.cl          | 2 +-
 ocml/src/ceilF.cl          | 2 +-
 ocml/src/ceilH.cl          | 4 ++--
 ocml/src/copysignD.cl      | 2 +-
 ocml/src/copysignF.cl      | 2 +-
 ocml/src/copysignH.cl      | 4 ++--
 ocml/src/cosD.cl           | 2 +-
 ocml/src/cosF.cl           | 2 +-
 ocml/src/cosH.cl           | 2 +-
 ocml/src/cosbD.cl          | 2 +-
 ocml/src/cosbF.cl          | 2 +-
 ocml/src/coshD.cl          | 2 +-
 ocml/src/coshF.cl          | 2 +-
 ocml/src/coshH.cl          | 2 +-
 ocml/src/cospiD.cl         | 2 +-
 ocml/src/cospiF.cl         | 2 +-
 ocml/src/cospiH.cl         | 2 +-
 ocml/src/divD.cl           | 2 +-
 ocml/src/divF.cl           | 2 +-
 ocml/src/divH.cl           | 2 +-
 ocml/src/epexpepD.cl       | 2 +-
 ocml/src/epexpepF.cl       | 2 +-
 ocml/src/eplnD.cl          | 2 +-
 ocml/src/eplnF.cl          | 2 +-
 ocml/src/erfH.cl           | 2 +-
 ocml/src/erfcH.cl          | 2 +-
 ocml/src/erfcinvH.cl       | 2 +-
 ocml/src/erfcxH.cl         | 2 +-
 ocml/src/erfinvH.cl        | 2 +-
 ocml/src/exp10H.cl         | 2 +-
 ocml/src/exp2H.cl          | 2 +-
 ocml/src/expF_base.h       | 2 +-
 ocml/src/expH.cl           | 2 +-
 ocml/src/expepD.cl         | 2 +-
 ocml/src/expepF.cl         | 2 +-
 ocml/src/expm1F.cl         | 2 +-
 ocml/src/expm1H.cl         | 2 +-
 ocml/src/fabsD.cl          | 2 +-
 ocml/src/fabsF.cl          | 2 +-
 ocml/src/fabsH.cl          | 4 ++--
 ocml/src/fdimD.cl          | 2 +-
 ocml/src/fdimF.cl          | 2 +-
 ocml/src/fdimH.cl          | 2 +-
 ocml/src/floorD.cl         | 2 +-
 ocml/src/floorF.cl         | 2 +-
 ocml/src/floorH.cl         | 4 ++--
 ocml/src/fmaD.cl           | 2 +-
 ocml/src/fmaF.cl           | 2 +-
 ocml/src/fmaH.cl           | 2 +-
 ocml/src/fmaxD.cl          | 2 +-
 ocml/src/fmaxF.cl          | 2 +-
 ocml/src/fmaxH.cl          | 4 ++--
 ocml/src/fminD.cl          | 2 +-
 ocml/src/fminF.cl          | 2 +-
 ocml/src/fminH.cl          | 4 ++--
 ocml/src/fpclassifyD.cl    | 2 +-
 ocml/src/fpclassifyF.cl    | 2 +-
 ocml/src/fpclassifyH.cl    | 2 +-
 ocml/src/fractD.cl         | 2 +-
 ocml/src/fractF.cl         | 2 +-
 ocml/src/fractH.cl         | 4 ++--
 ocml/src/frexpD.cl         | 2 +-
 ocml/src/frexpF.cl         | 2 +-
 ocml/src/frexpH.cl         | 4 ++--
 ocml/src/hypotD.cl         | 2 +-
 ocml/src/hypotF.cl         | 2 +-
 ocml/src/hypotH.cl         | 2 +-
 ocml/src/i0H.cl            | 2 +-
 ocml/src/i1H.cl            | 2 +-
 ocml/src/ilogbD.cl         | 2 +-
 ocml/src/ilogbF.cl         | 2 +-
 ocml/src/ilogbH.cl         | 4 ++--
 ocml/src/isfiniteD.cl      | 2 +-
 ocml/src/isfiniteF.cl      | 2 +-
 ocml/src/isfiniteH.cl      | 4 ++--
 ocml/src/isinfD.cl         | 2 +-
 ocml/src/isinfF.cl         | 2 +-
 ocml/src/isinfH.cl         | 4 ++--
 ocml/src/isnanD.cl         | 2 +-
 ocml/src/isnanF.cl         | 2 +-
 ocml/src/isnanH.cl         | 4 ++--
 ocml/src/isnormalD.cl      | 2 +-
 ocml/src/isnormalF.cl      | 2 +-
 ocml/src/isnormalH.cl      | 4 ++--
 ocml/src/j0H.cl            | 2 +-
 ocml/src/j1H.cl            | 2 +-
 ocml/src/ldexpD.cl         | 2 +-
 ocml/src/ldexpF.cl         | 2 +-
 ocml/src/ldexpH.cl         | 4 ++--
 ocml/src/len3D.cl          | 2 +-
 ocml/src/len3F.cl          | 2 +-
 ocml/src/len3H.cl          | 2 +-
 ocml/src/len4D.cl          | 2 +-
 ocml/src/len4F.cl          | 2 +-
 ocml/src/len4H.cl          | 2 +-
 ocml/src/lgammaD.cl        | 2 +-
 ocml/src/lgammaF.cl        | 2 +-
 ocml/src/lgammaH.cl        | 2 +-
 ocml/src/lgamma_rH.cl      | 4 ++--
 ocml/src/lnepD.cl          | 2 +-
 ocml/src/lnepF.cl          | 2 +-
 ocml/src/log10H.cl         | 2 +-
 ocml/src/log1pD.cl         | 2 +-
 ocml/src/log1pF.cl         | 2 +-
 ocml/src/log1pH.cl         | 2 +-
 ocml/src/log2H.cl          | 2 +-
 ocml/src/logF_base.h       | 2 +-
 ocml/src/logH.cl           | 2 +-
 ocml/src/logbD.cl          | 2 +-
 ocml/src/logbF.cl          | 2 +-
 ocml/src/logbH.cl          | 2 +-
 ocml/src/madD.cl           | 2 +-
 ocml/src/madF.cl           | 2 +-
 ocml/src/madH.cl           | 4 ++--
 ocml/src/maxD.cl           | 2 +-
 ocml/src/maxF.cl           | 2 +-
 ocml/src/maxH.cl           | 4 ++--
 ocml/src/maxmagD.cl        | 2 +-
 ocml/src/maxmagF.cl        | 2 +-
 ocml/src/maxmagH.cl        | 2 +-
 ocml/src/minD.cl           | 2 +-
 ocml/src/minF.cl           | 2 +-
 ocml/src/minH.cl           | 4 ++--
 ocml/src/minmagD.cl        | 2 +-
 ocml/src/minmagF.cl        | 2 +-
 ocml/src/minmagH.cl        | 2 +-
 ocml/src/modfD.cl          | 2 +-
 ocml/src/modfF.cl          | 2 +-
 ocml/src/modfH.cl          | 4 ++--
 ocml/src/mulD.cl           | 2 +-
 ocml/src/mulF.cl           | 2 +-
 ocml/src/mulH.cl           | 2 +-
 ocml/src/nanD.cl           | 2 +-
 ocml/src/nanF.cl           | 2 +-
 ocml/src/nanH.cl           | 4 ++--
 ocml/src/ncdfH.cl          | 2 +-
 ocml/src/ncdfinvD.cl       | 2 +-
 ocml/src/ncdfinvF.cl       | 2 +-
 ocml/src/ncdfinvH.cl       | 2 +-
 ocml/src/nearbyintD.cl     | 2 +-
 ocml/src/nearbyintF.cl     | 2 +-
 ocml/src/nearbyintH.cl     | 4 ++--
 ocml/src/nextafterD.cl     | 2 +-
 ocml/src/nextafterF.cl     | 2 +-
 ocml/src/nextafterH.cl     | 2 +-
 ocml/src/pownH.cl          | 2 +-
 ocml/src/rcbrtF.cl         | 2 +-
 ocml/src/remainderF_base.h | 2 +-
 ocml/src/remquoH.cl        | 2 +-
 ocml/src/rhypotD.cl        | 2 +-
 ocml/src/rhypotF.cl        | 2 +-
 ocml/src/rhypotH.cl        | 2 +-
 ocml/src/rintD.cl          | 2 +-
 ocml/src/rintF.cl          | 2 +-
 ocml/src/rintH.cl          | 4 ++--
 ocml/src/rlen3D.cl         | 2 +-
 ocml/src/rlen3F.cl         | 2 +-
 ocml/src/rlen3H.cl         | 2 +-
 ocml/src/rlen4D.cl         | 2 +-
 ocml/src/rlen4F.cl         | 2 +-
 ocml/src/rlen4H.cl         | 2 +-
 ocml/src/rootnH.cl         | 2 +-
 ocml/src/roundD.cl         | 2 +-
 ocml/src/roundF.cl         | 2 +-
 ocml/src/roundH.cl         | 4 ++--
 ocml/src/rsqrtD.cl         | 2 +-
 ocml/src/rsqrtF.cl         | 2 +-
 ocml/src/rsqrtH.cl         | 2 +-
 ocml/src/scalbD.cl         | 2 +-
 ocml/src/scalbF.cl         | 2 +-
 ocml/src/scalbH.cl         | 2 +-
 ocml/src/scalbnD.cl        | 2 +-
 ocml/src/scalbnF.cl        | 2 +-
 ocml/src/scalbnH.cl        | 4 ++--
 ocml/src/signbitD.cl       | 2 +-
 ocml/src/signbitF.cl       | 2 +-
 ocml/src/signbitH.cl       | 4 ++--
 ocml/src/sinD.cl           | 2 +-
 ocml/src/sinF.cl           | 2 +-
 ocml/src/sinH.cl           | 2 +-
 ocml/src/sinbD.cl          | 2 +-
 ocml/src/sinbF.cl          | 2 +-
 ocml/src/sincosD.cl        | 2 +-
 ocml/src/sincosF.cl        | 2 +-
 ocml/src/sincosH.cl        | 4 ++--
 ocml/src/sincospiD.cl      | 2 +-
 ocml/src/sincospiF.cl      | 2 +-
 ocml/src/sincospiH.cl      | 4 ++--
 ocml/src/sincospiredD.cl   | 2 +-
 ocml/src/sincospiredF.cl   | 2 +-
 ocml/src/sincospiredH.cl   | 2 +-
 ocml/src/sincosred2D.cl    | 2 +-
 ocml/src/sincosred2F.cl    | 2 +-
 ocml/src/sincosredD.cl     | 2 +-
 ocml/src/sincosredF.cl     | 2 +-
 ocml/src/sincosredH.cl     | 2 +-
 ocml/src/sinhD.cl          | 2 +-
 ocml/src/sinhF.cl          | 2 +-
 ocml/src/sinhH.cl          | 2 +-
 ocml/src/sinpiD.cl         | 2 +-
 ocml/src/sinpiF.cl         | 2 +-
 ocml/src/sinpiH.cl         | 2 +-
 ocml/src/sqrtD.cl          | 4 ++--
 ocml/src/sqrtF.cl          | 4 ++--
 ocml/src/sqrtH.cl          | 4 ++--
 ocml/src/subD.cl           | 2 +-
 ocml/src/subF.cl           | 2 +-
 ocml/src/subH.cl           | 2 +-
 ocml/src/tanD.cl           | 2 +-
 ocml/src/tanF.cl           | 2 +-
 ocml/src/tanH.cl           | 2 +-
 ocml/src/tanhD.cl          | 2 +-
 ocml/src/tanpiD.cl         | 2 +-
 ocml/src/tanpiF.cl         | 2 +-
 ocml/src/tanpiH.cl         | 2 +-
 ocml/src/tanpiredD.cl      | 2 +-
 ocml/src/tanpiredF.cl      | 2 +-
 ocml/src/tanpiredH.cl      | 2 +-
 ocml/src/tanred2D.cl       | 2 +-
 ocml/src/tanredF.cl        | 2 +-
 ocml/src/tanredH.cl        | 2 +-
 ocml/src/tgammaH.cl        | 2 +-
 ocml/src/trigpiredD.cl     | 2 +-
 ocml/src/trigpiredF.cl     | 2 +-
 ocml/src/trigpiredH.cl     | 2 +-
 ocml/src/trigredD.cl       | 2 +-
 ocml/src/trigredF.cl       | 2 +-
 ocml/src/trigredH.cl       | 2 +-
 ocml/src/trigredsmallD.cl  | 2 +-
 ocml/src/trigredsmallF.cl  | 2 +-
 ocml/src/truncD.cl         | 2 +-
 ocml/src/truncF.cl         | 2 +-
 ocml/src/truncH.cl         | 4 ++--
 ocml/src/y0H.cl            | 2 +-
 ocml/src/y1H.cl            | 2 +-
 265 files changed, 296 insertions(+), 296 deletions(-)

diff --git a/ocml/src/acoshD.cl b/ocml/src/acoshD.cl
index 619f35a5..064897ff 100644
--- a/ocml/src/acoshD.cl
+++ b/ocml/src/acoshD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(acosh)(double x)
 {
     bool b = x >= 0x1.0p+512;
diff --git a/ocml/src/acoshF.cl b/ocml/src/acoshF.cl
index 179b413d..962e0e39 100644
--- a/ocml/src/acoshF.cl
+++ b/ocml/src/acoshF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(acosh)(float x)
 {
     bool b = x >= 0x1.0p+64f;
diff --git a/ocml/src/acoshH.cl b/ocml/src/acoshH.cl
index 074a7166..a8dc827e 100644
--- a/ocml/src/acoshH.cl
+++ b/ocml/src/acoshH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(acosh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(acosh)(half hx)
 {
     half ret;
diff --git a/ocml/src/addD.cl b/ocml/src/addD.cl
index 9e85367d..1cb4d527 100644
--- a/ocml/src/addD.cl
+++ b/ocml/src/addD.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double x, double y) \
 { \
     return BUILTIN_FULL_BINARY(fadd, false, ROUND, x, y); \
diff --git a/ocml/src/addF.cl b/ocml/src/addF.cl
index 8e676725..d01f0c28 100644
--- a/ocml/src/addF.cl
+++ b/ocml/src/addF.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float x, float y) \
 { \
     float ret; \
diff --git a/ocml/src/addH.cl b/ocml/src/addH.cl
index b540fdfe..8df6e2fa 100644
--- a/ocml/src/addH.cl
+++ b/ocml/src/addH.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half x, half y) \
 { \
     return BUILTIN_FULL_BINARY(faddh, false, ROUND, x, y); \
diff --git a/ocml/src/asinhD.cl b/ocml/src/asinhD.cl
index 75c3408a..09957fcc 100644
--- a/ocml/src/asinhD.cl
+++ b/ocml/src/asinhD.cl
@@ -13,7 +13,7 @@
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(asinh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/asinhF.cl b/ocml/src/asinhF.cl
index 407d9545..f5eeaf04 100644
--- a/ocml/src/asinhF.cl
+++ b/ocml/src/asinhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(asinh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/asinhH.cl b/ocml/src/asinhH.cl
index 027aed99..ae994c76 100644
--- a/ocml/src/asinhH.cl
+++ b/ocml/src/asinhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(asinh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(asinh)(half hx)
 {
     half ret;
diff --git a/ocml/src/atanF.cl b/ocml/src/atanF.cl
index ad3cdb03..08a7b1b1 100644
--- a/ocml/src/atanF.cl
+++ b/ocml/src/atanF.cl
@@ -9,7 +9,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(atanred)(float);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atan)(float x)
 {
     float v = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanH.cl b/ocml/src/atanH.cl
index 9fe95d8c..42ba6898 100644
--- a/ocml/src/atanH.cl
+++ b/ocml/src/atanH.cl
@@ -11,7 +11,7 @@ extern CONSTATTR half MATH_PRIVATE(atanred)(half);
 
 CONSTATTR UGEN(atan)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(atan)(half x)
 {
     half v = BUILTIN_ABS_F16(x);
diff --git a/ocml/src/atanhD.cl b/ocml/src/atanhD.cl
index c044f71a..62d9ddb1 100644
--- a/ocml/src/atanhD.cl
+++ b/ocml/src/atanhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(atanh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/atanhF.cl b/ocml/src/atanhF.cl
index 82a5c3ab..817ed41a 100644
--- a/ocml/src/atanhF.cl
+++ b/ocml/src/atanhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atanh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanhH.cl b/ocml/src/atanhH.cl
index 46c30ff8..c86722cd 100644
--- a/ocml/src/atanhH.cl
+++ b/ocml/src/atanhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(atanh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(atanh)(half hx)
 {
     half ret;
diff --git a/ocml/src/atanpiF.cl b/ocml/src/atanpiF.cl
index f9af9b72..1c46c155 100644
--- a/ocml/src/atanpiF.cl
+++ b/ocml/src/atanpiF.cl
@@ -9,7 +9,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(atanpired)(float);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(atanpi)(float x)
 {
     float v = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/atanpiH.cl b/ocml/src/atanpiH.cl
index d85fe700..44cb201c 100644
--- a/ocml/src/atanpiH.cl
+++ b/ocml/src/atanpiH.cl
@@ -12,7 +12,7 @@ extern CONSTATTR half MATH_PRIVATE(atanpired)(half);
 
 CONSTATTR UGEN(atanpi)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(atanpi)(half x)
 {
     half v = BUILTIN_ABS_F16(x);
diff --git a/ocml/src/atanpiredF.cl b/ocml/src/atanpiredF.cl
index d982869a..63af0f76 100644
--- a/ocml/src/atanpiredF.cl
+++ b/ocml/src/atanpiredF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(atanpired)(float v)
 {
     float t = v * v;
diff --git a/ocml/src/atanpiredH.cl b/ocml/src/atanpiredH.cl
index 121d304f..3eabd599 100644
--- a/ocml/src/atanpiredH.cl
+++ b/ocml/src/atanpiredH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(atanpired)(half v)
 {
     half t = v * v;
diff --git a/ocml/src/atanredF.cl b/ocml/src/atanredF.cl
index 10b5c5c1..a0895928 100644
--- a/ocml/src/atanredF.cl
+++ b/ocml/src/atanredF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(atanred)(float v)
 {
     float t = v * v;
diff --git a/ocml/src/atanredH.cl b/ocml/src/atanredH.cl
index dd2d1ba9..d721edb3 100644
--- a/ocml/src/atanredH.cl
+++ b/ocml/src/atanredH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(atanred)(half v)
 {
     half t = v * v;
diff --git a/ocml/src/ba0D.cl b/ocml/src/ba0D.cl
index c21d308b..e87226bc 100644
--- a/ocml/src/ba0D.cl
+++ b/ocml/src/ba0D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(ba0)(double t)
 {
     return
diff --git a/ocml/src/ba0F.cl b/ocml/src/ba0F.cl
index fc18577f..309ad267 100644
--- a/ocml/src/ba0F.cl
+++ b/ocml/src/ba0F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(ba0)(float t)
 {
     return
diff --git a/ocml/src/ba1D.cl b/ocml/src/ba1D.cl
index c735f595..d4453e00 100644
--- a/ocml/src/ba1D.cl
+++ b/ocml/src/ba1D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(ba1)(double t)
 {
     return
diff --git a/ocml/src/ba1F.cl b/ocml/src/ba1F.cl
index 2b974a39..5dd1ea96 100644
--- a/ocml/src/ba1F.cl
+++ b/ocml/src/ba1F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(ba1)(float t)
 {
     return
diff --git a/ocml/src/bp0D.cl b/ocml/src/bp0D.cl
index 0e08cb4b..9014ae9e 100644
--- a/ocml/src/bp0D.cl
+++ b/ocml/src/bp0D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(bp0)(double t)
 {
     return
diff --git a/ocml/src/bp0F.cl b/ocml/src/bp0F.cl
index da6b9b4f..c0c27a1f 100644
--- a/ocml/src/bp0F.cl
+++ b/ocml/src/bp0F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(bp0)(float t)
 {
     return
diff --git a/ocml/src/bp1D.cl b/ocml/src/bp1D.cl
index 55ace155..c9239c95 100644
--- a/ocml/src/bp1D.cl
+++ b/ocml/src/bp1D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(bp1)(double t)
 {
     return
diff --git a/ocml/src/bp1F.cl b/ocml/src/bp1F.cl
index 3267c7f4..18569cb6 100644
--- a/ocml/src/bp1F.cl
+++ b/ocml/src/bp1F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(bp1)(float t)
 {
     return
diff --git a/ocml/src/cbrtD.cl b/ocml/src/cbrtD.cl
index 67cd2628..fd83a2fb 100644
--- a/ocml/src/cbrtD.cl
+++ b/ocml/src/cbrtD.cl
@@ -1,7 +1,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(cbrt)(double x)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/cbrtF.cl b/ocml/src/cbrtF.cl
index cab2df26..5e436900 100644
--- a/ocml/src/cbrtF.cl
+++ b/ocml/src/cbrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cbrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/ceilD.cl b/ocml/src/ceilD.cl
index dc2eb8dc..654226cc 100644
--- a/ocml/src/ceilD.cl
+++ b/ocml/src/ceilD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(ceil)(double x)
 {
     return BUILTIN_CEIL_F64(x);
diff --git a/ocml/src/ceilF.cl b/ocml/src/ceilF.cl
index 2a563cdf..8b1600c8 100644
--- a/ocml/src/ceilF.cl
+++ b/ocml/src/ceilF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(ceil)(float x)
 {
     return BUILTIN_CEIL_F32(x);
diff --git a/ocml/src/ceilH.cl b/ocml/src/ceilH.cl
index 2db7385c..5b9804cb 100644
--- a/ocml/src/ceilH.cl
+++ b/ocml/src/ceilH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(ceil)(half2 x)
 {
     return BUILTIN_CEIL_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(ceil)(half x)
 {
     return BUILTIN_CEIL_F16(x);
diff --git a/ocml/src/copysignD.cl b/ocml/src/copysignD.cl
index 5c2eb066..b239b793 100644
--- a/ocml/src/copysignD.cl
+++ b/ocml/src/copysignD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(copysign)(double x, double y)
 {
     return BUILTIN_COPYSIGN_F64(x, y);
diff --git a/ocml/src/copysignF.cl b/ocml/src/copysignF.cl
index 87bc68d7..f2fac4ab 100644
--- a/ocml/src/copysignF.cl
+++ b/ocml/src/copysignF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(copysign)(float x, float y)
 {
     return BUILTIN_COPYSIGN_F32(x, y);
diff --git a/ocml/src/copysignH.cl b/ocml/src/copysignH.cl
index f89c061c..7897b1e3 100644
--- a/ocml/src/copysignH.cl
+++ b/ocml/src/copysignH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(copysign)(half2 x, half2 y)
 {
     return BUILTIN_COPYSIGN_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(copysign)(half x, half y)
 {
     return BUILTIN_COPYSIGN_F16(x, y);
diff --git a/ocml/src/cosD.cl b/ocml/src/cosD.cl
index 12a43884..b76adff9 100644
--- a/ocml/src/cosD.cl
+++ b/ocml/src/cosD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(cos)(double x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/cosF.cl b/ocml/src/cosF.cl
index a0768dd2..60c57195 100644
--- a/ocml/src/cosF.cl
+++ b/ocml/src/cosF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(cos)(float x)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/cosH.cl b/ocml/src/cosH.cl
index 3aa1295b..e4edc273 100644
--- a/ocml/src/cosH.cl
+++ b/ocml/src/cosH.cl
@@ -10,7 +10,7 @@
 
 UGEN(cos)
 
-INLINEATTR half
+half
 MATH_MANGLE(cos)(half x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/cosbD.cl b/ocml/src/cosbD.cl
index 49ca4e7f..0838b618 100644
--- a/ocml/src/cosbD.cl
+++ b/ocml/src/cosbD.cl
@@ -24,7 +24,7 @@
         L = __e; \
     } while (0)
 
-INLINEATTR double
+double
 MATH_PRIVATE(cosb)(double x, int n, double p)
 {
     struct redret r = MATH_PRIVATE(trigred)(x);
diff --git a/ocml/src/cosbF.cl b/ocml/src/cosbF.cl
index 34e5d857..60e1f415 100644
--- a/ocml/src/cosbF.cl
+++ b/ocml/src/cosbF.cl
@@ -24,7 +24,7 @@
         L = __e; \
     } while (0)
 
-INLINEATTR float
+float
 MATH_PRIVATE(cosb)(float x, int n, float p)
 {
     struct redret r = MATH_PRIVATE(trigred)(x);
diff --git a/ocml/src/coshD.cl b/ocml/src/coshD.cl
index fe1a676d..da1c54a4 100644
--- a/ocml/src/coshD.cl
+++ b/ocml/src/coshD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(cosh)(double x)
 {
     x = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/coshF.cl b/ocml/src/coshF.cl
index 425bea9d..ef4c46da 100644
--- a/ocml/src/coshF.cl
+++ b/ocml/src/coshF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cosh)(float x)
 {
     x = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/coshH.cl b/ocml/src/coshH.cl
index 232b8f67..3ddea219 100644
--- a/ocml/src/coshH.cl
+++ b/ocml/src/coshH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(cosh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(cosh)(half hx)
 {
     float x = (float)hx * 0x1.715476p+0f;
diff --git a/ocml/src/cospiD.cl b/ocml/src/cospiD.cl
index ef407179..fab3bc4d 100644
--- a/ocml/src/cospiD.cl
+++ b/ocml/src/cospiD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(cospi)(double x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/cospiF.cl b/ocml/src/cospiF.cl
index 6891ba87..90d360d0 100644
--- a/ocml/src/cospiF.cl
+++ b/ocml/src/cospiF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(cospi)(float x)
 {
     int ax = AS_INT(x) & 0x7fffffff;
diff --git a/ocml/src/cospiH.cl b/ocml/src/cospiH.cl
index e4b74521..3f55c79e 100644
--- a/ocml/src/cospiH.cl
+++ b/ocml/src/cospiH.cl
@@ -10,7 +10,7 @@
 
 UGEN(cospi)
 
-INLINEATTR half
+half
 MATH_MANGLE(cospi)(half x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/divD.cl b/ocml/src/divD.cl
index 27ae4318..eed7cbb7 100644
--- a/ocml/src/divD.cl
+++ b/ocml/src/divD.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double x, double y) \
 { \
     return BUILTIN_FULL_BINARY(fdiv, false, ROUND, x, y); \
diff --git a/ocml/src/divF.cl b/ocml/src/divF.cl
index 9dcfb511..e0c7b961 100644
--- a/ocml/src/divF.cl
+++ b/ocml/src/divF.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float x, float y) \
 { \
     float ret; \
diff --git a/ocml/src/divH.cl b/ocml/src/divH.cl
index 7ac66449..927784b1 100644
--- a/ocml/src/divH.cl
+++ b/ocml/src/divH.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half x, half y) \
 { \
     return BUILTIN_FULL_BINARY(fdivh, false, ROUND, x, y); \
diff --git a/ocml/src/epexpepD.cl b/ocml/src/epexpepD.cl
index 292a61b9..f6340e15 100644
--- a/ocml/src/epexpepD.cl
+++ b/ocml/src/epexpepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double2
+CONSTATTR double2
 MATH_PRIVATE(epexpep)(double2 x)
 {
     double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
diff --git a/ocml/src/epexpepF.cl b/ocml/src/epexpepF.cl
index abeecc70..1ba48e10 100644
--- a/ocml/src/epexpepF.cl
+++ b/ocml/src/epexpepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float2
+CONSTATTR float2
 MATH_PRIVATE(epexpep)(float2 x)
 {
     float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
diff --git a/ocml/src/eplnD.cl b/ocml/src/eplnD.cl
index f16b4071..7540e5b9 100644
--- a/ocml/src/eplnD.cl
+++ b/ocml/src/eplnD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double2
+CONSTATTR double2
 MATH_PRIVATE(epln)(double a)
 {
     double m = BUILTIN_FREXP_MANT_F64(a);
diff --git a/ocml/src/eplnF.cl b/ocml/src/eplnF.cl
index 9063d677..b7fef2be 100644
--- a/ocml/src/eplnF.cl
+++ b/ocml/src/eplnF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float2
+CONSTATTR float2
 MATH_PRIVATE(epln)(float a)
 {
     float m = BUILTIN_FREXP_MANT_F32(a);
diff --git a/ocml/src/erfH.cl b/ocml/src/erfH.cl
index 883509d8..47c3c353 100644
--- a/ocml/src/erfH.cl
+++ b/ocml/src/erfH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erf)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erf)(half x)
 {
     return (half)MATH_UPMANGLE(erf)((float)x);
diff --git a/ocml/src/erfcH.cl b/ocml/src/erfcH.cl
index 2adc0236..ec7c7b04 100644
--- a/ocml/src/erfcH.cl
+++ b/ocml/src/erfcH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfc)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfc)(half x)
 {
     return (half)MATH_UPMANGLE(erfc)((float)x);
diff --git a/ocml/src/erfcinvH.cl b/ocml/src/erfcinvH.cl
index 8050709b..6258a9b9 100644
--- a/ocml/src/erfcinvH.cl
+++ b/ocml/src/erfcinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfcinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfcinv)(half x)
 {
     return (half)MATH_UPMANGLE(erfcinv)((float)x);
diff --git a/ocml/src/erfcxH.cl b/ocml/src/erfcxH.cl
index eb064e47..9fa79b5f 100644
--- a/ocml/src/erfcxH.cl
+++ b/ocml/src/erfcxH.cl
@@ -3,7 +3,7 @@
 
 PUREATTR UGEN(erfcx)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfcx)(half x)
 {
     return (half)MATH_UPMANGLE(erfcx)((float)x);
diff --git a/ocml/src/erfinvH.cl b/ocml/src/erfinvH.cl
index 60238709..18317b51 100644
--- a/ocml/src/erfinvH.cl
+++ b/ocml/src/erfinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(erfinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(erfinv)(half x)
 {
     return (half)MATH_UPMANGLE(erfinv)((float)x);
diff --git a/ocml/src/exp10H.cl b/ocml/src/exp10H.cl
index d376414e..94a50ce2 100644
--- a/ocml/src/exp10H.cl
+++ b/ocml/src/exp10H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp10)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp10)(half x)
 {
     return (half)BUILTIN_EXP2_F32((float)x * 0x1.a934f0p+1f);
diff --git a/ocml/src/exp2H.cl b/ocml/src/exp2H.cl
index a8b72ff3..b6afa724 100644
--- a/ocml/src/exp2H.cl
+++ b/ocml/src/exp2H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp2)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp2)(half x)
 {
     return BUILTIN_EXP2_F16(x);
diff --git a/ocml/src/expF_base.h b/ocml/src/expF_base.h
index 9c42d5fe..08bde388 100644
--- a/ocml/src/expF_base.h
+++ b/ocml/src/expF_base.h
@@ -32,7 +32,7 @@
 // 
 //    e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) 
 
-PUREATTR INLINEATTR float
+PUREATTR float
 #if defined COMPILING_EXP2
 MATH_MANGLE(exp2)(float x)
 #elif defined COMPILING_EXP10
diff --git a/ocml/src/expH.cl b/ocml/src/expH.cl
index 1ff4a024..caa3a4ac 100644
--- a/ocml/src/expH.cl
+++ b/ocml/src/expH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(exp)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(exp)(half x)
 {
     return (half)BUILTIN_EXP2_F32((float)x * 0x1.715476p+0f);
diff --git a/ocml/src/expepD.cl b/ocml/src/expepD.cl
index f8d4fd95..859a023d 100644
--- a/ocml/src/expepD.cl
+++ b/ocml/src/expepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(expep)(double2 x)
 {
     double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
diff --git a/ocml/src/expepF.cl b/ocml/src/expepF.cl
index bf585b47..657267a2 100644
--- a/ocml/src/expepF.cl
+++ b/ocml/src/expepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 MATH_PRIVATE(expep)(float2 x)
 {
     float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
diff --git a/ocml/src/expm1F.cl b/ocml/src/expm1F.cl
index 583a7a11..31ac3b89 100644
--- a/ocml/src/expm1F.cl
+++ b/ocml/src/expm1F.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(expm1)(float x)
 {
     float2 e = sub(MATH_PRIVATE(epexpep)(con(x, 0.0f)), 1.0f);
diff --git a/ocml/src/expm1H.cl b/ocml/src/expm1H.cl
index c04c6c84..79498be4 100644
--- a/ocml/src/expm1H.cl
+++ b/ocml/src/expm1H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(expm1)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(expm1)(half x)
 {
     half ret;
diff --git a/ocml/src/fabsD.cl b/ocml/src/fabsD.cl
index 2c5332c6..9052cd01 100644
--- a/ocml/src/fabsD.cl
+++ b/ocml/src/fabsD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fabs)(double x)
 {
     return BUILTIN_ABS_F64(x);
diff --git a/ocml/src/fabsF.cl b/ocml/src/fabsF.cl
index 444e9075..957cb79f 100644
--- a/ocml/src/fabsF.cl
+++ b/ocml/src/fabsF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fabs)(float x)
 {
     return BUILTIN_ABS_F32(x);
diff --git a/ocml/src/fabsH.cl b/ocml/src/fabsH.cl
index 9cd7dbbd..1504bb6a 100644
--- a/ocml/src/fabsH.cl
+++ b/ocml/src/fabsH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fabs)(half2 x)
 {
     return BUILTIN_ABS_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fabs)(half x)
 {
     return BUILTIN_ABS_F16(x);
diff --git a/ocml/src/fdimD.cl b/ocml/src/fdimD.cl
index 8214203e..cc7255c3 100644
--- a/ocml/src/fdimD.cl
+++ b/ocml/src/fdimD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fdim)(double x, double y)
 {
     long d = AS_LONG(x - y);
diff --git a/ocml/src/fdimF.cl b/ocml/src/fdimF.cl
index 9e418b24..968eb908 100644
--- a/ocml/src/fdimF.cl
+++ b/ocml/src/fdimF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fdim)(float x, float y)
 {
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/fdimH.cl b/ocml/src/fdimH.cl
index 304c96ab..989f8213 100644
--- a/ocml/src/fdimH.cl
+++ b/ocml/src/fdimH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(fdim)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fdim)(half x, half y)
 {
     if (!FINITE_ONLY_OPT()) {
diff --git a/ocml/src/floorD.cl b/ocml/src/floorD.cl
index 8fd637da..2fc2375d 100644
--- a/ocml/src/floorD.cl
+++ b/ocml/src/floorD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(floor)(double x)
 {
     return BUILTIN_FLOOR_F64(x);
diff --git a/ocml/src/floorF.cl b/ocml/src/floorF.cl
index 3364960a..e8b6d3ef 100644
--- a/ocml/src/floorF.cl
+++ b/ocml/src/floorF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(floor)(float x)
 {
     return BUILTIN_FLOOR_F32(x);
diff --git a/ocml/src/floorH.cl b/ocml/src/floorH.cl
index 16c84eee..f563e648 100644
--- a/ocml/src/floorH.cl
+++ b/ocml/src/floorH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(floor)(half2 x)
 {
     return BUILTIN_FLOOR_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(floor)(half x)
 {
     return BUILTIN_FLOOR_F16(x);
diff --git a/ocml/src/fmaD.cl b/ocml/src/fmaD.cl
index 15d596dc..61a47ea5 100644
--- a/ocml/src/fmaD.cl
+++ b/ocml/src/fmaD.cl
@@ -17,7 +17,7 @@ MATH_MANGLE(fma)(double a, double b, double c)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double a, double b, double c) \
 { \
     return BUILTIN_FULL_TERNARY(ffma, false, ROUND, a, b, c); \
diff --git a/ocml/src/fmaF.cl b/ocml/src/fmaF.cl
index 3974f317..bf45492c 100644
--- a/ocml/src/fmaF.cl
+++ b/ocml/src/fmaF.cl
@@ -17,7 +17,7 @@ MATH_MANGLE(fma)(float a, float b, float c)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float a, float b, float c) \
 { \
     float ret; \
diff --git a/ocml/src/fmaH.cl b/ocml/src/fmaH.cl
index c34f1781..98320551 100644
--- a/ocml/src/fmaH.cl
+++ b/ocml/src/fmaH.cl
@@ -23,7 +23,7 @@ MATH_MANGLE(fma)(half a, half b, half c)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half a, half b, half c) \
 { \
     return BUILTIN_FULL_TERNARY(ffmah, false, ROUND, a, b, c); \
diff --git a/ocml/src/fmaxD.cl b/ocml/src/fmaxD.cl
index 97a80466..fa8fc448 100644
--- a/ocml/src/fmaxD.cl
+++ b/ocml/src/fmaxD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fmax)(double x, double y)
 {
     return BUILTIN_MAX_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y));
diff --git a/ocml/src/fmaxF.cl b/ocml/src/fmaxF.cl
index d96a4c34..7fa39a8a 100644
--- a/ocml/src/fmaxF.cl
+++ b/ocml/src/fmaxF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fmax)(float x, float y)
 {
     float ret;
diff --git a/ocml/src/fmaxH.cl b/ocml/src/fmaxH.cl
index f6817006..552be89f 100644
--- a/ocml/src/fmaxH.cl
+++ b/ocml/src/fmaxH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fmax)(half2 x, half2 y)
 {
     return BUILTIN_MAX_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fmax)(half x, half y)
 {
     return BUILTIN_MAX_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y));
diff --git a/ocml/src/fminD.cl b/ocml/src/fminD.cl
index 0ff01127..04fba1fb 100644
--- a/ocml/src/fminD.cl
+++ b/ocml/src/fminD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(fmin)(double x, double y)
 {
     return BUILTIN_MIN_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y));
diff --git a/ocml/src/fminF.cl b/ocml/src/fminF.cl
index ffd6f40a..e979e18e 100644
--- a/ocml/src/fminF.cl
+++ b/ocml/src/fminF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(fmin)(float x, float y)
 {
     float ret;
diff --git a/ocml/src/fminH.cl b/ocml/src/fminH.cl
index 6da1fb55..76398429 100644
--- a/ocml/src/fminH.cl
+++ b/ocml/src/fminH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(fmin)(half2 x, half2 y)
 {
     return BUILTIN_MIN_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(fmin)(half x, half y)
 {
     return BUILTIN_MIN_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y));
diff --git a/ocml/src/fpclassifyD.cl b/ocml/src/fpclassifyD.cl
index cfefa9d5..8db6b992 100644
--- a/ocml/src/fpclassifyD.cl
+++ b/ocml/src/fpclassifyD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(double x)
 {
     int ret = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fpclassifyF.cl b/ocml/src/fpclassifyF.cl
index 824c140e..50a84783 100644
--- a/ocml/src/fpclassifyF.cl
+++ b/ocml/src/fpclassifyF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(float x)
 {
     int ret = BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fpclassifyH.cl b/ocml/src/fpclassifyH.cl
index 20d34897..a9c2d928 100644
--- a/ocml/src/fpclassifyH.cl
+++ b/ocml/src/fpclassifyH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(fpclassify)(half x)
 {
     int ret = BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN;
diff --git a/ocml/src/fractD.cl b/ocml/src/fractD.cl
index 720e3e23..e4b75aec 100644
--- a/ocml/src/fractD.cl
+++ b/ocml/src/fractD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(fract)(double x, __private double *ip)
 {
     double i = BUILTIN_FLOOR_F64(x);
diff --git a/ocml/src/fractF.cl b/ocml/src/fractF.cl
index 9b03b797..b65b517c 100644
--- a/ocml/src/fractF.cl
+++ b/ocml/src/fractF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(fract)(float x, __private float *ip)
 {
     float i = BUILTIN_FLOOR_F32(x);
diff --git a/ocml/src/fractH.cl b/ocml/src/fractH.cl
index ba127717..2cda3a5c 100644
--- a/ocml/src/fractH.cl
+++ b/ocml/src/fractH.cl
@@ -7,14 +7,14 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(fract)(half2 x, __private half2 *ip)
 {
     *ip = BUILTIN_FLOOR_2F16(x);
     return (half2)(BUILTIN_FRACTION_F16(x.lo), BUILTIN_FRACTION_F16(x.hi));
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(fract)(half x, __private half *ip)
 {
     *ip = BUILTIN_FLOOR_F16(x);
diff --git a/ocml/src/frexpD.cl b/ocml/src/frexpD.cl
index b3deeb64..4f9d252d 100644
--- a/ocml/src/frexpD.cl
+++ b/ocml/src/frexpD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(frexp)(double x, __private int *ep)
 {
     int e = BUILTIN_FREXP_EXP_F64(x);
diff --git a/ocml/src/frexpF.cl b/ocml/src/frexpF.cl
index e29554ba..c5b0b84b 100644
--- a/ocml/src/frexpF.cl
+++ b/ocml/src/frexpF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(frexp)(float x, __private int *ep)
 {
     int e = BUILTIN_FREXP_EXP_F32(x);
diff --git a/ocml/src/frexpH.cl b/ocml/src/frexpH.cl
index a5e43691..a4bc6e3c 100644
--- a/ocml/src/frexpH.cl
+++ b/ocml/src/frexpH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(frexp)(half2 x, __private int2 *ep)
 {
     int elo, ehi;
@@ -18,7 +18,7 @@ MATH_MANGLE2(frexp)(half2 x, __private int2 *ep)
     return r;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(frexp)(half x, __private int *ep)
 {
     int e = (int)BUILTIN_FREXP_EXP_F16(x);
diff --git a/ocml/src/hypotD.cl b/ocml/src/hypotD.cl
index 405720f4..fd99614a 100644
--- a/ocml/src/hypotD.cl
+++ b/ocml/src/hypotD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(hypot)(double x, double y)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/hypotF.cl b/ocml/src/hypotF.cl
index adca99ea..2b697a32 100644
--- a/ocml/src/hypotF.cl
+++ b/ocml/src/hypotF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(hypot)(float x, float y)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/hypotH.cl b/ocml/src/hypotH.cl
index dc0dad36..66b7811a 100644
--- a/ocml/src/hypotH.cl
+++ b/ocml/src/hypotH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(hypot)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(hypot)(half x, half y)
 {
     float fx = (float)x;
diff --git a/ocml/src/i0H.cl b/ocml/src/i0H.cl
index f42f1482..913942f5 100644
--- a/ocml/src/i0H.cl
+++ b/ocml/src/i0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(i0)
 
-INLINEATTR half
+half
 MATH_MANGLE(i0)(half x)
 {
     return (half)MATH_UPMANGLE(i0)((float)x);
diff --git a/ocml/src/i1H.cl b/ocml/src/i1H.cl
index 09b74c6d..d778626b 100644
--- a/ocml/src/i1H.cl
+++ b/ocml/src/i1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(i1)
 
-INLINEATTR half
+half
 MATH_MANGLE(i1)(half x)
 {
     return (half)MATH_UPMANGLE(i1)((float)x);
diff --git a/ocml/src/ilogbD.cl b/ocml/src/ilogbD.cl
index 95ce66fc..0f0b9ace 100644
--- a/ocml/src/ilogbD.cl
+++ b/ocml/src/ilogbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(double x)
 {
     int r = BUILTIN_FREXP_EXP_F64(x) - 1;
diff --git a/ocml/src/ilogbF.cl b/ocml/src/ilogbF.cl
index e84537b8..1a7e1d1d 100644
--- a/ocml/src/ilogbF.cl
+++ b/ocml/src/ilogbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(float x)
 {
     int r = BUILTIN_FREXP_EXP_F32(x) - 1;
diff --git a/ocml/src/ilogbH.cl b/ocml/src/ilogbH.cl
index a5aeef18..d7a274e4 100644
--- a/ocml/src/ilogbH.cl
+++ b/ocml/src/ilogbH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR int2
+CONSTATTR int2
 MATH_MANGLE2(ilogb)(half2 x)
 {
     return (int2)(MATH_MANGLE(ilogb)(x.lo), MATH_MANGLE(ilogb)(x.hi));
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(ilogb)(half x)
 {
     int r = (int)BUILTIN_FREXP_EXP_F16(x) - 1;
diff --git a/ocml/src/isfiniteD.cl b/ocml/src/isfiniteD.cl
index 489a390f..bdca20d5 100644
--- a/ocml/src/isfiniteD.cl
+++ b/ocml/src/isfiniteD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isfiniteF.cl b/ocml/src/isfiniteF.cl
index 11227450..421ab1a1 100644
--- a/ocml/src/isfiniteF.cl
+++ b/ocml/src/isfiniteF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isfiniteH.cl b/ocml/src/isfiniteH.cl
index c2b62152..dce82701 100644
--- a/ocml/src/isfiniteH.cl
+++ b/ocml/src/isfiniteH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isfinite)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isfinite)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isfinite)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR);
diff --git a/ocml/src/isinfD.cl b/ocml/src/isinfD.cl
index 00822a9b..bf33343e 100644
--- a/ocml/src/isinfD.cl
+++ b/ocml/src/isinfD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isinfF.cl b/ocml/src/isinfF.cl
index 4a0bda85..0a408cd1 100644
--- a/ocml/src/isinfF.cl
+++ b/ocml/src/isinfF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isinfH.cl b/ocml/src/isinfH.cl
index db18b9b7..d2978f02 100644
--- a/ocml/src/isinfH.cl
+++ b/ocml/src/isinfH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isinf)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isinf)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_PINF|CLASS_NINF) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isinf)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF);
diff --git a/ocml/src/isnanD.cl b/ocml/src/isnanD.cl
index d1f1b03d..12400473 100644
--- a/ocml/src/isnanD.cl
+++ b/ocml/src/isnanD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnanF.cl b/ocml/src/isnanF.cl
index 5e305755..47fc9910 100644
--- a/ocml/src/isnanF.cl
+++ b/ocml/src/isnanF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnanH.cl b/ocml/src/isnanH.cl
index 8eb1b8e2..d831c3e8 100644
--- a/ocml/src/isnanH.cl
+++ b/ocml/src/isnanH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isnan)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isnan)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_SNAN|CLASS_QNAN) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnan)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN);
diff --git a/ocml/src/isnormalD.cl b/ocml/src/isnormalD.cl
index 74907904..55799a17 100644
--- a/ocml/src/isnormalD.cl
+++ b/ocml/src/isnormalD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(double x)
 {
     return BUILTIN_CLASS_F64(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/isnormalF.cl b/ocml/src/isnormalF.cl
index 2e717e4b..9c640286 100644
--- a/ocml/src/isnormalF.cl
+++ b/ocml/src/isnormalF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(float x)
 {
     return BUILTIN_CLASS_F32(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/isnormalH.cl b/ocml/src/isnormalH.cl
index 1c0325a3..c33d9092 100644
--- a/ocml/src/isnormalH.cl
+++ b/ocml/src/isnormalH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(isnormal)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(isnormal)(half2 x)
          BUILTIN_CLASS_F16(x.hi, CLASS_PNOR|CLASS_NNOR) ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(isnormal)(half x)
 {
     return BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_NNOR);
diff --git a/ocml/src/j0H.cl b/ocml/src/j0H.cl
index f61b3fca..83feff6f 100644
--- a/ocml/src/j0H.cl
+++ b/ocml/src/j0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(j0)
 
-INLINEATTR half
+half
 MATH_MANGLE(j0)(half x)
 {
     return (half)MATH_UPMANGLE(j0)((float)x);
diff --git a/ocml/src/j1H.cl b/ocml/src/j1H.cl
index 7cbaddf4..557038f2 100644
--- a/ocml/src/j1H.cl
+++ b/ocml/src/j1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(j1)
 
-INLINEATTR half
+half
 MATH_MANGLE(j1)(half x)
 {
     return (half)MATH_UPMANGLE(j1)((float)x);
diff --git a/ocml/src/ldexpD.cl b/ocml/src/ldexpD.cl
index 1cf0e093..7ba48285 100644
--- a/ocml/src/ldexpD.cl
+++ b/ocml/src/ldexpD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(ldexp)(double x, int n)
 {
     return BUILTIN_FLDEXP_F64(x, n);
diff --git a/ocml/src/ldexpF.cl b/ocml/src/ldexpF.cl
index 435848aa..29a1da28 100644
--- a/ocml/src/ldexpF.cl
+++ b/ocml/src/ldexpF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(ldexp)(float x, int n)
 {
     return BUILTIN_FLDEXP_F32(x, n);
diff --git a/ocml/src/ldexpH.cl b/ocml/src/ldexpH.cl
index 7f06e7ef..d4d57043 100644
--- a/ocml/src/ldexpH.cl
+++ b/ocml/src/ldexpH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(ldexp)(half2 x, int2 n)
 {
     return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(ldexp)(half x, int n)
 {
     return BUILTIN_FLDEXP_F16(x, BUILTIN_CLAMP_S32(n, SHRT_MIN, SHRT_MAX));
diff --git a/ocml/src/len3D.cl b/ocml/src/len3D.cl
index d0c6e811..fee8e9db 100644
--- a/ocml/src/len3D.cl
+++ b/ocml/src/len3D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(len3)(double x, double y, double z)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/len3F.cl b/ocml/src/len3F.cl
index f2ab9125..bb14ee5a 100644
--- a/ocml/src/len3F.cl
+++ b/ocml/src/len3F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(len3)(float x, float y, float z)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/len3H.cl b/ocml/src/len3H.cl
index 32248780..bb6ef92c 100644
--- a/ocml/src/len3H.cl
+++ b/ocml/src/len3H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(len3)(half x, half y, half z)
 {
     float fx = (float)x;
diff --git a/ocml/src/len4D.cl b/ocml/src/len4D.cl
index 4fe8b898..b05f0cad 100644
--- a/ocml/src/len4D.cl
+++ b/ocml/src/len4D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(len4)(double x, double y, double z, double w)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/len4F.cl b/ocml/src/len4F.cl
index c80e4c0c..24231618 100644
--- a/ocml/src/len4F.cl
+++ b/ocml/src/len4F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(len4)(float x, float y, float z, float w)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/len4H.cl b/ocml/src/len4H.cl
index 6fee1090..9b320c78 100644
--- a/ocml/src/len4H.cl
+++ b/ocml/src/len4H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(len4)(half x, half y, half z, half w)
 {
     float fx = (float)x;
diff --git a/ocml/src/lgammaD.cl b/ocml/src/lgammaD.cl
index 4a9849e9..69e50258 100644
--- a/ocml/src/lgammaD.cl
+++ b/ocml/src/lgammaD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(lgamma)(double x)
 {
     int s;
diff --git a/ocml/src/lgammaF.cl b/ocml/src/lgammaF.cl
index 2f53d18c..4a113c1d 100644
--- a/ocml/src/lgammaF.cl
+++ b/ocml/src/lgammaF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(lgamma)(float x)
 {
     int s;
diff --git a/ocml/src/lgammaH.cl b/ocml/src/lgammaH.cl
index 6472f9f6..81a0fcec 100644
--- a/ocml/src/lgammaH.cl
+++ b/ocml/src/lgammaH.cl
@@ -9,7 +9,7 @@
 
 UGEN(lgamma)
 
-INLINEATTR half
+half
 MATH_MANGLE(lgamma)(half x)
 {
     int s;
diff --git a/ocml/src/lgamma_rH.cl b/ocml/src/lgamma_rH.cl
index 377721d9..b1f6d485 100644
--- a/ocml/src/lgamma_rH.cl
+++ b/ocml/src/lgamma_rH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp)
 {
     int slo, shi;
@@ -18,7 +18,7 @@ MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp)
     return r;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(lgamma_r)(half x, __private int *signp)
 {
     return (half)MATH_UPMANGLE(lgamma_r)((float)x, signp);
diff --git a/ocml/src/lnepD.cl b/ocml/src/lnepD.cl
index 6bece3e7..dfe4484d 100644
--- a/ocml/src/lnepD.cl
+++ b/ocml/src/lnepD.cl
@@ -10,7 +10,7 @@
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(lnep)(double2 a)
 {
     int b = BUILTIN_FREXP_MANT_F64(a.hi) < (2.0/3.0);
diff --git a/ocml/src/lnepF.cl b/ocml/src/lnepF.cl
index 65675582..0c4502f2 100644
--- a/ocml/src/lnepF.cl
+++ b/ocml/src/lnepF.cl
@@ -10,7 +10,7 @@
 #define FLOAT_SPECIALIZATION
 #include "ep.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 MATH_PRIVATE(lnep)(float2 a)
 {
     int b = BUILTIN_FREXP_MANT_F32(a.hi) < (2.0f/3.0f);
diff --git a/ocml/src/log10H.cl b/ocml/src/log10H.cl
index 1fbf60ae..2a03ff02 100644
--- a/ocml/src/log10H.cl
+++ b/ocml/src/log10H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log10)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log10)(half x)
 {
     return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.344136p-2f);
diff --git a/ocml/src/log1pD.cl b/ocml/src/log1pD.cl
index c5f5252f..240b4626 100644
--- a/ocml/src/log1pD.cl
+++ b/ocml/src/log1pD.cl
@@ -12,7 +12,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x);
 #define DOUBLE_SPECIALIZATION
 #include "ep.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(log1p)(double x)
 {
     double z = MATH_PRIVATE(lnep)(add(1.0, x));
diff --git a/ocml/src/log1pF.cl b/ocml/src/log1pF.cl
index b4584519..ce8a5a2b 100644
--- a/ocml/src/log1pF.cl
+++ b/ocml/src/log1pF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(log1p)(float x)
 {
     float z = MATH_PRIVATE(lnep)(add(1.0, x));
diff --git a/ocml/src/log1pH.cl b/ocml/src/log1pH.cl
index da274acf..51b5ff7c 100644
--- a/ocml/src/log1pH.cl
+++ b/ocml/src/log1pH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log1p)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log1p)(half x)
 {
     half ret;
diff --git a/ocml/src/log2H.cl b/ocml/src/log2H.cl
index 3d38a9f6..4a46e968 100644
--- a/ocml/src/log2H.cl
+++ b/ocml/src/log2H.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log2)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log2)(half x)
 {
     return BUILTIN_LOG2_F16(x);
diff --git a/ocml/src/logF_base.h b/ocml/src/logF_base.h
index cddad305..763623ab 100644
--- a/ocml/src/logF_base.h
+++ b/ocml/src/logF_base.h
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR CONSTATTR float
+CONSTATTR float
 #if defined COMPILING_LOG2
 MATH_MANGLE(log2)(float x)
 #elif defined COMPILING_LOG10
diff --git a/ocml/src/logH.cl b/ocml/src/logH.cl
index d62fecc4..08439ff5 100644
--- a/ocml/src/logH.cl
+++ b/ocml/src/logH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(log)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(log)(half x)
 {
     return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.62e430p-1f);
diff --git a/ocml/src/logbD.cl b/ocml/src/logbD.cl
index cbc52224..2b859853 100644
--- a/ocml/src/logbD.cl
+++ b/ocml/src/logbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(logb)(double x)
 {
     double ret = (double)(BUILTIN_FREXP_EXP_F64(x) - 1);
diff --git a/ocml/src/logbF.cl b/ocml/src/logbF.cl
index f7154d4f..0e6cb740 100644
--- a/ocml/src/logbF.cl
+++ b/ocml/src/logbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(logb)(float x)
 {
     float ret = (float)(BUILTIN_FREXP_EXP_F32(x) - 1);
diff --git a/ocml/src/logbH.cl b/ocml/src/logbH.cl
index 656d07b0..49af766e 100644
--- a/ocml/src/logbH.cl
+++ b/ocml/src/logbH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR UGEN(logb)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(logb)(half x)
 {
     half ret = (half)(BUILTIN_FREXP_EXP_F16(x) - (short)1);
diff --git a/ocml/src/madD.cl b/ocml/src/madD.cl
index e5573141..293e3fce 100644
--- a/ocml/src/madD.cl
+++ b/ocml/src/madD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(mad)(double a, double b, double c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/madF.cl b/ocml/src/madF.cl
index b1f67ec4..06546b44 100644
--- a/ocml/src/madF.cl
+++ b/ocml/src/madF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(mad)(float a, float b, float c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/madH.cl b/ocml/src/madH.cl
index 707f99ac..4f3d393f 100644
--- a/ocml/src/madH.cl
+++ b/ocml/src/madH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(mad)(half2 a, half2 b, half2 c)
 {
     return MATH_MAD2(a, b, c);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(mad)(half a, half b, half c)
 {
     return MATH_MAD(a, b, c);
diff --git a/ocml/src/maxD.cl b/ocml/src/maxD.cl
index 49b3dccb..7c6664b0 100644
--- a/ocml/src/maxD.cl
+++ b/ocml/src/maxD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(max)(double x, double y)
 {
     return BUILTIN_CMAX_F64(x, y);
diff --git a/ocml/src/maxF.cl b/ocml/src/maxF.cl
index 6e3e17ba..4cd0bfa9 100644
--- a/ocml/src/maxF.cl
+++ b/ocml/src/maxF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(max)(float x, float y)
 {
     return BUILTIN_CMAX_F32(x, y);
diff --git a/ocml/src/maxH.cl b/ocml/src/maxH.cl
index 31cad270..01479c8a 100644
--- a/ocml/src/maxH.cl
+++ b/ocml/src/maxH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(max)(half2 x, half2 y)
 {
     return BUILTIN_CMAX_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(max)(half x, half y)
 {
     return BUILTIN_CMAX_F16(x, y);
diff --git a/ocml/src/maxmagD.cl b/ocml/src/maxmagD.cl
index 9f606da8..3db12aad 100644
--- a/ocml/src/maxmagD.cl
+++ b/ocml/src/maxmagD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(maxmag)(double x, double y)
 {
 #if 0
diff --git a/ocml/src/maxmagF.cl b/ocml/src/maxmagF.cl
index 4997bd06..941fbe4b 100644
--- a/ocml/src/maxmagF.cl
+++ b/ocml/src/maxmagF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(maxmag)(float x, float y)
 {
 #if 0
diff --git a/ocml/src/maxmagH.cl b/ocml/src/maxmagH.cl
index 74ab78aa..9453df4e 100644
--- a/ocml/src/maxmagH.cl
+++ b/ocml/src/maxmagH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(maxmag)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(maxmag)(half x, half y)
 {
     x = BUILTIN_CANONICALIZE_F16(x);
diff --git a/ocml/src/minD.cl b/ocml/src/minD.cl
index c2d0b120..151178c2 100644
--- a/ocml/src/minD.cl
+++ b/ocml/src/minD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(min)(double x, double y)
 {
     return BUILTIN_CMIN_F64(x, y);
diff --git a/ocml/src/minF.cl b/ocml/src/minF.cl
index 9c5e741b..eb38af70 100644
--- a/ocml/src/minF.cl
+++ b/ocml/src/minF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(min)(float x, float y)
 {
     return BUILTIN_CMIN_F32(x, y);
diff --git a/ocml/src/minH.cl b/ocml/src/minH.cl
index 2ed7fa68..2f2eb4d7 100644
--- a/ocml/src/minH.cl
+++ b/ocml/src/minH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(min)(half2 x, half2 y)
 {
     return BUILTIN_CMIN_2F16(x, y);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(min)(half x, half y)
 {
     return BUILTIN_CMIN_F16(x, y);
diff --git a/ocml/src/minmagD.cl b/ocml/src/minmagD.cl
index 80e7e3f4..cb3dbf3d 100644
--- a/ocml/src/minmagD.cl
+++ b/ocml/src/minmagD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(minmag)(double x, double y)
 {
 #if 0
diff --git a/ocml/src/minmagF.cl b/ocml/src/minmagF.cl
index 41fabef4..8994aac0 100644
--- a/ocml/src/minmagF.cl
+++ b/ocml/src/minmagF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(minmag)(float x, float y)
 {
 #if 0
diff --git a/ocml/src/minmagH.cl b/ocml/src/minmagH.cl
index 8b3fd016..e2659945 100644
--- a/ocml/src/minmagH.cl
+++ b/ocml/src/minmagH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(minmag)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(minmag)(half x, half y)
 {
     x = BUILTIN_CANONICALIZE_F16(x);
diff --git a/ocml/src/modfD.cl b/ocml/src/modfD.cl
index 317abdc6..6ad02e35 100644
--- a/ocml/src/modfD.cl
+++ b/ocml/src/modfD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(modf)(double x, __private double *iptr)
 {
     double tx = BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/modfF.cl b/ocml/src/modfF.cl
index 27b33289..7d9b2964 100644
--- a/ocml/src/modfF.cl
+++ b/ocml/src/modfF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(modf)(float x, __private float *iptr)
 {
     float tx = BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/modfH.cl b/ocml/src/modfH.cl
index 7c40cba9..8c28ef86 100644
--- a/ocml/src/modfH.cl
+++ b/ocml/src/modfH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(modf)(half2 x, __private half2 *iptr)
 {
     half2 tx = BUILTIN_TRUNC_2F16(x);
@@ -18,7 +18,7 @@ MATH_MANGLE2(modf)(half2 x, __private half2 *iptr)
     return BUILTIN_COPYSIGN_2F16(ret, x);
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(modf)(half x, __private half *iptr)
 {
     half tx = BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/mulD.cl b/ocml/src/mulD.cl
index c567b07e..6d7e296f 100644
--- a/ocml/src/mulD.cl
+++ b/ocml/src/mulD.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double x, double y) \
 { \
     return BUILTIN_FULL_BINARY(fmul, false, ROUND, x, y); \
diff --git a/ocml/src/mulF.cl b/ocml/src/mulF.cl
index 0a26fa26..ace8b656 100644
--- a/ocml/src/mulF.cl
+++ b/ocml/src/mulF.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float x, float y) \
 { \
     float ret; \
diff --git a/ocml/src/mulH.cl b/ocml/src/mulH.cl
index 7fcf2141..2cb52876 100644
--- a/ocml/src/mulH.cl
+++ b/ocml/src/mulH.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half x, half y) \
 { \
     return BUILTIN_FULL_BINARY(fmulh, false, ROUND, x, y); \
diff --git a/ocml/src/nanD.cl b/ocml/src/nanD.cl
index 439c9654..762365bc 100644
--- a/ocml/src/nanD.cl
+++ b/ocml/src/nanD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nan)(ulong nancode)
 {
     return AS_DOUBLE((nancode & MANTBITS_DP64) | QNANBITPATT_DP64);
diff --git a/ocml/src/nanF.cl b/ocml/src/nanF.cl
index 7fcf09fd..aeb5e530 100644
--- a/ocml/src/nanF.cl
+++ b/ocml/src/nanF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nan)(uint nancode)
 {
     return AS_FLOAT(QNANBITPATT_SP32 | (nancode & 0xfffff));
diff --git a/ocml/src/nanH.cl b/ocml/src/nanH.cl
index 086c5f6d..b53e48e8 100644
--- a/ocml/src/nanH.cl
+++ b/ocml/src/nanH.cl
@@ -7,14 +7,14 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(nan)(ushort2 nancode)
 {
     ushort2 h = (ushort2)QNANBITPATT_HP16 | (nancode & (ushort2)0x01ff);
     return AS_HALF2(h);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nan)(ushort nancode)
 {
     ushort h = (ushort)QNANBITPATT_HP16 | (nancode & (ushort)0x01ff);
diff --git a/ocml/src/ncdfH.cl b/ocml/src/ncdfH.cl
index 1ac2bf9f..cb7bd711 100644
--- a/ocml/src/ncdfH.cl
+++ b/ocml/src/ncdfH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(ncdf)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(ncdf)(half x)
 {
     return (half)MATH_UPMANGLE(ncdf)((float)x);
diff --git a/ocml/src/ncdfinvD.cl b/ocml/src/ncdfinvD.cl
index 300f6048..f2e6cfd5 100644
--- a/ocml/src/ncdfinvD.cl
+++ b/ocml/src/ncdfinvD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR PUREATTR double
+PUREATTR double
 MATH_MANGLE(ncdfinv)(double x)
 {
     return -0x1.6a09e667f3bcdp+0 * MATH_MANGLE(erfcinv)(x + x);
diff --git a/ocml/src/ncdfinvF.cl b/ocml/src/ncdfinvF.cl
index d04dddd0..9c31025d 100644
--- a/ocml/src/ncdfinvF.cl
+++ b/ocml/src/ncdfinvF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-INLINEATTR PUREATTR float
+PUREATTR float
 MATH_MANGLE(ncdfinv)(float x)
 {
     return -0x1.6a09e6p+0f * MATH_MANGLE(erfcinv)(x + x);
diff --git a/ocml/src/ncdfinvH.cl b/ocml/src/ncdfinvH.cl
index 3905a68d..8f4fceca 100644
--- a/ocml/src/ncdfinvH.cl
+++ b/ocml/src/ncdfinvH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(ncdfinv)
 
-INLINEATTR PUREATTR half
+PUREATTR half
 MATH_MANGLE(ncdfinv)(half x)
 {
     return (half)MATH_UPMANGLE(ncdfinv)((float)x);
diff --git a/ocml/src/nearbyintD.cl b/ocml/src/nearbyintD.cl
index df2d005b..a222532f 100644
--- a/ocml/src/nearbyintD.cl
+++ b/ocml/src/nearbyintD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nearbyint)(double x)
 {
     return BUILTIN_RINT_F64(x);
diff --git a/ocml/src/nearbyintF.cl b/ocml/src/nearbyintF.cl
index 5ae97fff..44be2481 100644
--- a/ocml/src/nearbyintF.cl
+++ b/ocml/src/nearbyintF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nearbyint)(float x)
 {
     return BUILTIN_RINT_F32(x);
diff --git a/ocml/src/nearbyintH.cl b/ocml/src/nearbyintH.cl
index cf2e962c..92c0fa3b 100644
--- a/ocml/src/nearbyintH.cl
+++ b/ocml/src/nearbyintH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(nearbyint)(half2 x)
 {
     return BUILTIN_RINT_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nearbyint)(half x)
 {
     return BUILTIN_RINT_F16(x);
diff --git a/ocml/src/nextafterD.cl b/ocml/src/nextafterD.cl
index ee4031c6..aa1add9f 100644
--- a/ocml/src/nextafterD.cl
+++ b/ocml/src/nextafterD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(nextafter)(double x, double y)
 {
     long ix = AS_LONG(x);
diff --git a/ocml/src/nextafterF.cl b/ocml/src/nextafterF.cl
index 4ef25bcd..0c4180c5 100644
--- a/ocml/src/nextafterF.cl
+++ b/ocml/src/nextafterF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(nextafter)(float x, float y)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/nextafterH.cl b/ocml/src/nextafterH.cl
index d81028ac..517ce81a 100644
--- a/ocml/src/nextafterH.cl
+++ b/ocml/src/nextafterH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(nextafter)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(nextafter)(half x, half y)
 {
     short ix = AS_SHORT(x);
diff --git a/ocml/src/pownH.cl b/ocml/src/pownH.cl
index c8c74d31..3604cae6 100644
--- a/ocml/src/pownH.cl
+++ b/ocml/src/pownH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-PUREATTR INLINEATTR half2
+PUREATTR half2
 MATH_MANGLE2(pown)(half2 x, int2 ny)
 {
     return (half2)(MATH_MANGLE(pown)(x.lo, ny.lo), MATH_MANGLE(pown)(x.hi, ny.hi));
diff --git a/ocml/src/rcbrtF.cl b/ocml/src/rcbrtF.cl
index 1fd6c9c0..0e393e68 100644
--- a/ocml/src/rcbrtF.cl
+++ b/ocml/src/rcbrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rcbrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/remainderF_base.h b/ocml/src/remainderF_base.h
index 4422f826..a79ec5d1 100644
--- a/ocml/src/remainderF_base.h
+++ b/ocml/src/remainderF_base.h
@@ -18,7 +18,7 @@
         CLO = MATH_MAD(__ta, __tb, MATH_MAD(__ta, __hb, MATH_MAD(__ha, __tb, MATH_MAD(__ha, __hb, -CHI)))); \
     } while (0)
 
-CONSTATTR static inline float
+CONSTATTR INLINEATTR static float
 fnma(float a, float b, float c)
 {
     float d;
diff --git a/ocml/src/remquoH.cl b/ocml/src/remquoH.cl
index 3893dded..18106093 100644
--- a/ocml/src/remquoH.cl
+++ b/ocml/src/remquoH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(remquo)(half2 x, half2 y, __private int2 *q7p)
 {
     int qlo, qhi;
diff --git a/ocml/src/rhypotD.cl b/ocml/src/rhypotD.cl
index 0524902b..4339b4f5 100644
--- a/ocml/src/rhypotD.cl
+++ b/ocml/src/rhypotD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rhypot)(double x, double y)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rhypotF.cl b/ocml/src/rhypotF.cl
index 56cc0d2f..cdf08f86 100644
--- a/ocml/src/rhypotF.cl
+++ b/ocml/src/rhypotF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rhypot)(float x, float y)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rhypotH.cl b/ocml/src/rhypotH.cl
index d1c571a0..97acf627 100644
--- a/ocml/src/rhypotH.cl
+++ b/ocml/src/rhypotH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(rhypot)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rhypot)(half x, half y)
 {
     float fx = (float)x;
diff --git a/ocml/src/rintD.cl b/ocml/src/rintD.cl
index a43b5ec8..7c3bb107 100644
--- a/ocml/src/rintD.cl
+++ b/ocml/src/rintD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rint)(double x)
 {
     return BUILTIN_RINT_F64(x);
diff --git a/ocml/src/rintF.cl b/ocml/src/rintF.cl
index a95c223b..17254933 100644
--- a/ocml/src/rintF.cl
+++ b/ocml/src/rintF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rint)(float x)
 {
     return BUILTIN_RINT_F32(x);
diff --git a/ocml/src/rintH.cl b/ocml/src/rintH.cl
index fa789d5b..f2ffd3c1 100644
--- a/ocml/src/rintH.cl
+++ b/ocml/src/rintH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(rint)(half2 x)
 {
     return BUILTIN_RINT_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rint)(half x)
 {
     return BUILTIN_RINT_F16(x);
diff --git a/ocml/src/rlen3D.cl b/ocml/src/rlen3D.cl
index f9442e48..a1081a2c 100644
--- a/ocml/src/rlen3D.cl
+++ b/ocml/src/rlen3D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rlen3)(double x, double y, double z)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rlen3F.cl b/ocml/src/rlen3F.cl
index bf0cab90..03f2c40a 100644
--- a/ocml/src/rlen3F.cl
+++ b/ocml/src/rlen3F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rlen3)(float x, float y, float z)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rlen3H.cl b/ocml/src/rlen3H.cl
index c12755f7..b147b44d 100644
--- a/ocml/src/rlen3H.cl
+++ b/ocml/src/rlen3H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rlen3)(half x, half y, half z)
 {
     float fx = (float)x;
diff --git a/ocml/src/rlen4D.cl b/ocml/src/rlen4D.cl
index 9c4fe9bf..4d16f943 100644
--- a/ocml/src/rlen4D.cl
+++ b/ocml/src/rlen4D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rlen4)(double x, double y, double z, double w)
 {
     double a = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/rlen4F.cl b/ocml/src/rlen4F.cl
index 733f62d1..e6d7603f 100644
--- a/ocml/src/rlen4F.cl
+++ b/ocml/src/rlen4F.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(rlen4)(float x, float y, float z, float w)
 {
     float a = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/rlen4H.cl b/ocml/src/rlen4H.cl
index 9fb13359..5abb05f5 100644
--- a/ocml/src/rlen4H.cl
+++ b/ocml/src/rlen4H.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rlen4)(half x, half y, half z, half w)
 {
     float fx = (float)x;
diff --git a/ocml/src/rootnH.cl b/ocml/src/rootnH.cl
index d17abfc8..5bd94272 100644
--- a/ocml/src/rootnH.cl
+++ b/ocml/src/rootnH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-PUREATTR INLINEATTR half2
+PUREATTR half2
 MATH_MANGLE2(rootn)(half2 x, int2 ny)
 {
     return (half2)(MATH_MANGLE(rootn)(x.lo, ny.lo), MATH_MANGLE(rootn)(x.hi, ny.hi));
diff --git a/ocml/src/roundD.cl b/ocml/src/roundD.cl
index e8281f8d..0bc2aedb 100644
--- a/ocml/src/roundD.cl
+++ b/ocml/src/roundD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(round)(double x)
 {
     double t = BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/roundF.cl b/ocml/src/roundF.cl
index bbaf3e6d..2b98a223 100644
--- a/ocml/src/roundF.cl
+++ b/ocml/src/roundF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(round)(float x)
 {
     float t = BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/roundH.cl b/ocml/src/roundH.cl
index 045f5d95..d735a7fb 100644
--- a/ocml/src/roundH.cl
+++ b/ocml/src/roundH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(round)(half2 x)
 {
     half2 t = BUILTIN_TRUNC_2F16(x);
@@ -18,7 +18,7 @@ MATH_MANGLE2(round)(half2 x)
     return t + o;
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(round)(half x)
 {
     half t = BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/rsqrtD.cl b/ocml/src/rsqrtD.cl
index d67127d4..5fd5d156 100644
--- a/ocml/src/rsqrtD.cl
+++ b/ocml/src/rsqrtD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(rsqrt)(double x)
 {
     double y0 = BUILTIN_RSQRT_F64(x);
diff --git a/ocml/src/rsqrtF.cl b/ocml/src/rsqrtF.cl
index dc7df5fb..8349387f 100644
--- a/ocml/src/rsqrtF.cl
+++ b/ocml/src/rsqrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-PUREATTR INLINEATTR float
+PUREATTR float
 MATH_MANGLE(rsqrt)(float x)
 {
     if (DAZ_OPT()) {
diff --git a/ocml/src/rsqrtH.cl b/ocml/src/rsqrtH.cl
index ec5f9bed..ab42880e 100644
--- a/ocml/src/rsqrtH.cl
+++ b/ocml/src/rsqrtH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR UGEN(rsqrt)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(rsqrt)(half x)
 {
     return BUILTIN_RSQRT_F16(x);
diff --git a/ocml/src/scalbD.cl b/ocml/src/scalbD.cl
index 5bfce8a7..cfe4caf3 100644
--- a/ocml/src/scalbD.cl
+++ b/ocml/src/scalbD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(scalb)(double x, double y)
 {
     double t = BUILTIN_MIN_F64(BUILTIN_MAX_F64(y, -0x1.0p+20), 0x1.0p+20);
diff --git a/ocml/src/scalbF.cl b/ocml/src/scalbF.cl
index f957fb7b..05d95969 100644
--- a/ocml/src/scalbF.cl
+++ b/ocml/src/scalbF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(scalb)(float x, float y)
 {
     float t = BUILTIN_CLAMP_F32(y, -0x1.0p+20f, 0x1.0p+20f);
diff --git a/ocml/src/scalbH.cl b/ocml/src/scalbH.cl
index 2d55c644..53b8cc8e 100644
--- a/ocml/src/scalbH.cl
+++ b/ocml/src/scalbH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR BGEN(scalb)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(scalb)(half x, half y)
 {
     half t = BUILTIN_MIN_F16(BUILTIN_MAX_F16(y, -0x1.0p+6h), 0x1.0p+6h);
diff --git a/ocml/src/scalbnD.cl b/ocml/src/scalbnD.cl
index 350c47f9..07ecd541 100644
--- a/ocml/src/scalbnD.cl
+++ b/ocml/src/scalbnD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(scalbn)(double x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/scalbnF.cl b/ocml/src/scalbnF.cl
index 49f4e700..b0adcc1a 100644
--- a/ocml/src/scalbnF.cl
+++ b/ocml/src/scalbnF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(scalbn)(float x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/scalbnH.cl b/ocml/src/scalbnH.cl
index 5656013c..f9be702e 100644
--- a/ocml/src/scalbnH.cl
+++ b/ocml/src/scalbnH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(scalbn)(half2 x, int2 n)
 {
     return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi));
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(scalbn)(half x, int n)
 {
     return MATH_MANGLE(ldexp)(x, n);
diff --git a/ocml/src/signbitD.cl b/ocml/src/signbitD.cl
index 3c93ca5b..98681e5d 100644
--- a/ocml/src/signbitD.cl
+++ b/ocml/src/signbitD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(double x)
 {
     return AS_INT2(x).hi < 0;
diff --git a/ocml/src/signbitF.cl b/ocml/src/signbitF.cl
index 3ceec89c..e944a72b 100644
--- a/ocml/src/signbitF.cl
+++ b/ocml/src/signbitF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(float x)
 {
     return AS_INT(x) < 0;
diff --git a/ocml/src/signbitH.cl b/ocml/src/signbitH.cl
index e5fb9130..b5d99170 100644
--- a/ocml/src/signbitH.cl
+++ b/ocml/src/signbitH.cl
@@ -7,7 +7,7 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR short2
+CONSTATTR short2
 MATH_MANGLE2(signbit)(half2 x)
 {
     return (short2)
@@ -15,7 +15,7 @@ MATH_MANGLE2(signbit)(half2 x)
          AS_SHORT(x.hi) < 0 ? (short)-1 : (short)0);
 }
 
-CONSTATTR INLINEATTR int
+CONSTATTR int
 MATH_MANGLE(signbit)(half x)
 {
     return AS_SHORT(x) < 0;
diff --git a/ocml/src/sinD.cl b/ocml/src/sinD.cl
index c44c1101..8f4464c2 100644
--- a/ocml/src/sinD.cl
+++ b/ocml/src/sinD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sin)(double x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/sinF.cl b/ocml/src/sinF.cl
index c42c05e4..c9059771 100644
--- a/ocml/src/sinF.cl
+++ b/ocml/src/sinF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sin)(float x)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/sinH.cl b/ocml/src/sinH.cl
index 7cd9aae7..7c018cc3 100644
--- a/ocml/src/sinH.cl
+++ b/ocml/src/sinH.cl
@@ -10,7 +10,7 @@
 
 UGEN(sin)
 
-INLINEATTR half
+half
 MATH_MANGLE(sin)(half x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/sinbD.cl b/ocml/src/sinbD.cl
index c8db0800..c98a8fa2 100644
--- a/ocml/src/sinbD.cl
+++ b/ocml/src/sinbD.cl
@@ -24,7 +24,7 @@
         L = __e; \
     } while (0)
 
-INLINEATTR double
+double
 MATH_PRIVATE(sinb)(double x, int n, double p)
 {
     struct redret r = MATH_PRIVATE(trigred)(x);
diff --git a/ocml/src/sinbF.cl b/ocml/src/sinbF.cl
index 0bd14e13..cdc139be 100644
--- a/ocml/src/sinbF.cl
+++ b/ocml/src/sinbF.cl
@@ -24,7 +24,7 @@
         L = __e; \
     } while (0)
 
-INLINEATTR float
+float
 MATH_PRIVATE(sinb)(float x, int n, float p)
 {
     struct redret r = MATH_PRIVATE(trigred)(x);
diff --git a/ocml/src/sincosD.cl b/ocml/src/sincosD.cl
index ba74767d..891d083d 100644
--- a/ocml/src/sincosD.cl
+++ b/ocml/src/sincosD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sincos)(double x, __private double * cp)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/sincosF.cl b/ocml/src/sincosF.cl
index a1286cc5..123b4595 100644
--- a/ocml/src/sincosF.cl
+++ b/ocml/src/sincosF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sincos)(float x, __private float *cp)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/sincosH.cl b/ocml/src/sincosH.cl
index 8c314f12..bdf62827 100644
--- a/ocml/src/sincosH.cl
+++ b/ocml/src/sincosH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(sincos)(half2 x, __private half2 *cp)
 {
     half2 s;
@@ -19,7 +19,7 @@ MATH_MANGLE2(sincos)(half2 x, __private half2 *cp)
     return s;
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(sincos)(half x, __private half *cp)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/sincospiD.cl b/ocml/src/sincospiD.cl
index 41e1438d..4ede0cc7 100644
--- a/ocml/src/sincospiD.cl
+++ b/ocml/src/sincospiD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sincospi)(double x, __private double * cp)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/sincospiF.cl b/ocml/src/sincospiF.cl
index 91b61dc4..9585bb42 100644
--- a/ocml/src/sincospiF.cl
+++ b/ocml/src/sincospiF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sincospi)(float x, __private float *cp)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/sincospiH.cl b/ocml/src/sincospiH.cl
index 495bac5c..78249533 100644
--- a/ocml/src/sincospiH.cl
+++ b/ocml/src/sincospiH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-INLINEATTR half2
+half2
 MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp)
 {
     half2 s;
@@ -20,7 +20,7 @@ MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp)
     return s;
 }
 
-INLINEATTR half
+half
 MATH_MANGLE(sincospi)(half x, __private half *cp)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/sincospiredD.cl b/ocml/src/sincospiredD.cl
index 4e750f8f..aae84504 100644
--- a/ocml/src/sincospiredD.cl
+++ b/ocml/src/sincospiredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincospired)(double x)
 {
     double t = x * x;
diff --git a/ocml/src/sincospiredF.cl b/ocml/src/sincospiredF.cl
index 1a528847..ac164a17 100644
--- a/ocml/src/sincospiredF.cl
+++ b/ocml/src/sincospiredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincospired)(float x)
 {
 
diff --git a/ocml/src/sincospiredH.cl b/ocml/src/sincospiredH.cl
index a7aa0f3f..33a13ab0 100644
--- a/ocml/src/sincospiredH.cl
+++ b/ocml/src/sincospiredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincospired)(half x)
 {
     half t = x * x;
diff --git a/ocml/src/sincosred2D.cl b/ocml/src/sincosred2D.cl
index e63b71a3..3d8c487d 100644
--- a/ocml/src/sincosred2D.cl
+++ b/ocml/src/sincosred2D.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincosred2)(double x, double y)
 {
     const double S0 = -0x1.5555555555555p-3;
diff --git a/ocml/src/sincosred2F.cl b/ocml/src/sincosred2F.cl
index 96eb3c18..16cd8fde 100644
--- a/ocml/src/sincosred2F.cl
+++ b/ocml/src/sincosred2F.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincosred2)(float x, float y)
 {
     const float c0 =  0x1.555556p-5f;
diff --git a/ocml/src/sincosredD.cl b/ocml/src/sincosredD.cl
index 3d549195..4418d623 100644
--- a/ocml/src/sincosredD.cl
+++ b/ocml/src/sincosredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincosred)(double x)
 {
     const double S0 = -0x1.5555555555555p-3;
diff --git a/ocml/src/sincosredF.cl b/ocml/src/sincosredF.cl
index 8e21dfad..54167c47 100644
--- a/ocml/src/sincosredF.cl
+++ b/ocml/src/sincosredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincosred)(float x)
 {
     float t = x * x;
diff --git a/ocml/src/sincosredH.cl b/ocml/src/sincosredH.cl
index 202732d9..0dd4b17d 100644
--- a/ocml/src/sincosredH.cl
+++ b/ocml/src/sincosredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-CONSTATTR INLINEATTR struct scret
+CONSTATTR struct scret
 MATH_PRIVATE(sincosred)(half x)
 {
     half t = x * x;
diff --git a/ocml/src/sinhD.cl b/ocml/src/sinhD.cl
index 7d377385..0bab018b 100644
--- a/ocml/src/sinhD.cl
+++ b/ocml/src/sinhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sinh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/sinhF.cl b/ocml/src/sinhF.cl
index 5718e06e..9ea55fc9 100644
--- a/ocml/src/sinhF.cl
+++ b/ocml/src/sinhF.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x);
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(sinh)(float x)
 {
     float y = BUILTIN_ABS_F32(x);
diff --git a/ocml/src/sinhH.cl b/ocml/src/sinhH.cl
index 92954661..c3ab5ed7 100644
--- a/ocml/src/sinhH.cl
+++ b/ocml/src/sinhH.cl
@@ -9,7 +9,7 @@
 
 PUREATTR UGEN(sinh)
 
-PUREATTR INLINEATTR half
+PUREATTR half
 MATH_MANGLE(sinh)(half hx)
 {
     float x = (float)hx * 0x1.715476p+0f;
diff --git a/ocml/src/sinpiD.cl b/ocml/src/sinpiD.cl
index 84039c97..ab208901 100644
--- a/ocml/src/sinpiD.cl
+++ b/ocml/src/sinpiD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-INLINEATTR double
+double
 MATH_MANGLE(sinpi)(double x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/sinpiF.cl b/ocml/src/sinpiF.cl
index 07f1e97d..2a50553a 100644
--- a/ocml/src/sinpiF.cl
+++ b/ocml/src/sinpiF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(sinpi)(float x)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/sinpiH.cl b/ocml/src/sinpiH.cl
index 2848978c..c738c222 100644
--- a/ocml/src/sinpiH.cl
+++ b/ocml/src/sinpiH.cl
@@ -10,7 +10,7 @@
 
 UGEN(sinpi)
 
-INLINEATTR half
+half
 MATH_MANGLE(sinpi)(half x)
 {
     struct redret r =  MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/sqrtD.cl b/ocml/src/sqrtD.cl
index a68f7bd0..e2a6fedf 100644
--- a/ocml/src/sqrtD.cl
+++ b/ocml/src/sqrtD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(sqrt)(double x)
 {
     return MATH_SQRT(x);
@@ -17,7 +17,7 @@ MATH_MANGLE(sqrt)(double x)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double x) \
 { \
     return BUILTIN_FULL_UNARY(fsqrt, false, ROUND, x); \
diff --git a/ocml/src/sqrtF.cl b/ocml/src/sqrtF.cl
index dbf495c5..99b32c25 100644
--- a/ocml/src/sqrtF.cl
+++ b/ocml/src/sqrtF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(sqrt)(float x)
 {
     if (CORRECTLY_ROUNDED_SQRT32()) {
@@ -21,7 +21,7 @@ MATH_MANGLE(sqrt)(float x)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float x) \
 { \
     float ret; \
diff --git a/ocml/src/sqrtH.cl b/ocml/src/sqrtH.cl
index 3c663887..321003f9 100644
--- a/ocml/src/sqrtH.cl
+++ b/ocml/src/sqrtH.cl
@@ -9,7 +9,7 @@
 
 CONSTATTR UGEN(sqrt)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(sqrt)(half x)
 {
     return BUILTIN_SQRT_F16(x);
@@ -19,7 +19,7 @@ MATH_MANGLE(sqrt)(half x)
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half x) \
 { \
     return BUILTIN_FULL_UNARY(fsqrth, false, ROUND, x); \
diff --git a/ocml/src/subD.cl b/ocml/src/subD.cl
index beda1a10..a9f4d3f1 100644
--- a/ocml/src/subD.cl
+++ b/ocml/src/subD.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR double \
+CONSTATTR double \
 MATH_MANGLE(NAME)(double x, double y) \
 { \
     return BUILTIN_FULL_BINARY(fsub, false, ROUND, x, y); \
diff --git a/ocml/src/subF.cl b/ocml/src/subF.cl
index 30664d6c..a8a4f2dd 100644
--- a/ocml/src/subF.cl
+++ b/ocml/src/subF.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR float \
+CONSTATTR float \
 MATH_MANGLE(NAME)(float x, float y) \
 { \
     float ret; \
diff --git a/ocml/src/subH.cl b/ocml/src/subH.cl
index 6ca8e24b..054c46d2 100644
--- a/ocml/src/subH.cl
+++ b/ocml/src/subH.cl
@@ -11,7 +11,7 @@
 #if defined HSAIL_BUILD
 
 #define GEN(NAME,ROUND)\
-CONSTATTR INLINEATTR half \
+CONSTATTR half \
 MATH_MANGLE(NAME)(half x, half y) \
 { \
     return BUILTIN_FULL_BINARY(fsubh, false, ROUND, x, y); \
diff --git a/ocml/src/tanD.cl b/ocml/src/tanD.cl
index e004e1be..0a3193d3 100644
--- a/ocml/src/tanD.cl
+++ b/ocml/src/tanD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tan)(double x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/tanF.cl b/ocml/src/tanF.cl
index 8bd8a7c5..efe22a75 100644
--- a/ocml/src/tanF.cl
+++ b/ocml/src/tanF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-INLINEATTR float
+float
 MATH_MANGLE(tan)(float x)
 {
     int ix = AS_INT(x);
diff --git a/ocml/src/tanH.cl b/ocml/src/tanH.cl
index 87fbceb6..36d91d3c 100644
--- a/ocml/src/tanH.cl
+++ b/ocml/src/tanH.cl
@@ -10,7 +10,7 @@
 
 UGEN(tan)
 
-INLINEATTR half
+half
 MATH_MANGLE(tan)(half x)
 {
     struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/tanhD.cl b/ocml/src/tanhD.cl
index 834e397e..e0c896d9 100644
--- a/ocml/src/tanhD.cl
+++ b/ocml/src/tanhD.cl
@@ -12,7 +12,7 @@
 
 extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x);
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tanh)(double x)
 {
     double y = BUILTIN_ABS_F64(x);
diff --git a/ocml/src/tanpiD.cl b/ocml/src/tanpiD.cl
index d6e5a27a..90c746ef 100644
--- a/ocml/src/tanpiD.cl
+++ b/ocml/src/tanpiD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(tanpi)(double x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x));
diff --git a/ocml/src/tanpiF.cl b/ocml/src/tanpiF.cl
index 9c951e55..a13b9143 100644
--- a/ocml/src/tanpiF.cl
+++ b/ocml/src/tanpiF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(tanpi)(float x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x));
diff --git a/ocml/src/tanpiH.cl b/ocml/src/tanpiH.cl
index 25a6fa90..b0571ba5 100644
--- a/ocml/src/tanpiH.cl
+++ b/ocml/src/tanpiH.cl
@@ -10,7 +10,7 @@
 
 CONSTATTR UGEN(tanpi)
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(tanpi)(half x)
 {
     struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x));
diff --git a/ocml/src/tanpiredD.cl b/ocml/src/tanpiredD.cl
index 5d877733..ecedafd2 100644
--- a/ocml/src/tanpiredD.cl
+++ b/ocml/src/tanpiredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_PRIVATE(tanpired)(double x, int i)
 {
     double s = x * x;
diff --git a/ocml/src/tanpiredF.cl b/ocml/src/tanpiredF.cl
index 25b2467b..96e63ad2 100644
--- a/ocml/src/tanpiredF.cl
+++ b/ocml/src/tanpiredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(tanpired)(float x, int i)
 {
     float s = x * x;
diff --git a/ocml/src/tanpiredH.cl b/ocml/src/tanpiredH.cl
index 221797f7..645f58a5 100644
--- a/ocml/src/tanpiredH.cl
+++ b/ocml/src/tanpiredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(tanpired)(half x, short i)
 {
     half s = x * x;
diff --git a/ocml/src/tanred2D.cl b/ocml/src/tanred2D.cl
index ae5d49c6..18dd4bf8 100644
--- a/ocml/src/tanred2D.cl
+++ b/ocml/src/tanred2D.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-INLINEATTR CONSTATTR double
+CONSTATTR double
 MATH_PRIVATE(tanred2)(double x, double xx, int sel)
 {
     const double piby4_lead = 0x1.921fb54442d18p-1;
diff --git a/ocml/src/tanredF.cl b/ocml/src/tanredF.cl
index 0bb6744c..b1a196cc 100644
--- a/ocml/src/tanredF.cl
+++ b/ocml/src/tanredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_PRIVATE(tanred)(float x, int i)
 {
     float s = x * x;
diff --git a/ocml/src/tanredH.cl b/ocml/src/tanredH.cl
index bade03c2..b11844f2 100644
--- a/ocml/src/tanredH.cl
+++ b/ocml/src/tanredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_PRIVATE(tanred)(half x, short i)
 {
     half s = x * x;
diff --git a/ocml/src/tgammaH.cl b/ocml/src/tgammaH.cl
index 07a72ef2..8ae01c2c 100644
--- a/ocml/src/tgammaH.cl
+++ b/ocml/src/tgammaH.cl
@@ -9,7 +9,7 @@
 
 UGEN(tgamma)
 
-INLINEATTR half
+half
 MATH_MANGLE(tgamma)(half x)
 {
     return (half)MATH_UPMANGLE(tgamma)((float)x);
diff --git a/ocml/src/trigpiredD.cl b/ocml/src/trigpiredD.cl
index 8411e57d..7bea3077 100644
--- a/ocml/src/trigpiredD.cl
+++ b/ocml/src/trigpiredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigpiredD.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigpired)(double x)
 {
     double t = 2.0 * BUILTIN_FRACTION_F64(0.5 * x);
diff --git a/ocml/src/trigpiredF.cl b/ocml/src/trigpiredF.cl
index 2f93312b..bcdc5727 100644
--- a/ocml/src/trigpiredF.cl
+++ b/ocml/src/trigpiredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigpiredF.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigpired)(float x)
 {
     float t = 2.0f * BUILTIN_FRACTION_F32(0.5f * x);
diff --git a/ocml/src/trigpiredH.cl b/ocml/src/trigpiredH.cl
index 7023567d..7615528f 100644
--- a/ocml/src/trigpiredH.cl
+++ b/ocml/src/trigpiredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigpiredH.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigpired)(half x)
 {
     half t = 2.0h * BUILTIN_FRACTION_F16(0.5h * x);
diff --git a/ocml/src/trigredD.cl b/ocml/src/trigredD.cl
index 76c78d4a..c9700fd8 100644
--- a/ocml/src/trigredD.cl
+++ b/ocml/src/trigredD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigred)(double x)
 {
     if (x < 0x1.0p+21)
diff --git a/ocml/src/trigredF.cl b/ocml/src/trigredF.cl
index 240eee20..20cbd39b 100644
--- a/ocml/src/trigredF.cl
+++ b/ocml/src/trigredF.cl
@@ -8,7 +8,7 @@
 #include "mathF.h"
 #include "trigredF.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigred)(float x)
 {
     if (x < SMALL_BOUND)
diff --git a/ocml/src/trigredH.cl b/ocml/src/trigredH.cl
index b69d38f9..ac75d51a 100644
--- a/ocml/src/trigredH.cl
+++ b/ocml/src/trigredH.cl
@@ -8,7 +8,7 @@
 #include "mathH.h"
 #include "trigredH.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigred)(half hx)
 {
     const float twobypi = 0x1.45f306p-1f;
diff --git a/ocml/src/trigredsmallD.cl b/ocml/src/trigredsmallD.cl
index fafe0fd8..0cac73ef 100644
--- a/ocml/src/trigredsmallD.cl
+++ b/ocml/src/trigredsmallD.cl
@@ -8,7 +8,7 @@
 #include "mathD.h"
 #include "trigredD.h"
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigredsmall)(double x)
 {
     const double twobypi = 0x1.45f306dc9c883p-1;
diff --git a/ocml/src/trigredsmallF.cl b/ocml/src/trigredsmallF.cl
index c56841b3..c93a2761 100644
--- a/ocml/src/trigredsmallF.cl
+++ b/ocml/src/trigredsmallF.cl
@@ -91,7 +91,7 @@ fma_reduce(float x)
     return ret;
 }
 
-CONSTATTR INLINEATTR struct redret
+CONSTATTR struct redret
 MATH_PRIVATE(trigredsmall)(float x)
 {
     if (HAVE_FAST_FMA32()) {
diff --git a/ocml/src/truncD.cl b/ocml/src/truncD.cl
index 91810829..b1ae0417 100644
--- a/ocml/src/truncD.cl
+++ b/ocml/src/truncD.cl
@@ -7,7 +7,7 @@
 
 #include "mathD.h"
 
-CONSTATTR INLINEATTR double
+CONSTATTR double
 MATH_MANGLE(trunc)(double x)
 {
     return BUILTIN_TRUNC_F64(x);
diff --git a/ocml/src/truncF.cl b/ocml/src/truncF.cl
index 88ee87da..3d279363 100644
--- a/ocml/src/truncF.cl
+++ b/ocml/src/truncF.cl
@@ -7,7 +7,7 @@
 
 #include "mathF.h"
 
-CONSTATTR INLINEATTR float
+CONSTATTR float
 MATH_MANGLE(trunc)(float x)
 {
     return BUILTIN_TRUNC_F32(x);
diff --git a/ocml/src/truncH.cl b/ocml/src/truncH.cl
index 77292247..6787af80 100644
--- a/ocml/src/truncH.cl
+++ b/ocml/src/truncH.cl
@@ -7,13 +7,13 @@
 
 #include "mathH.h"
 
-CONSTATTR INLINEATTR half2
+CONSTATTR half2
 MATH_MANGLE2(trunc)(half2 x)
 {
     return BUILTIN_TRUNC_2F16(x);
 }
 
-CONSTATTR INLINEATTR half
+CONSTATTR half
 MATH_MANGLE(trunc)(half x)
 {
     return BUILTIN_TRUNC_F16(x);
diff --git a/ocml/src/y0H.cl b/ocml/src/y0H.cl
index b2a81454..c187f45a 100644
--- a/ocml/src/y0H.cl
+++ b/ocml/src/y0H.cl
@@ -9,7 +9,7 @@
 
 UGEN(y0)
 
-INLINEATTR half
+half
 MATH_MANGLE(y0)(half x)
 {
     return (half)MATH_UPMANGLE(y0)((float)x);
diff --git a/ocml/src/y1H.cl b/ocml/src/y1H.cl
index 0c4197f0..a09ad9ef 100644
--- a/ocml/src/y1H.cl
+++ b/ocml/src/y1H.cl
@@ -9,7 +9,7 @@
 
 UGEN(y1)
 
-INLINEATTR half
+half
 MATH_MANGLE(y1)(half x)
 {
     return (half)MATH_UPMANGLE(y1)((float)x);

From a6f6461392b4cde5edeebd0b9fcf02c005fc7f11 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
Date: Mon, 31 Jul 2017 13:39:05 -0700
Subject: [PATCH 09/25] Add relaxed math attributes to all functions

Set relaxed math attributes on every function in the library module. This
does not mean the library module was built with those relaxations; it only
marks the module as compatible with the relaxations that may be used for the
kernel module. Setting them keeps a caller function from losing these
attributes, so the caller retains its original attributes.

Change-Id: I45dcb6e05e6e92ebc916ba7cebcb0ca7a0de2502
---
 utils/prepare-builtins/prepare-builtins.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp
index b1145363..5d5009bf 100644
--- a/utils/prepare-builtins/prepare-builtins.cpp
+++ b/utils/prepare-builtins/prepare-builtins.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
@@ -107,6 +108,18 @@ int main(int argc, char **argv) {
     }
   }
 
+  // Set relaxed math attributes on every function. This does not mean the
+  // library module was built with those relaxations; it only marks the module
+  // as compatible with relaxations the kernel module may use. Setting them
+  // keeps a caller function from losing them, so it retains its attributes.
+  AttrBuilder B;
+  B.addAttribute("less-precise-fpmad", "true");
+  B.addAttribute("no-infs-fp-math", "true");
+  B.addAttribute("no-nans-fp-math", "true");
+  B.addAttribute("unsafe-fp-math", "true");
+  for (Function &F : M->functions()) {
+    F.addAttributes(AttributeList::FunctionIndex, B);
+  }
 
   if (OutputFilename.empty()) {
     errs() << "no output file\n";

From 3e18a4562dfa01b4092f1bf4cde0875b530b4d3b Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Tue, 8 Aug 2017 08:05:18 -0700
Subject: [PATCH 10/25] Implement explicitly rounded basic operations

Change-Id: Iae43c85d8e6f071674d235247ff4390dc0a94789
---
 irif/inc/irif.h     |  74 +++++++++
 irif/src/rounded.ll | 393 ++++++++++++++++++++++++++++++++++++++++++++
 ocml/src/addD.cl    |  22 +--
 ocml/src/addF.cl    |  28 +---
 ocml/src/addH.cl    |  22 +--
 ocml/src/builtins.h |  78 +++++++++
 ocml/src/divD.cl    |  22 +--
 ocml/src/divF.cl    |  28 +---
 ocml/src/divH.cl    |  22 +--
 ocml/src/fmaD.cl    |  22 +--
 ocml/src/fmaF.cl    |  29 +---
 ocml/src/fmaH.cl    |  22 +--
 ocml/src/mulD.cl    |  22 +--
 ocml/src/mulF.cl    |  28 +---
 ocml/src/mulH.cl    |  22 +--
 ocml/src/sqrtD.cl   |  22 +--
 ocml/src/sqrtF.cl   |  28 +---
 ocml/src/sqrtH.cl   |  22 +--
 ocml/src/subD.cl    |  22 +--
 ocml/src/subF.cl    |  28 +---
 ocml/src/subH.cl    |  22 +--
 21 files changed, 690 insertions(+), 288 deletions(-)
 create mode 100644 irif/src/rounded.ll

diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index ec91d9f1..2205d466 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -197,6 +197,80 @@ extern ulong __llvm_cmpxchg_a1_x_x_dev_i64(__global ulong *, ulong, ulong);
 extern uint __llvm_cmpxchg_a3_x_x_wg_i32(__local uint *, uint, uint);
 extern ulong __llvm_cmpxchg_a3_x_x_wg_i64(__local ulong *, ulong, ulong);
 
+// Constrained floating point
+extern __attribute__((const)) half __llvm_add_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_add_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_add_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_add_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_add_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_add_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_sub_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_sub_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_sub_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_sub_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_sub_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_sub_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_mul_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_mul_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_mul_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_mul_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_mul_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_mul_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_div_rte_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtn_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtp_f16(half, half);
+extern __attribute__((const)) half __llvm_div_rtz_f16(half, half);
+extern __attribute__((const)) float __llvm_div_rte_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtn_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtp_f32(float, float);
+extern __attribute__((const)) float __llvm_div_rtz_f32(float, float);
+extern __attribute__((const)) double __llvm_div_rte_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtn_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtp_f64(double, double);
+extern __attribute__((const)) double __llvm_div_rtz_f64(double, double);
+extern __attribute__((const)) half __llvm_sqrt_rte_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtn_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtp_f16(half);
+extern __attribute__((const)) half __llvm_sqrt_rtz_f16(half);
+extern __attribute__((const)) float __llvm_sqrt_rte_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtn_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtp_f32(float);
+extern __attribute__((const)) float __llvm_sqrt_rtz_f32(float);
+extern __attribute__((const)) double __llvm_sqrt_rte_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtn_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtp_f64(double);
+extern __attribute__((const)) double __llvm_sqrt_rtz_f64(double);
+extern __attribute__((const)) half __llvm_fma_rte_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtn_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtp_f16(half, half, half);
+extern __attribute__((const)) half __llvm_fma_rtz_f16(half, half, half);
+extern __attribute__((const)) float __llvm_fma_rte_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtn_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtp_f32(float, float, float);
+extern __attribute__((const)) float __llvm_fma_rtz_f32(float, float, float);
+extern __attribute__((const)) double __llvm_fma_rte_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtn_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtp_f64(double, double, double);
+extern __attribute__((const)) double __llvm_fma_rtz_f64(double, double, double);
+
 // AMDGPU intrinsics
 extern __attribute__((const)) bool __llvm_amdgcn_class_f16(half, int) __asm("llvm.amdgcn.class.f16");
 extern __attribute__((const)) bool __llvm_amdgcn_class_f32(float, int) __asm("llvm.amdgcn.class.f32");
diff --git a/irif/src/rounded.ll b/irif/src/rounded.ll
new file mode 100644
index 00000000..80b0082b
--- /dev/null
+++ b/irif/src/rounded.ll
@@ -0,0 +1,393 @@
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn-amd-amdhsa-opencl"
+
+;;;;; Add ;;;;;
+define half @__llvm_add_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_add_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_add_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_add_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_add_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_add_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Sub ;;;;;
+define half @__llvm_sub_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_sub_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_sub_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_sub_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_sub_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_sub_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Mul ;;;;;
+define half @__llvm_mul_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_mul_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_mul_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_mul_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_mul_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_mul_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Div ;;;;;
+define half @__llvm_div_rte_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtn_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtp_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define half @__llvm_div_rtz_f16(half, half) local_unnamed_addr #0 {
+  %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %3
+}
+
+define float @__llvm_div_rte_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtn_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtp_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define float @__llvm_div_rtz_f32(float, float) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %3
+}
+
+define double @__llvm_div_rte_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtn_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtp_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+define double @__llvm_div_rtz_f64(double, double) local_unnamed_addr #0 {
+  %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %3
+}
+
+;;;;; Sqrt ;;;;;
+define half @__llvm_sqrt_rte_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtn_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtp_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define half @__llvm_sqrt_rtz_f16(half) local_unnamed_addr #0 {
+  %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %2
+}
+
+define float @__llvm_sqrt_rte_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtn_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtp_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define float @__llvm_sqrt_rtz_f32(float) local_unnamed_addr #0 {
+  %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %2
+}
+
+define double @__llvm_sqrt_rte_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtn_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtp_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+define double @__llvm_sqrt_rtz_f64(double) local_unnamed_addr #0 {
+  %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0,  metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %2
+}
+
+;;;;; Fma ;;;;;
+define half @__llvm_fma_rte_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtn_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtp_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define half @__llvm_fma_rtz_f16(half, half, half) local_unnamed_addr #0 {
+  %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret half %4
+}
+
+define float @__llvm_fma_rte_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtn_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtp_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define float @__llvm_fma_rtz_f32(float, float, float) local_unnamed_addr #0 {
+  %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret float %4
+}
+
+define double @__llvm_fma_rte_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtn_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.downward", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtp_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.upward", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+define double @__llvm_fma_rtz_f64(double, double, double) local_unnamed_addr #0 {
+  %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1
+  ret double %4
+}
+
+declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) local_unnamed_addr #1
+declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) local_unnamed_addr #1
+
+declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) local_unnamed_addr #1
+declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) local_unnamed_addr #1
+
+declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) local_unnamed_addr #1
+declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) local_unnamed_addr #1
+
+attributes #0 = { alwaysinline nounwind readnone }
+attributes #1 = { nounwind readnone }
+
diff --git a/ocml/src/addD.cl b/ocml/src/addD.cl
index 1cb4d527..7a96f339 100644
--- a/ocml/src/addD.cl
+++ b/ocml/src/addD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fadd, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/addF.cl b/ocml/src/addF.cl
index d01f0c28..95debe18 100644
--- a/ocml/src/addF.cl
+++ b/ocml/src/addF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(faddf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(faddf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/addH.cl b/ocml/src/addH.cl
index 8df6e2fa..e77e7a0a 100644
--- a/ocml/src/addH.cl
+++ b/ocml/src/addH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(faddh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(add_rte, ROUND_TO_NEAREST_EVEN)
-GEN(add_rtp, ROUND_TO_POSINF)
-GEN(add_rtn, ROUND_TO_NEGINF)
-GEN(add_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(add_rte,ADD_RTE)
+GEN(add_rtn,ADD_RTN)
+GEN(add_rtp,ADD_RTP)
+GEN(add_rtz,ADD_RTZ)
 
diff --git a/ocml/src/builtins.h b/ocml/src/builtins.h
index 337c1d0a..2d5f6565 100644
--- a/ocml/src/builtins.h
+++ b/ocml/src/builtins.h
@@ -222,3 +222,81 @@
 #define BUILTIN_CLAMP_F32(X,L,H) __llvm_amdgcn_fmed3_f32(X,L,H)
 #define BUILTIN_CLAMP_F16(X,L,H) __llvm_amdgcn_fmed3_f16(X,L,H)
 
+#define BUILTIN_ADD_RTE_F32 __llvm_add_rte_f32
+#define BUILTIN_ADD_RTE_F64 __llvm_add_rte_f64
+#define BUILTIN_ADD_RTE_F16 __llvm_add_rte_f16
+#define BUILTIN_ADD_RTN_F32 __llvm_add_rtn_f32
+#define BUILTIN_ADD_RTN_F64 __llvm_add_rtn_f64
+#define BUILTIN_ADD_RTN_F16 __llvm_add_rtn_f16
+#define BUILTIN_ADD_RTP_F32 __llvm_add_rtp_f32
+#define BUILTIN_ADD_RTP_F64 __llvm_add_rtp_f64
+#define BUILTIN_ADD_RTP_F16 __llvm_add_rtp_f16
+#define BUILTIN_ADD_RTZ_F32 __llvm_add_rtz_f32
+#define BUILTIN_ADD_RTZ_F64 __llvm_add_rtz_f64
+#define BUILTIN_ADD_RTZ_F16 __llvm_add_rtz_f16
+
+#define BUILTIN_SUB_RTE_F32 __llvm_sub_rte_f32
+#define BUILTIN_SUB_RTE_F64 __llvm_sub_rte_f64
+#define BUILTIN_SUB_RTE_F16 __llvm_sub_rte_f16
+#define BUILTIN_SUB_RTN_F32 __llvm_sub_rtn_f32
+#define BUILTIN_SUB_RTN_F64 __llvm_sub_rtn_f64
+#define BUILTIN_SUB_RTN_F16 __llvm_sub_rtn_f16
+#define BUILTIN_SUB_RTP_F32 __llvm_sub_rtp_f32
+#define BUILTIN_SUB_RTP_F64 __llvm_sub_rtp_f64
+#define BUILTIN_SUB_RTP_F16 __llvm_sub_rtp_f16
+#define BUILTIN_SUB_RTZ_F32 __llvm_sub_rtz_f32
+#define BUILTIN_SUB_RTZ_F64 __llvm_sub_rtz_f64
+#define BUILTIN_SUB_RTZ_F16 __llvm_sub_rtz_f16
+
+#define BUILTIN_MUL_RTE_F32 __llvm_mul_rte_f32
+#define BUILTIN_MUL_RTE_F64 __llvm_mul_rte_f64
+#define BUILTIN_MUL_RTE_F16 __llvm_mul_rte_f16
+#define BUILTIN_MUL_RTN_F32 __llvm_mul_rtn_f32
+#define BUILTIN_MUL_RTN_F64 __llvm_mul_rtn_f64
+#define BUILTIN_MUL_RTN_F16 __llvm_mul_rtn_f16
+#define BUILTIN_MUL_RTP_F32 __llvm_mul_rtp_f32
+#define BUILTIN_MUL_RTP_F64 __llvm_mul_rtp_f64
+#define BUILTIN_MUL_RTP_F16 __llvm_mul_rtp_f16
+#define BUILTIN_MUL_RTZ_F32 __llvm_mul_rtz_f32
+#define BUILTIN_MUL_RTZ_F64 __llvm_mul_rtz_f64
+#define BUILTIN_MUL_RTZ_F16 __llvm_mul_rtz_f16
+
+#define BUILTIN_DIV_RTE_F32 __llvm_div_rte_f32
+#define BUILTIN_DIV_RTE_F64 __llvm_div_rte_f64
+#define BUILTIN_DIV_RTE_F16 __llvm_div_rte_f16
+#define BUILTIN_DIV_RTN_F32 __llvm_div_rtn_f32
+#define BUILTIN_DIV_RTN_F64 __llvm_div_rtn_f64
+#define BUILTIN_DIV_RTN_F16 __llvm_div_rtn_f16
+#define BUILTIN_DIV_RTP_F32 __llvm_div_rtp_f32
+#define BUILTIN_DIV_RTP_F64 __llvm_div_rtp_f64
+#define BUILTIN_DIV_RTP_F16 __llvm_div_rtp_f16
+#define BUILTIN_DIV_RTZ_F32 __llvm_div_rtz_f32
+#define BUILTIN_DIV_RTZ_F64 __llvm_div_rtz_f64
+#define BUILTIN_DIV_RTZ_F16 __llvm_div_rtz_f16
+
+#define BUILTIN_SQRT_RTE_F32 __llvm_sqrt_rte_f32
+#define BUILTIN_SQRT_RTE_F64 __llvm_sqrt_rte_f64
+#define BUILTIN_SQRT_RTE_F16 __llvm_sqrt_rte_f16
+#define BUILTIN_SQRT_RTN_F32 __llvm_sqrt_rtn_f32
+#define BUILTIN_SQRT_RTN_F64 __llvm_sqrt_rtn_f64
+#define BUILTIN_SQRT_RTN_F16 __llvm_sqrt_rtn_f16
+#define BUILTIN_SQRT_RTP_F32 __llvm_sqrt_rtp_f32
+#define BUILTIN_SQRT_RTP_F64 __llvm_sqrt_rtp_f64
+#define BUILTIN_SQRT_RTP_F16 __llvm_sqrt_rtp_f16
+#define BUILTIN_SQRT_RTZ_F32 __llvm_sqrt_rtz_f32
+#define BUILTIN_SQRT_RTZ_F64 __llvm_sqrt_rtz_f64
+#define BUILTIN_SQRT_RTZ_F16 __llvm_sqrt_rtz_f16
+
+#define BUILTIN_FMA_RTE_F32 __llvm_fma_rte_f32
+#define BUILTIN_FMA_RTE_F64 __llvm_fma_rte_f64
+#define BUILTIN_FMA_RTE_F16 __llvm_fma_rte_f16
+#define BUILTIN_FMA_RTN_F32 __llvm_fma_rtn_f32
+#define BUILTIN_FMA_RTN_F64 __llvm_fma_rtn_f64
+#define BUILTIN_FMA_RTN_F16 __llvm_fma_rtn_f16
+#define BUILTIN_FMA_RTP_F32 __llvm_fma_rtp_f32
+#define BUILTIN_FMA_RTP_F64 __llvm_fma_rtp_f64
+#define BUILTIN_FMA_RTP_F16 __llvm_fma_rtp_f16
+#define BUILTIN_FMA_RTZ_F32 __llvm_fma_rtz_f32
+#define BUILTIN_FMA_RTZ_F64 __llvm_fma_rtz_f64
+#define BUILTIN_FMA_RTZ_F16 __llvm_fma_rtz_f16
+
diff --git a/ocml/src/divD.cl b/ocml/src/divD.cl
index eed7cbb7..ad7af822 100644
--- a/ocml/src/divD.cl
+++ b/ocml/src/divD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fdiv, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/divF.cl b/ocml/src/divF.cl
index e0c7b961..ce9519ab 100644
--- a/ocml/src/divF.cl
+++ b/ocml/src/divF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fdivf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fdivf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/divH.cl b/ocml/src/divH.cl
index 927784b1..3a7d17d3 100644
--- a/ocml/src/divH.cl
+++ b/ocml/src/divH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fdivh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(div_rte, ROUND_TO_NEAREST_EVEN)
-GEN(div_rtp, ROUND_TO_POSINF)
-GEN(div_rtn, ROUND_TO_NEGINF)
-GEN(div_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(div_rte,DIV_RTE)
+GEN(div_rtn,DIV_RTN)
+GEN(div_rtp,DIV_RTP)
+GEN(div_rtz,DIV_RTZ)
 
diff --git a/ocml/src/fmaD.cl b/ocml/src/fmaD.cl
index 61a47ea5..0a526fe8 100644
--- a/ocml/src/fmaD.cl
+++ b/ocml/src/fmaD.cl
@@ -13,21 +13,15 @@ MATH_MANGLE(fma)(double a, double b, double c)
     return BUILTIN_FMA_F64(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double a, double b, double c) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double a, double b, double c) \
 { \
-    return BUILTIN_FULL_TERNARY(ffma, false, ROUND, a, b, c); \
+    return BUILTIN_##UN##_F64(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/fmaF.cl b/ocml/src/fmaF.cl
index bf45492c..052acae0 100644
--- a/ocml/src/fmaF.cl
+++ b/ocml/src/fmaF.cl
@@ -5,6 +5,7 @@
  * License. See LICENSE.TXT for details.
  *===------------------------------------------------------------------------*/
 
+#include "irif.h"
 #include "mathF.h"
 
 CONSTATTR float
@@ -13,27 +14,15 @@ MATH_MANGLE(fma)(float a, float b, float c)
     return BUILTIN_FMA_F32(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float a, float b, float c) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float a, float b, float c) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_TERNARY(ffmaf, true, ROUND, a, b, c); \
-    } else { \
-        ret = BUILTIN_FULL_TERNARY(ffmaf, false, ROUND, a, b, c); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/fmaH.cl b/ocml/src/fmaH.cl
index 98320551..03bacf72 100644
--- a/ocml/src/fmaH.cl
+++ b/ocml/src/fmaH.cl
@@ -19,21 +19,15 @@ MATH_MANGLE(fma)(half a, half b, half c)
     return BUILTIN_FMA_F16(a, b, c);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half a, half b, half c) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half a, half b, half c) \
 { \
-    return BUILTIN_FULL_TERNARY(ffmah, false, ROUND, a, b, c); \
+    return BUILTIN_##UN##_F16(a, b, c); \
 }
 
-GEN(fma_rte, ROUND_TO_NEAREST_EVEN)
-GEN(fma_rtp, ROUND_TO_POSINF)
-GEN(fma_rtn, ROUND_TO_NEGINF)
-GEN(fma_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(fma_rte,FMA_RTE)
+GEN(fma_rtn,FMA_RTN)
+GEN(fma_rtp,FMA_RTP)
+GEN(fma_rtz,FMA_RTZ)
 
diff --git a/ocml/src/mulD.cl b/ocml/src/mulD.cl
index 6d7e296f..05c8aae6 100644
--- a/ocml/src/mulD.cl
+++ b/ocml/src/mulD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fmul, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/mulF.cl b/ocml/src/mulF.cl
index ace8b656..4a4e4da0 100644
--- a/ocml/src/mulF.cl
+++ b/ocml/src/mulF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fmulf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fmulf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/mulH.cl b/ocml/src/mulH.cl
index 2cb52876..9d738867 100644
--- a/ocml/src/mulH.cl
+++ b/ocml/src/mulH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fmulh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(mul_rte, ROUND_TO_NEAREST_EVEN)
-GEN(mul_rtp, ROUND_TO_POSINF)
-GEN(mul_rtn, ROUND_TO_NEGINF)
-GEN(mul_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(mul_rte,MUL_RTE)
+GEN(mul_rtn,MUL_RTN)
+GEN(mul_rtp,MUL_RTP)
+GEN(mul_rtz,MUL_RTZ)
 
diff --git a/ocml/src/sqrtD.cl b/ocml/src/sqrtD.cl
index e2a6fedf..6f484fab 100644
--- a/ocml/src/sqrtD.cl
+++ b/ocml/src/sqrtD.cl
@@ -13,21 +13,15 @@ MATH_MANGLE(sqrt)(double x)
     return MATH_SQRT(x);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double x) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double x) \
 { \
-    return BUILTIN_FULL_UNARY(fsqrt, false, ROUND, x); \
+    return BUILTIN_##UN##_F64(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/sqrtF.cl b/ocml/src/sqrtF.cl
index 99b32c25..051e73b6 100644
--- a/ocml/src/sqrtF.cl
+++ b/ocml/src/sqrtF.cl
@@ -17,27 +17,15 @@ MATH_MANGLE(sqrt)(float x)
     }
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float x) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float x) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_UNARY(fsqrtf, true, ROUND, x); \
-    } else { \
-        ret = BUILTIN_FULL_UNARY(fsqrtf, false, ROUND, x); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/sqrtH.cl b/ocml/src/sqrtH.cl
index 321003f9..b4488e4e 100644
--- a/ocml/src/sqrtH.cl
+++ b/ocml/src/sqrtH.cl
@@ -15,21 +15,15 @@ MATH_MANGLE(sqrt)(half x)
     return BUILTIN_SQRT_F16(x);
 }
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half x) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half x) \
 { \
-    return BUILTIN_FULL_UNARY(fsqrth, false, ROUND, x); \
+    return BUILTIN_##UN##_F16(x); \
 }
 
-GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sqrt_rtp, ROUND_TO_POSINF)
-GEN(sqrt_rtn, ROUND_TO_NEGINF)
-GEN(sqrt_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sqrt_rte,SQRT_RTE)
+GEN(sqrt_rtn,SQRT_RTN)
+GEN(sqrt_rtp,SQRT_RTP)
+GEN(sqrt_rtz,SQRT_RTZ)
 
diff --git a/ocml/src/subD.cl b/ocml/src/subD.cl
index a9f4d3f1..f6c9a92b 100644
--- a/ocml/src/subD.cl
+++ b/ocml/src/subD.cl
@@ -7,21 +7,15 @@
 
 #include "mathD.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR double \
-MATH_MANGLE(NAME)(double x, double y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR double \
+MATH_MANGLE(LN)(double x, double y) \
 { \
-    return BUILTIN_FULL_BINARY(fsub, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F64(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 
diff --git a/ocml/src/subF.cl b/ocml/src/subF.cl
index a8a4f2dd..80d7d3c7 100644
--- a/ocml/src/subF.cl
+++ b/ocml/src/subF.cl
@@ -7,27 +7,15 @@
 
 #include "mathF.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR float \
-MATH_MANGLE(NAME)(float x, float y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR float \
+MATH_MANGLE(LN)(float x, float y) \
 { \
-    float ret; \
-    if (DAZ_OPT()) { \
-        ret = BUILTIN_FULL_BINARY(fsubf, true, ROUND, x, y); \
-    } else { \
-        ret = BUILTIN_FULL_BINARY(fsubf, false, ROUND, x, y); \
-    } \
-    return ret; \
+    return BUILTIN_##UN##_F32(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 
diff --git a/ocml/src/subH.cl b/ocml/src/subH.cl
index 054c46d2..369792e3 100644
--- a/ocml/src/subH.cl
+++ b/ocml/src/subH.cl
@@ -7,21 +7,15 @@
 
 #include "mathH.h"
 
-#if defined ENABLE_ROUNDED
-#if defined HSAIL_BUILD
-
-#define GEN(NAME,ROUND)\
-CONSTATTR half \
-MATH_MANGLE(NAME)(half x, half y) \
+#define GEN(LN,UN) \
+CONSTATTR INLINEATTR half \
+MATH_MANGLE(LN)(half x, half y) \
 { \
-    return BUILTIN_FULL_BINARY(fsubh, false, ROUND, x, y); \
+    return BUILTIN_##UN##_F16(x, y); \
 }
 
-GEN(sub_rte, ROUND_TO_NEAREST_EVEN)
-GEN(sub_rtp, ROUND_TO_POSINF)
-GEN(sub_rtn, ROUND_TO_NEGINF)
-GEN(sub_rtz, ROUND_TO_ZERO)
-
-#endif // HSAIL_BUILD
-#endif // ENABLE_ROUNDED
+GEN(sub_rte,SUB_RTE)
+GEN(sub_rtn,SUB_RTN)
+GEN(sub_rtp,SUB_RTP)
+GEN(sub_rtz,SUB_RTZ)
 

From 063b0239cc9280c39c002bb4530da02ce8694f41 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Thu, 10 Aug 2017 10:55:34 -0700
Subject: [PATCH 11/25] Update fast fma test

Change-Id: I54793aa1f15da105cd721a27438f19490e0076e4
---
 ocml/src/opts.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ocml/src/opts.h b/ocml/src/opts.h
index c6bb1146..3a07cbc2 100644
--- a/ocml/src/opts.h
+++ b/ocml/src/opts.h
@@ -7,7 +7,7 @@
 
 #include "oclc.h"
 
-#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801)
+#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801 || __oclc_ISA_version() >= 900)
 #define FINITE_ONLY_OPT() __oclc_finite_only_opt()
 #define UNSAFE_MATH_OPT() __oclc_unsafe_math_opt()
 #define DAZ_OPT() __oclc_daz_opt()

From 1bbab71850408abadffc217dcb71cf23f88effe4 Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung" <jack.chung@amd.com>
Date: Fri, 11 Aug 2017 14:13:33 -0500
Subject: [PATCH 12/25] Fix CMake for HCC build

CMAKE_SOURCE_DIR has been changed after recent changes in
HCC. Use
CMAKE_CURRENT_SOURCE_DIR to properly locate the source codes of ROCDL.

Change-Id: If943fa094c9c0b4bd8d5b6dba9e22706759a83bf
---
 hc/CMakeLists.txt     | 2 +-
 irif/CMakeLists.txt   | 2 +-
 opencl/CMakeLists.txt | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hc/CMakeLists.txt b/hc/CMakeLists.txt
index d8d0c0d3..e7440e4f 100644
--- a/hc/CMakeLists.txt
+++ b/hc/CMakeLists.txt
@@ -24,7 +24,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs
diff --git a/irif/CMakeLists.txt b/irif/CMakeLists.txt
index 12b0397d..883240f6 100644
--- a/irif/CMakeLists.txt
+++ b/irif/CMakeLists.txt
@@ -20,7 +20,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB srcs
diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt
index ed78ec85..5b707605 100644
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@@ -36,7 +36,7 @@ if (GENERIC_IS_ZERO)
   endforeach(f)
 
   # Perform transformation
-  execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/utils"
+  execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils"
                   WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
   file(GLOB ll_srcs

From 1fbc67a919dc67d591db0846600403ea6316ab94 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Fri, 11 Aug 2017 13:53:13 -0700
Subject: [PATCH 13/25] Switch to 64-bit intrinsics

Change-Id: I6119d87f9ed44629823a7dd88c78da340893688a
---
 irif/inc/irif.h                |  4 ++--
 ockl/inc/ockl.h                |  5 +++++
 ockl/src/clz.cl                |  7 +++++++
 ockl/src/ctz.cl                |  7 +++++++
 ockl/src/popcount.cl           |  6 ++++++
 opencl/src/integer/clz.cl      | 17 ++++++++---------
 opencl/src/integer/ctz.cl      | 17 ++++++++---------
 opencl/src/integer/popcount.cl |  6 ++----
 8 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index 2205d466..20fb3308 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -121,10 +121,10 @@ extern bool __llvm_smul_with_overflow_i64(long, long, __private long*);
 extern bool __llvm_umul_with_overflow_i64(ulong, ulong, __private ulong*);
 
 extern __attribute__((const)) int __llvm_ctlz_i32(int);
-extern __attribute__((const)) int __llvm_ctlz_i64(long);
+extern __attribute__((const)) long __llvm_ctlz_i64(long);
 
 extern __attribute__((const)) int __llvm_cttz_i32(int);
-extern __attribute__((const)) int __llvm_cttz_i64(long);
+extern __attribute__((const)) long __llvm_cttz_i64(long);
 
 // Fence intrinsics
 extern void __llvm_fence_acq_wi(void);
diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h
index 0cba8e0f..10e4c3a1 100644
--- a/ockl/inc/ockl.h
+++ b/ockl/inc/ockl.h
@@ -103,8 +103,13 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 DECL_CONST_OCKL_UNARY_U32(clz)
+DECL_CONST_OCKL_UNARY_U64(clz)
+
 DECL_CONST_OCKL_UNARY_U32(ctz)
+DECL_CONST_OCKL_UNARY_U64(ctz)
+
 DECL_CONST_OCKL_UNARY_U32(popcount)
+DECL_CONST_OCKL_UNARY_U64(popcount)
 
 DECL_CONST_OCKL_BINARY_I32(add_sat)
 DECL_CONST_OCKL_BINARY_U32(add_sat)
diff --git a/ockl/src/clz.cl b/ockl/src/clz.cl
index 593b4fcf..fdc219b5 100644
--- a/ockl/src/clz.cl
+++ b/ockl/src/clz.cl
@@ -15,3 +15,10 @@ OCKL_MANGLE_U32(clz)(uint i)
     return i ? r : 32u;
 }
 
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(clz)(ulong i)
+{
+    ulong r = (ulong)__llvm_ctlz_i64((long)i);
+    return i ? r : 64ul;
+}
+
diff --git a/ockl/src/ctz.cl b/ockl/src/ctz.cl
index 72de58cf..f927f2a5 100644
--- a/ockl/src/ctz.cl
+++ b/ockl/src/ctz.cl
@@ -15,3 +15,10 @@ OCKL_MANGLE_U32(ctz)(uint i)
     return i ? r : 32u;
 }
 
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(ctz)(ulong i)
+{
+    ulong r = (ulong)__llvm_cttz_i64((long)i);
+    return i ? r : 64ul;
+}
+
diff --git a/ockl/src/popcount.cl b/ockl/src/popcount.cl
index 77212f17..b6404022 100644
--- a/ockl/src/popcount.cl
+++ b/ockl/src/popcount.cl
@@ -14,3 +14,9 @@ OCKL_MANGLE_U32(popcount)(uint i)
     return (uint)__llvm_ctpop_i32((int)i);
 }
 
+__attribute__((always_inline, const)) ulong
+OCKL_MANGLE_U64(popcount)(ulong i)
+{
+    return (ulong)__llvm_ctpop_i64((long)i);
+}
+
diff --git a/opencl/src/integer/clz.cl b/opencl/src/integer/clz.cl
index f24a648a..7719641b 100644
--- a/opencl/src/integer/clz.cl
+++ b/opencl/src/integer/clz.cl
@@ -61,16 +61,15 @@ clz(uint x)
     return __ockl_clz_u32(x);
 }
 
-__attribute__((always_inline, const)) static ulong
-clz_u64(ulong x)
+UEXPATTR long
+clz(long x)
 {
-    uint xlo = (uint)x;
-    uint xhi = (uint)(x >> 32);
-    uint zlo = __ockl_clz_u32(xlo) + 32u;
-    uint zhi = __ockl_clz_u32(xhi);
-    return (ulong)(xhi == 0 ? zlo : zhi);
+    return (long)__ockl_clz_u64((ulong)x);
 }
 
-extern __attribute__((overloadable, always_inline, const, alias("clz_u64"))) ulong clz(ulong);
-extern __attribute__((overloadable, always_inline, const, alias("clz_u64")))  long clz(long);
+UEXPATTR ulong
+clz(ulong x)
+{
+    return __ockl_clz_u64(x);
+}
 
diff --git a/opencl/src/integer/ctz.cl b/opencl/src/integer/ctz.cl
index d75fc386..1577ab50 100644
--- a/opencl/src/integer/ctz.cl
+++ b/opencl/src/integer/ctz.cl
@@ -57,16 +57,15 @@ ctz(uint x)
     return __ockl_ctz_u32(x);
 }
 
-__attribute__((always_inline, const)) static ulong
-ctz_u64(ulong x)
+UEXPATTR long
+ctz(long x)
 {
-    uint xlo = (uint)x;
-    uint xhi = (uint)(x >> 32);
-    uint zlo = __ockl_ctz_u32(xlo);
-    uint zhi = __ockl_ctz_u32(xhi) + 32u;
-    return (ulong)(xlo == 0 ? zhi : zlo);
+    return (long)__ockl_ctz_u64((ulong)x);
 }
 
-extern __attribute__((overloadable, always_inline, const, alias("ctz_u64"))) ulong ctz(ulong);
-extern __attribute__((overloadable, always_inline, const, alias("ctz_u64")))  long ctz(long);
+UEXPATTR ulong
+ctz(ulong x)
+{
+    return __ockl_ctz_u64(x);
+}
 
diff --git a/opencl/src/integer/popcount.cl b/opencl/src/integer/popcount.cl
index f40f32b1..53c525ad 100644
--- a/opencl/src/integer/popcount.cl
+++ b/opencl/src/integer/popcount.cl
@@ -57,14 +57,12 @@ popcount(uint x)
 UEXPATTR long
 popcount(long x)
 {
-    uint2 y = as_uint2(x);
-    return (long)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi));
+    return (long)__ockl_popcount_u64((ulong)x);
 }
 
 UEXPATTR ulong
 popcount(ulong x)
 {
-    uint2 y = as_uint2(x);
-    return (ulong)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi));
+    return __ockl_popcount_u64(x);
 }
 

From a57dee34eb478e660646929b1d1261577493d458 Mon Sep 17 00:00:00 2001
From: "Wen-Heng (Jack) Chung" <jack.chung@amd.com>
Date: Fri, 11 Aug 2017 18:26:25 -0500
Subject: [PATCH 14/25] Revise HC workitem indexing function implementation

- Make sure all functions return 32-bit unsigned integers.
- Optimize implementation of amp_get_local_size().
- Optimize implementation of amp_get_num_groups().

Change-Id: Ib27caa0030071bca338ab924fa405ad288dbc0c2
---
 hc/src/hc_kernel.cl | 114 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 91 insertions(+), 23 deletions(-)

diff --git a/hc/src/hc_kernel.cl b/hc/src/hc_kernel.cl
index 50e8e07d..3bbd1c77 100644
--- a/hc/src/hc_kernel.cl
+++ b/hc/src/hc_kernel.cl
@@ -5,7 +5,7 @@
 #define ATTR __attribute__((always_inline, const))
 #define ATTR2 __attribute__((always_inline))
 
-ATTR long
+ATTR uint
 amp_get_global_id(int dim)
 {
   __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr();
@@ -37,70 +37,138 @@ amp_get_global_id(int dim)
   return (g*s + l);
 }
 
-ATTR long
+ATTR uint
 amp_get_global_size(int dim)
 {
-  return __ockl_get_global_size(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr();
+
+    switch(dim) {
+    case 0:
+        return p->grid_size_x;
+    case 1:
+        return p->grid_size_y;
+    case 2:
+        return p->grid_size_z;
+    default:
+        return 1;
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_local_id(int dim)
 {
-  return __ockl_get_local_id(dim);
+    switch(dim) {
+    case 0:
+        return __llvm_amdgcn_workitem_id_x();
+    case 1:
+        return __llvm_amdgcn_workitem_id_y();
+    case 2:
+        return __llvm_amdgcn_workitem_id_z();
+    default:
+        return 0;
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_num_groups(int dim)
 {
-  return __ockl_get_num_groups(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr();
+
+    uint n, d;
+    switch(dim) {
+    case 0:
+        n = p->grid_size_x;
+        d = p->workgroup_size_x;
+        break;
+    case 1:
+        n = p->grid_size_y;
+        d = p->workgroup_size_y;
+        break;
+    case 2:
+        n = p->grid_size_z;
+        d = p->workgroup_size_z;
+        break;
+    default:
+        n = 1;
+        d = 1;
+        break;
+    }
+
+    return n / d;
 }
 
-ATTR long
+ATTR uint
 amp_get_group_id(int dim)
 {
-  return __ockl_get_group_id(dim);
+    switch(dim) {
+    case 0:
+        return __llvm_amdgcn_workgroup_id_x();
+    case 1:
+        return __llvm_amdgcn_workgroup_id_y();
+    case 2:
+        return __llvm_amdgcn_workgroup_id_z();
+    default:
+        return 0;
+    }
 }
 
-ATTR long
+ATTR uint
 amp_get_local_size(int dim)
 {
-  return __ockl_get_local_size(dim);
+    __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr();
+    uint d;
+
+    switch(dim) {
+    case 0:
+        d = p->workgroup_size_x;
+        break;
+    case 1:
+        d = p->workgroup_size_y;
+        break;
+    case 2:
+        d = p->workgroup_size_z;
+        break;
+    default:
+        d = 1;
+        break;
+    }
+    return d;
 }
 
-ATTR long
+ATTR uint
 hc_get_grid_size(int dim)
 {
-  return __ockl_get_global_size(dim);
+    return amp_get_global_size(dim);
 }
 
-ATTR long
+ATTR uint
 hc_get_workitem_absolute_id(int dim)
 {
-  return amp_get_global_id(dim);
+    return amp_get_global_id(dim);
 }
 
-ATTR long
+ATTR uint
 hc_get_workitem_id(int dim)
 {
-  return __ockl_get_local_id(dim);
+    return amp_get_local_id(dim);
 }
 
-ATTR long
+ATTR uint
 hc_get_num_groups(int dim)
 {
-  return __ockl_get_num_groups(dim);
+    return amp_get_num_groups(dim);
 }
 
-ATTR long
+ATTR uint
 hc_get_group_id(int dim)
 {
-  return __ockl_get_group_id(dim);
+    return amp_get_group_id(dim);
 }
 
-ATTR long
+ATTR uint
 hc_get_group_size(int dim)
 {
-  return __ockl_get_local_size(dim);
+    return amp_get_local_size(dim);
 }
 
 ATTR2 void

From 6d7336be6f7fe9552d34de14984d7270a556b1f1 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Thu, 17 Aug 2017 15:18:53 -0700
Subject: [PATCH 15/25] Add atomic support

Change-Id: I8d5eb3b052441664933e52114cfa3b8e7762c9b7
---
 opencl/src/misc/asqf.cl   |   6 +
 opencl/src/misc/atom.cl   | 390 ++++++++++++++++++++++++++++++++++++++
 opencl/src/misc/printf.cl |   6 +
 3 files changed, 402 insertions(+)
 create mode 100644 opencl/src/misc/atom.cl

diff --git a/opencl/src/misc/asqf.cl b/opencl/src/misc/asqf.cl
index 84ff26ed..d6a05968 100644
--- a/opencl/src/misc/asqf.cl
+++ b/opencl/src/misc/asqf.cl
@@ -1,3 +1,9 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
 
 #include "ockl.h"
 
diff --git a/opencl/src/misc/atom.cl b/opencl/src/misc/atom.cl
new file mode 100644
index 00000000..d9f9ab38
--- /dev/null
+++ b/opencl/src/misc/atom.cl
@@ -0,0 +1,390 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+
+#define ATTR __attribute__((overloadable, always_inline))
+
+#define AC_int(X) X
+#define AC_uint(X) X
+#define AC_long(X) X
+#define AC_ulong(X) X
+#define AC_intptr_t(X) X
+#define AC_uintptr_t(X) X
+#define AC_size_t(X) X
+#define AC_ptrdiff_t(X) X
+#define AC_float(X) as_int(X)
+#define AC_double(X) as_long(X)
+
+#define RC_int(X) X
+#define RC_uint(X) X
+#define RC_long(X) X
+#define RC_ulong(X) X
+#define RC_intptr_t(X) X
+#define RC_uintptr_t(X) X
+#define RC_size_t(X) X
+#define RC_ptrdiff_t(X) X
+#define RC_float(X) as_float(X)
+#define RC_double(X) as_double(X)
+
+#define PC_int
+#define PC_uint
+#define PC_long
+#define PC_ulong
+#define PC_intptr_t
+#define PC_uintptr_t
+#define PC_size_t
+#define PC_ptrdiff_t
+#define PC_float (volatile atomic_int *)
+#define PC_double (volatile atomic_long *)
+
+#define EC_int
+#define EC_uint
+#define EC_long
+#define EC_ulong
+#define EC_intptr_t
+#define EC_uintptr_t
+#define EC_size_t
+#define EC_ptrdiff_t
+#define EC_float (int *)
+#define EC_double (long *)
+
+#define OCL12_MEMORY_ORDER memory_order_relaxed
+#define OCL12_MEMORY_SCOPE memory_scope_device
+
+#define F_inc __opencl_atomic_fetch_add
+#define F_dec __opencl_atomic_fetch_sub
+
+// extension and 1.2 functions
+#define GEN1(T,A,O) \
+ATTR T \
+atom_##O(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define GEN2(T,A,O) \
+ATTR T \
+atomic_##O(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define OPSA(F,T,A) \
+    F(T,A,add) \
+    F(T,A,sub) \
+    F(T,A,max) \
+    F(T,A,min) \
+    F(T,A,and) \
+    F(T,A,or) \
+    F(T,A,xor)
+
+#define OPS(F,T) \
+    OPSA(F,T,)
+
+#define ALL() \
+    OPS(GEN1,int) \
+    OPS(GEN2,int) \
+    OPS(GEN1,uint) \
+    OPS(GEN2,uint) \
+    OPS(GEN1,long) \
+    OPS(GEN1,ulong)
+
+ALL()
+
+// Handle inc and dec
+#undef GEN1
+#undef GEN2
+#undef OPSA
+
+#define OPSA(F,T,A) \
+    F(T,A,inc) \
+    F(T,A,dec)
+
+
+#define GEN1(T,A,O) \
+ATTR T \
+atom_##O(volatile A T *p) \
+{ \
+    return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define GEN2(T,A,O) \
+ATTR T \
+atomic_##O(volatile A T *p) \
+{ \
+    return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+ALL()
+
+// Handle xchg
+#undef GEN1
+#undef GEN2
+#undef OPSA
+#undef OPS
+
+#define GEN1(T,A) \
+ATTR T \
+atom_xchg(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define GEN2(T,A) \
+ATTR T \
+atomic_xchg(volatile A T *p, T v) \
+{ \
+    return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+}
+
+#define OPS(F,T) \
+    F(T,)
+
+ALL()
+
+ATTR float
+atomic_xchg(volatile float *p, float v)
+{
+    return as_float(__opencl_atomic_exchange((volatile atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE));
+}
+
+// Handle cmpxchg
+#undef GEN1
+#undef GEN2
+
+#define GEN1(T,A) \
+ATTR T \
+atom_cmpxchg(volatile A T *p, T e, T d) \
+{ \
+    __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d,  OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+    return e; \
+}
+
+#define GEN2(T,A) \
+ATTR T \
+atomic_cmpxchg(volatile A T *p, T e, T d) \
+{ \
+    __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d,  OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \
+    return e; \
+}
+
+ALL()
+#undef GEN1
+#undef GEN2
+#undef ALL
+
+// 2.0 functions
+
+#define GENI(T) \
+ATTR void \
+atomic_init(volatile atomic_##T *p, T v) \
+{ \
+    __opencl_atomic_init(p, v); \
+}
+
+#define GENS(T) \
+ATTR void \
+atomic_store(volatile atomic_##T *p, T v) \
+{ \
+    __opencl_atomic_store(p, v, memory_order_seq_cst, memory_scope_device); \
+} \
+ \
+ATTR void \
+atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    __opencl_atomic_store(p, v, o, memory_scope_device); \
+} \
+ \
+ATTR void \
+atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    __opencl_atomic_store(p, v, o, s); \
+}
+
+#define GENL(T) \
+ATTR T \
+atomic_load(volatile atomic_##T *p) \
+{ \
+    return __opencl_atomic_load(p, memory_order_seq_cst, memory_scope_device); \
+} \
+ \
+ATTR T \
+atomic_load_explicit(volatile atomic_##T *p, memory_order o) \
+{ \
+    return __opencl_atomic_load(p, o, memory_scope_device); \
+} \
+ \
+ATTR T \
+atomic_load_explicit(volatile atomic_##T *p, memory_order o, memory_scope s) \
+{ \
+    return __opencl_atomic_load(p, o, s); \
+}
+
+#define GENX(T) \
+ATTR T \
+atomic_exchange(volatile atomic_##T *p, T v) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
+} \
+ \
+ATTR T \
+atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, memory_scope_device)); \
+} \
+ \
+ATTR T \
+atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, s)); \
+}
+
+#define GENCX(T,K) \
+ATTR bool \
+atomic_compare_exchange_##K(volatile atomic_##T *p, T *e, T d) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \
+} \
+ \
+ATTR bool \
+atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, memory_scope_device); \
+} \
+ \
+ATTR bool \
+atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of, memory_scope s) \
+{ \
+    return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, s); \
+}
+
+#define GENFO(T,O) \
+ATTR T \
+atomic_fetch_##O(volatile atomic_##T *p, T v) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \
+} \
+ \
+ATTR T \
+atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, memory_scope_device)); \
+} \
+ \
+ATTR T \
+atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \
+{ \
+    return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, s)); \
+}
+
+#define CX(T) \
+    GENCX(T,strong) \
+    GENCX(T,weak)
+
+#define FO(T) \
+    GENFO(T,add) \
+    GENFO(T,sub) \
+    GENFO(T,or) \
+    GENFO(T,xor) \
+    GENFO(T,and) \
+    GENFO(T,min) \
+    GENFO(T,max) \
+
+#define ALLI(F) \
+    F(int) \
+    F(uint) \
+    F(long) \
+    F(ulong)
+
+#define ALL(F) \
+    ALLI(F) \
+    F(float) \
+    F(double)
+
+ALL(GENI)
+ALL(GENL)
+ALL(GENS)
+ALL(GENX)
+ALL(CX)
+ALLI(FO)
+
+// These are needed for uintptr_t
+ATTR ulong
+atomic_fetch_add(volatile atomic_ulong *p, long v)
+{
+    return __opencl_atomic_fetch_add(p, (ulong)v, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o)
+{
+    return __opencl_atomic_fetch_add(p, (ulong)v, o, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s)
+{
+    return __opencl_atomic_fetch_add(p, (ulong)v, o, s);
+}
+
+ATTR ulong
+atomic_fetch_sub(volatile atomic_ulong *p, long v)
+{
+    return __opencl_atomic_fetch_sub(p, (ulong)v, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o)
+{
+    return __opencl_atomic_fetch_sub(p, (ulong)v, o, memory_scope_device);
+}
+
+ATTR ulong
+atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s)
+{
+    return __opencl_atomic_fetch_sub(p, (ulong)v, o, s);
+}
+
+// flag functions
+ATTR bool
+atomic_flag_test_and_set(volatile atomic_flag *p)
+{
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o)
+{
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, memory_scope_device);
+}
+
+ATTR bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o, memory_scope s)
+{
+    return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, s);
+}
+
+ATTR void
+atomic_flag_clear(volatile atomic_flag *p)
+{
+    __opencl_atomic_store((volatile atomic_int *)p, 0, memory_order_seq_cst, memory_scope_device);
+}
+
+ATTR void
+atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o)
+{
+    __opencl_atomic_store((volatile atomic_int *)p, 0, o, memory_scope_device);
+}
+
+ATTR void
+atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o, memory_scope s)
+{
+    __opencl_atomic_store((volatile atomic_int *)p, 0, o, s);
+}
+
diff --git a/opencl/src/misc/printf.cl b/opencl/src/misc/printf.cl
index 815f96ea..f80ebf1c 100644
--- a/opencl/src/misc/printf.cl
+++ b/opencl/src/misc/printf.cl
@@ -1,3 +1,9 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
 
 #include "irif.h"
 

From 805795d4f48f5e05916e44aa55813464daebee3c Mon Sep 17 00:00:00 2001
From: Ashwin Aji <ashwinma@gmail.com>
Date: Wed, 23 Aug 2017 15:21:19 -0500
Subject: [PATCH 16/25] For GFX9+ ring the full 64-bit doorbell directly

Change-Id: Ie8dc8748a40f01220f7d4d98fe5216d625635bbc
---
 ockl/src/hsaqs.cl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ockl/src/hsaqs.cl b/ockl/src/hsaqs.cl
index 426d2c6b..15ce2810 100644
--- a/ockl/src/hsaqs.cl
+++ b/ockl/src/hsaqs.cl
@@ -235,6 +235,9 @@ OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_orde
     if (s->kind == AMD_SIGNAL_KIND_USER) {
         AS((__global atomic_long *)&s->value, value, mem_order, memory_scope_all_svm_devices);
         update_mbox(s);
+    } else if (__oclc_ISA_version() >= 900) {
+        // Hardware doorbell supports AQL semantics.
+        atomic_store_explicit((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, memory_order_release, memory_scope_all_svm_devices);
     } else {
 
         {

From c58ed4680a2b9d137e4215809a5dd46c8a679a9f Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Thu, 24 Aug 2017 14:55:32 -0700
Subject: [PATCH 17/25] Update attributes

Change-Id: Ida1b7322defc5b4e6fb8eec354c7d6cfe26afd8f
---
 irif/inc/irif.h |  6 +++---
 irif/src/reg.ll | 11 +++++------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index 20fb3308..439b0ce3 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -349,9 +349,9 @@ extern void __llvm_amdcgn_buffer_wbinvl1_vol(void) __asm("llvm.amdgcn.buffer.wbi
 extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_lo(uint, uint) __asm("llvm.amdgcn.mbcnt.lo");
 extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_hi(uint, uint) __asm("llvm.amdgcn.mbcnt.hi");
 
-extern ulong __llvm_amdgcn_read_exec(void);
-extern uint __llvm_amdgcn_read_exec_lo(void);
-extern uint __llvm_amdgcn_read_exec_hi(void);
+extern __attribute__((convergent)) ulong __llvm_amdgcn_read_exec(void);
+extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_lo(void);
+extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_hi(void);
 
 extern uint __llvm_amdgcn_s_getreg(uint) __asm("llvm.amdgcn.s.getreg");
 
diff --git a/irif/src/reg.ll b/irif/src/reg.ll
index 2fa2ab65..43bf238c 100644
--- a/irif/src/reg.ll
+++ b/irif/src/reg.ll
@@ -12,23 +12,22 @@ declare i32 @llvm.read_register.i32(metadata) #0
 declare i64 @llvm.read_register.i64(metadata) #0
 
 define i64 @__llvm_amdgcn_read_exec() #1 {
-    %1 = call i64 @llvm.read_register.i64(metadata !0) #2
+    %1 = call i64 @llvm.read_register.i64(metadata !0) #0
     ret i64 %1
 }
 
 define i32 @__llvm_amdgcn_read_exec_lo() #1 {
-    %1 = call i32 @llvm.read_register.i32(metadata !1) #2
+    %1 = call i32 @llvm.read_register.i32(metadata !1) #0
     ret i32 %1
 }
 
 define i32 @__llvm_amdgcn_read_exec_hi() #1 {
-    %1 = call i32 @llvm.read_register.i32(metadata !2) #2
+    %1 = call i32 @llvm.read_register.i32(metadata !2) #0
     ret i32 %1
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { alwaysinline nounwind }
-attributes #2 = { nounwind convergent }
+attributes #0 = { nounwind convergent }
+attributes #1 = { alwaysinline nounwind convergent }
 
 !0 = !{!"exec"}
 !1 = !{!"exec_lo"}

From 6a77f3f60e527ddc7ce8697246bfe87e4df8af5b Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 29 Aug 2017 16:23:27 -0700
Subject: [PATCH 18/25] Revert "Add relaxed math attributes to all functions"

This reverts commit a6f6461392b4cde5edeebd0b9fcf02c005fc7f11.

This is incorrect. You cannot assume these attributes for the
library functions. IR optimizations can rely on these assumptions
and break the library IR. Codegen also can break. For example, this
triggers fcanonicalize elimination optimizations if the function is
emitted as a call. Fixes various conformance failures when stress
testing calls.

Change-Id: I51c25cb4e7b178fce2e2656d473aeb7b20e40858
---
 utils/prepare-builtins/prepare-builtins.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp
index 5d5009bf..b1145363 100644
--- a/utils/prepare-builtins/prepare-builtins.cpp
+++ b/utils/prepare-builtins/prepare-builtins.cpp
@@ -14,7 +14,6 @@
 
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
@@ -108,18 +107,6 @@ int main(int argc, char **argv) {
     }
   }
 
-  // Set relaxed math attributes. This does not mean a library module was built
-  // with those relaxations, but marks it compatible with the relaxations which
-  // may be used for the kernel module. Setting them prevents removal of them
-  // for a caller function, thus retaining original caller attributes.
-  AttrBuilder B;
-  B.addAttribute("less-precise-fpmad", "true");
-  B.addAttribute("no-infs-fp-math", "true");
-  B.addAttribute("no-nans-fp-math", "true");
-  B.addAttribute("unsafe-fp-math", "true");
-  for (Function &F : M->functions()) {
-    F.addAttributes(AttributeList::FunctionIndex, B);
-  }
 
   if (OutputFilename.empty()) {
     errs() << "no output file\n";

From d70e34306f90146ebbbfe895f3207b174d7e84e4 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Wed, 30 Aug 2017 10:42:53 -0700
Subject: [PATCH 19/25] Avoid WQM instructions

Change-Id: I7454599e5e794be5ec62a03e8b82d12398839426
---
 irif/inc/irif.h   | 36 ++++++++++++++++++------------------
 ockl/src/image.cl | 40 ++++++++++++++++++++--------------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index 439b0ce3..476cfd6f 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -461,12 +461,12 @@ extern void __llvm_amdgcn_image_store_mip_f32_v4i32(float p, int4 c, uint8 t, ui
     __asm("llvm.amdgcn.image.store.mip.f32.v4i32.v8i32");
 
 // Image Sample: Only expose 8 word T# and a few of the other combinations
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.f32.v8i32");
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.v2f32.v8i32");
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f32.v4f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.v2f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32");
 
 extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f32.v2f32.v8i32");
@@ -480,12 +480,12 @@ extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v8f32(flo
 extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v16f32(float16 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f32.v16f32.v8i32");
 
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.f32.v8i32");
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.v2f32.v8i32");
-extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.v4f16.v4f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.v2f32.v8i32");
+extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.v4f16.v4f32.v8i32");
 
 extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.v4f16.v2f32.v8i32");
@@ -500,10 +500,10 @@ extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_v4f16_v16f32(flo
     __asm("llvm.amdgcn.image.sample.l.v4f16.v16f32.v8i32");
 
 // depth image sample
-extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.f32.v2f32.v8i32");
-extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-    __asm("llvm.amdgcn.image.sample.f32.v4f32.v8i32");
+extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.f32.v2f32.v8i32");
+extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+    __asm("llvm.amdgcn.image.sample.lz.f32.v4f32.v8i32");
 
 extern __attribute__((pure)) float __llvm_amdgcn_image_sample_l_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
     __asm("llvm.amdgcn.image.sample.l.f32.v4f32.v8i32");
@@ -514,8 +514,8 @@ extern __attribute__((pure)) float __llvm_amdgcn_image_sample_d_f32_v16f32(float
     __asm("llvm.amdgcn.image.sample.l.f32.v16f32.v8i32");
 
 // image fetch
-extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
-        __asm("llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32");
+extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da)
+        __asm("llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32");
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
 #endif // IRIF_H
diff --git a/ockl/src/image.cl b/ockl/src/image.cl
index b1752c4a..1d3ee450 100644
--- a/ockl/src/image.cl
+++ b/ockl/src/image.cl
@@ -497,7 +497,7 @@ RATTR float4
 OCKL_MANGLE_T(image_sample,1D)(TSHARP i, SSHARP s, float c)
 {
     ADJUST_X(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -505,14 +505,14 @@ OCKL_MANGLE_T(image_sample,1Da)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_X(c.x, i, s);
     c.y = __llvm_rint_f32(c.y);
-    return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -520,7 +520,7 @@ OCKL_MANGLE_T(image_sample,2Da)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR float
@@ -528,28 +528,28 @@ OCKL_MANGLE_T(image_sample,2Dad)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true);
 }
 
 RATTR float
 OCKL_MANGLE_T(image_sample,2Dd)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,3D)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XYZ(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_sample,CM)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -557,7 +557,7 @@ OCKL_MANGLE_T(image_sample,CMa)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
     c.z = SAMPLE_ARRAY_FACE(c.w, c.z);
-    return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR float4
@@ -685,7 +685,7 @@ RATTR half4
 OCKL_MANGLE_T(image_sampleh,1D)(TSHARP i, SSHARP s, float c)
 {
     ADJUST_X(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -693,14 +693,14 @@ OCKL_MANGLE_T(image_sampleh,1Da)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_X(c.x, i, s);
     c.y = __llvm_rint_f32(c.y);
-    return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -708,21 +708,21 @@ OCKL_MANGLE_T(image_sampleh,2Da)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XY(c, i, s);
     c.z = __llvm_rint_f32(c.z);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,3D)(TSHARP i, SSHARP s, float4 c)
 {
     ADJUST_XYZ(c, i, s);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
 OCKL_MANGLE_T(image_sampleh,CM)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -730,7 +730,7 @@ OCKL_MANGLE_T(image_sampleh,CMa)(TSHARP i, SSHARP s, float4 c)
 {
     CUBE_PREP(c);
     c.z = SAMPLE_ARRAY_FACE(c.w, c.z);
-    return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
+    return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false);
 }
 
 RATTR half4
@@ -828,28 +828,28 @@ RATTR float4
 OCKL_MANGLE_T(image_gather4r,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4g,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4b,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false);
 }
 
 RATTR float4
 OCKL_MANGLE_T(image_gather4a,2D)(TSHARP i, SSHARP s, float2 c)
 {
     ADJUST_XY(c, i, s);
-    return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false);
+    return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false);
 }
 
 // We rely on the fact that the runtime allocates 12 words for the T# or V#

From e5f678f6dbabe9eaef66fca7d39c4c3dfd7b474c Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Fri, 1 Sep 2017 14:38:39 -0700
Subject: [PATCH 20/25] Update bit counting functions

Change-Id: Iedf2217e0c7e0c1987cb127276aff1ce48272354
---
 irif/inc/irif.h           | 14 +++++++++-----
 irif/src/cz.ll            | 30 +++++++++++++++++++++++++-----
 ockl/inc/ockl.h           |  4 ++++
 ockl/src/clz.cl           | 18 ++++++++++++++----
 ockl/src/ctz.cl           | 18 ++++++++++++++----
 opencl/src/integer/clz.cl | 16 ++++------------
 opencl/src/integer/ctz.cl | 12 ++++--------
 7 files changed, 74 insertions(+), 38 deletions(-)

diff --git a/irif/inc/irif.h b/irif/inc/irif.h
index 476cfd6f..9121bb39 100644
--- a/irif/inc/irif.h
+++ b/irif/inc/irif.h
@@ -120,11 +120,15 @@ extern bool __llvm_umul_with_overflow_i32(uint, uint, __private uint*);
 extern bool __llvm_smul_with_overflow_i64(long, long, __private long*);
 extern bool __llvm_umul_with_overflow_i64(ulong, ulong, __private ulong*);
 
-extern __attribute__((const)) int __llvm_ctlz_i32(int);
-extern __attribute__((const)) long __llvm_ctlz_i64(long);
-
-extern __attribute__((const)) int __llvm_cttz_i32(int);
-extern __attribute__((const)) long __llvm_cttz_i64(long);
+extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar);
+extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort);
+extern __attribute__((const)) uint __llvm_ctlz_i32(uint);
+extern __attribute__((const)) ulong __llvm_ctlz_i64(ulong);
+
+extern __attribute__((const)) uchar __llvm_cttz_i8(uchar);
+extern __attribute__((const)) ushort __llvm_cttz_i16(ushort);
+extern __attribute__((const)) uint __llvm_cttz_i32(uint);
+extern __attribute__((const)) ulong __llvm_cttz_i64(ulong);
 
 // Fence intrinsics
 extern void __llvm_fence_acq_wi(void);
diff --git a/irif/src/cz.ll b/irif/src/cz.ll
index af16d51c..bd9194c7 100644
--- a/irif/src/cz.ll
+++ b/irif/src/cz.ll
@@ -18,25 +18,45 @@ declare i16 @llvm.cttz.i16(i16, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
 declare i64 @llvm.cttz.i64(i64, i1)
 
+define i8 @__llvm_ctlz_i8(i8) #0 {
+    %2 = call i8 @llvm.ctlz.i8(i8 %0, i1 0)
+    ret i8 %2
+}
+
+define i16 @__llvm_ctlz_i16(i16) #0 {
+    %2 = call i16 @llvm.ctlz.i16(i16 %0, i1 0)
+    ret i16 %2
+}
+
 define i32 @__llvm_ctlz_i32(i32) #0 {
-    %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 1)
+    %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 0)
     ret i32 %2
 }
 
 define i64 @__llvm_ctlz_i64(i64) #0 {
-    %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 1)
+    %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 0)
     ret i64 %2
 }
 
+define i8 @__llvm_cttz_i8(i8) #0 {
+    %2 = call i8 @llvm.cttz.i8(i8 %0, i1 0)
+    ret i8 %2
+}
+
+define i16 @__llvm_cttz_i16(i16) #0 {
+    %2 = call i16 @llvm.cttz.i16(i16 %0, i1 0)
+    ret i16 %2
+}
+
 define i32 @__llvm_cttz_i32(i32) #0 {
-    %2 = call i32 @llvm.cttz.i32(i32 %0, i1 1)
+    %2 = call i32 @llvm.cttz.i32(i32 %0, i1 0)
     ret i32 %2
 }
 
 define i64 @__llvm_cttz_i64(i64) #0 {
-    %2 = call i64 @llvm.cttz.i64(i64 %0, i1 1)
+    %2 = call i64 @llvm.cttz.i64(i64 %0, i1 0)
     ret i64 %2
 }
 
-attributes #0 = { alwaysinline argmemonly norecurse nounwind readnone }
+attributes #0 = { alwaysinline norecurse nounwind readnone }
 
diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h
index 10e4c3a1..bceacdaa 100644
--- a/ockl/inc/ockl.h
+++ b/ockl/inc/ockl.h
@@ -102,9 +102,13 @@
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
+extern __attribute__((const)) uchar OCKL_MANGLE_T(clz,u8)(uchar);
+extern __attribute__((const)) ushort OCKL_MANGLE_T(clz,u16)(ushort);
 DECL_CONST_OCKL_UNARY_U32(clz)
 DECL_CONST_OCKL_UNARY_U64(clz)
 
+extern __attribute__((const)) uchar OCKL_MANGLE_T(ctz,u8)(uchar);
+extern __attribute__((const)) ushort OCKL_MANGLE_T(ctz,u16)(ushort);
 DECL_CONST_OCKL_UNARY_U32(ctz)
 DECL_CONST_OCKL_UNARY_U64(ctz)
 
diff --git a/ockl/src/clz.cl b/ockl/src/clz.cl
index fdc219b5..e93edbee 100644
--- a/ockl/src/clz.cl
+++ b/ockl/src/clz.cl
@@ -8,17 +8,27 @@
 #include "irif.h"
 #include "ockl.h"
 
+__attribute__((always_inline, const)) uchar
+OCKL_MANGLE_T(clz,u8)(uchar i)
+{
+    return __llvm_ctlz_i8(i);
+}
+
+__attribute__((always_inline, const)) ushort
+OCKL_MANGLE_T(clz,u16)(ushort i)
+{
+    return __llvm_ctlz_i16(i);
+}
+
 __attribute__((always_inline, const)) uint
 OCKL_MANGLE_U32(clz)(uint i)
 {
-    uint r = (uint)__llvm_ctlz_i32((int)i);
-    return i ? r : 32u;
+    return __llvm_ctlz_i32(i);
 }
 
 __attribute__((always_inline, const)) ulong
 OCKL_MANGLE_U64(clz)(ulong i)
 {
-    ulong r = (ulong)__llvm_ctlz_i64((long)i);
-    return i ? r : 64ul;
+   return __llvm_ctlz_i64(i);
 }
 
diff --git a/ockl/src/ctz.cl b/ockl/src/ctz.cl
index f927f2a5..a7ad76e9 100644
--- a/ockl/src/ctz.cl
+++ b/ockl/src/ctz.cl
@@ -8,17 +8,27 @@
 #include "irif.h"
 #include "ockl.h"
 
+__attribute__((always_inline, const)) uchar
+OCKL_MANGLE_T(ctz,u8)(uchar i)
+{
+    return __llvm_cttz_i8(i);
+}
+
+__attribute__((always_inline, const)) ushort
+OCKL_MANGLE_T(ctz,u16)(ushort i)
+{
+    return __llvm_cttz_i16(i);
+}
+
 __attribute__((always_inline, const)) uint
 OCKL_MANGLE_U32(ctz)(uint i)
 {
-    uint r = (uint)__llvm_cttz_i32((int)i);
-    return i ? r : 32u;
+    return __llvm_cttz_i32(i);
 }
 
 __attribute__((always_inline, const)) ulong
 OCKL_MANGLE_U64(ctz)(ulong i)
 {
-    ulong r = (ulong)__llvm_cttz_i64((long)i);
-    return i ? r : 64ul;
+    return __llvm_cttz_i64(i);
 }
 
diff --git a/opencl/src/integer/clz.cl b/opencl/src/integer/clz.cl
index 7719641b..c3f4b6af 100644
--- a/opencl/src/integer/clz.cl
+++ b/opencl/src/integer/clz.cl
@@ -20,33 +20,25 @@ UEXP(ulong,clz)
 UEXPATTR char
 clz(char x)
 {
-    uint y = (uint)(uchar)x;
-    uint z = __ockl_clz_u32(y);
-    return (char)(z - 24u);
+    return (char)__ockl_clz_u8((uchar)x);
 }
 
 UEXPATTR uchar
 clz(uchar x)
 {
-    uint y = (uint)x;
-    uint z = __ockl_clz_u32(y);
-    return (uchar)(z - 24u);
+    return __ockl_clz_u8(x);
 }
 
 UEXPATTR short
 clz(short x)
 {
-    uint y = (uint)(ushort)x;
-    uint z = __ockl_clz_u32(y);
-    return (short)(z - 16u);
+    return (short)__ockl_clz_u16((ushort)x);
 }
 
 UEXPATTR ushort
 clz(ushort x)
 {
-    uint y = (uint)x;
-    uint z = __ockl_clz_u32(y);
-    return (ushort)(z - 16u);
+    return __ockl_clz_u16(x);
 }
 
 UEXPATTR int
diff --git a/opencl/src/integer/ctz.cl b/opencl/src/integer/ctz.cl
index 1577ab50..b583bf52 100644
--- a/opencl/src/integer/ctz.cl
+++ b/opencl/src/integer/ctz.cl
@@ -20,29 +20,25 @@ UEXP(ulong,ctz)
 UEXPATTR char
 ctz(char x)
 {
-    uint y = (uint)(uchar)x;
-    return (char)min(__ockl_ctz_u32(y), 8u);
+    return (char)__ockl_ctz_u8((uchar)x);
 }
 
 UEXPATTR uchar
 ctz(uchar x)
 {
-    uint y = (uint)x;
-    return (uchar)min(__ockl_ctz_u32(y), 8u);
+    return __ockl_ctz_u8(x);
 }
 
 UEXPATTR short
 ctz(short x)
 {
-    uint y = (uint)(ushort)x;
-    return (short)min(__ockl_ctz_u32(y), 16u);
+    return (short)__ockl_ctz_u16((ushort)x);
 }
 
 UEXPATTR ushort
 ctz(ushort x)
 {
-    uint y = (uint)x;
-    return (ushort)min(__ockl_ctz_u32(y), 16u);
+    return __ockl_ctz_u16(x);
 }
 
 UEXPATTR int

From 79d2d27cfe99606187abe29903bf1965b3a0b144 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Wed, 6 Sep 2017 13:07:48 -0700
Subject: [PATCH 21/25] Remove workaround for atomics

Change-Id: Ia82031d162b23a70689506258306846d950e93e8
---
 ockl/inc/ockl_hsa.h |  10 ++---
 ockl/src/hsaqs.cl   | 102 +++-----------------------------------------
 2 files changed, 10 insertions(+), 102 deletions(-)

diff --git a/ockl/inc/ockl_hsa.h b/ockl/inc/ockl_hsa.h
index 111116b7..1a53d9e1 100644
--- a/ockl/inc/ockl_hsa.h
+++ b/ockl/inc/ockl_hsa.h
@@ -12,11 +12,11 @@
 #include "device_amd_hsa.h"
 
 typedef enum __ockl_memory_order_e {
-  __ockl_memory_order_relaxed,
-  __ockl_memory_order_acquire,
-  __ockl_memory_order_release,
-  __ockl_memory_order_acq_rel,
-  __ockl_memory_order_seq_cst,
+  __ockl_memory_order_relaxed = __ATOMIC_RELAXED,
+  __ockl_memory_order_acquire = __ATOMIC_ACQUIRE,
+  __ockl_memory_order_release = __ATOMIC_RELEASE,
+  __ockl_memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  __ockl_memory_order_seq_cst = __ATOMIC_SEQ_CST,
 } __ockl_memory_order;
 
 extern ulong OCKL_MANGLE_T(hsa_queue,load_write_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order);
diff --git a/ockl/src/hsaqs.cl b/ockl/src/hsaqs.cl
index 15ce2810..a39fc495 100644
--- a/ockl/src/hsaqs.cl
+++ b/ockl/src/hsaqs.cl
@@ -15,103 +15,11 @@
 
 #define ATTR __attribute__((always_inline))
 
-// TODO Remove this workaround when the compiler is ready
-
-#define AL(T,P,O,S) ({ \
-    T __l; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __l = atomic_load_explicit(P, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __l = atomic_load_explicit(P, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __l = atomic_load_explicit(P, memory_order_relaxed, S); \
-        break; \
-    } \
-    __l; \
-})
-
-#define AS(P,V,O,S) ({ \
-    switch (O) { \
-    case __ockl_memory_order_release: \
-        atomic_store_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        atomic_store_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        atomic_store_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-})
-
-#define AF(T,K,P,V,O,S) ({ \
-    T __f; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_acq_rel, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __f = atomic_fetch_##K##_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-    __f; \
-})
-
-#define AX(T,P,V,O,S) ({ \
-    T __e; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __e = atomic_exchange_explicit(P, V, memory_order_acquire, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __e = atomic_exchange_explicit(P, V, memory_order_release, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __e = atomic_exchange_explicit(P, V, memory_order_acq_rel, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __e = atomic_exchange_explicit(P, V, memory_order_seq_cst, S); \
-        break; \
-    default: \
-        __e = atomic_exchange_explicit(P, V, memory_order_relaxed, S); \
-        break; \
-    } \
-    __e; \
-})
-
-#define AC(P,E,V,O,R,S) ({ \
-    bool __c; \
-    switch (O) { \
-    case __ockl_memory_order_acquire: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acquire, R, S); \
-        break; \
-    case __ockl_memory_order_release: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_release, R, S); \
-        break; \
-    case __ockl_memory_order_acq_rel: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acq_rel, R, S); \
-        break; \
-    case __ockl_memory_order_seq_cst: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_seq_cst, R, S); \
-        break; \
-    default: \
-        __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_relaxed, R, S); \
-        break; \
-    } \
-    __c; \
-})
+#define AL(T,P,O,S) __opencl_atomic_load(P,O,S)
+#define AS(P,V,O,S) __opencl_atomic_store(P,V,O,S)
+#define AF(T,K,P,V,O,S) __opencl_atomic_fetch_##K(P,V,O,S)
+#define AX(T,P,V,O,S) __opencl_atomic_exchange(P,V,O,S)
+#define AC(P,E,V,O,R,S) __opencl_atomic_compare_exchange_strong(P,E,V,O,R,S)
 
 //
 // HSA queue ops

From abc46c9f2bb59e0a333ef7a71fbc21a8c18a4490 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Thu, 7 Sep 2017 10:07:00 -0700
Subject: [PATCH 22/25] Change address space to enable enqueue

Change-Id: I99ab1f49e0369946af2620c272950e4ec306ac79
---
 ockl/inc/hsa.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/ockl/inc/hsa.h b/ockl/inc/hsa.h
index 0252b009..85365882 100644
--- a/ockl/inc/hsa.h
+++ b/ockl/inc/hsa.h
@@ -1502,7 +1502,7 @@ typedef struct hsa_queue_s {
 
 #ifdef HSA_LARGE_MODEL
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
 #elif defined HSA_LITTLE_ENDIAN
@@ -1511,7 +1511,7 @@ typedef struct hsa_queue_s {
    * packets. Must be aligned to the size of an AQL packet.
    */
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
   /**
@@ -1521,7 +1521,7 @@ typedef struct hsa_queue_s {
 #else
   uint32_t reserved0;
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *base_address;
 #endif
@@ -2129,7 +2129,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
 
 #ifdef HSA_LARGE_MODEL
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
 #elif defined HSA_LITTLE_ENDIAN
@@ -2141,7 +2141,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
    * completed execution.
    */
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
   /**
@@ -2151,7 +2151,7 @@ typedef struct hsa_kernel_dispatch_packet_s {
 #else
   uint32_t reserved1;
 #ifdef DEVICE_COMPILER
-  __constant
+  __global
 #endif
   void *kernarg_address;
 #endif

From 94b5167ce3880c1654f9d24bd5bbe2b45f507a1e Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Fri, 15 Sep 2017 05:13:35 -0700
Subject: [PATCH 23/25] Pipe functions

Change-Id: I47368f5e3d7b1083d0e7ba8a9b355cd7f5433f19
---
 opencl/CMakeLists.txt        |   1 +
 opencl/src/pipes/commitp.cl  |  93 +++++++++++++++
 opencl/src/pipes/getp.cl     |  45 +++++++
 opencl/src/pipes/memcpyia.cl |  55 +++++++++
 opencl/src/pipes/pipes.h     | 109 +++++++++++++++++
 opencl/src/pipes/readp.cl    |  75 ++++++++++++
 opencl/src/pipes/reservep.cl | 219 +++++++++++++++++++++++++++++++++++
 opencl/src/pipes/validp.cl   |  14 +++
 opencl/src/pipes/wresvnp.cl  | 148 +++++++++++++++++++++++
 opencl/src/pipes/writep.cl   |  65 +++++++++++
 10 files changed, 824 insertions(+)
 create mode 100644 opencl/src/pipes/commitp.cl
 create mode 100644 opencl/src/pipes/getp.cl
 create mode 100644 opencl/src/pipes/memcpyia.cl
 create mode 100644 opencl/src/pipes/pipes.h
 create mode 100644 opencl/src/pipes/readp.cl
 create mode 100644 opencl/src/pipes/reservep.cl
 create mode 100644 opencl/src/pipes/validp.cl
 create mode 100644 opencl/src/pipes/wresvnp.cl
 create mode 100644 opencl/src/pipes/writep.cl

diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt
index 5b707605..8da642aa 100644
--- a/opencl/CMakeLists.txt
+++ b/opencl/CMakeLists.txt
@@ -14,6 +14,7 @@ file(GLOB cl_sources
   ${CMAKE_CURRENT_SOURCE_DIR}/src/math/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/media/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/misc/*.cl
+  ${CMAKE_CURRENT_SOURCE_DIR}/src/pipes/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/relational/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/subgroup/*.cl
   ${CMAKE_CURRENT_SOURCE_DIR}/src/vldst/*.cl
diff --git a/opencl/src/pipes/commitp.cl b/opencl/src/pipes/commitp.cl
new file mode 100644
index 00000000..51528cb8
--- /dev/null
+++ b/opencl/src/pipes/commitp.cl
@@ -0,0 +1,93 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+// Work group functions
+
+#define WORK_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__work_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__work_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define WORK_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__work_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__work_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+// sub group functions
+
+#define SUB_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__sub_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_READ_PIPE_SIZE)
+
+ATTR void
+__sub_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
+#define SUB_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR void \
+__sub_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_COMMIT_WRITE_PIPE_SIZE)
+
+ATTR void
+__sub_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align)
+{
+}
+
diff --git a/opencl/src/pipes/getp.cl b/opencl/src/pipes/getp.cl
new file mode 100644
index 00000000..d5531996
--- /dev/null
+++ b/opencl/src/pipes/getp.cl
@@ -0,0 +1,45 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline, pure))
+
+#define GET_PIPE_NUM_PACKETS_SIZE(SIZE, STYPE) \
+ATTR uint \
+__get_pipe_num_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    return (uint)(wi - ri); \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_NUM_PACKETS_SIZE)
+
+ATTR uint // number of packets written but not yet read, truncated to 32 bits
+__get_pipe_num_packets(__global struct pipeimp* p, uint size, uint align) // size and align are not used here
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); // separate relaxed load: ri and wi are not one atomic snapshot
+    return (uint)(wi - ri);
+}
+
+#define GET_PIPE_MAX_PACKETS_SIZE(SIZE, STYPE) \
+ATTR uint \
+__get_pipe_max_packets_##SIZE(__global struct pipeimp* p) \
+{ \
+    return (uint)p->end_idx; \
+}
+
+// DO_PIPE_SIZE(GET_PIPE_MAX_PACKETS_SIZE)
+
+ATTR uint // returns the pipe's end_idx field, truncated to 32 bits
+__get_pipe_max_packets(__global struct pipeimp* p, uint size, uint align) // size and align are not used here
+{
+    return (uint)p->end_idx; // plain (non-atomic) read of end_idx
+}
+
diff --git a/opencl/src/pipes/memcpyia.cl b/opencl/src/pipes/memcpyia.cl
new file mode 100644
index 00000000..f536d044
--- /dev/null
+++ b/opencl/src/pipes/memcpyia.cl
@@ -0,0 +1,55 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+__attribute__((always_inline, weak)) void // copies size bytes from s to d in align-sized chunks
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{
+    if (align == 2) { // each typed branch copies size/width whole elements; tail bytes are not copied
+        short *d2 = (short *)d;
+        const short *s2 = (const short *)s; // keep the source const-qualified
+        const short *e2 = s2 + size/2;
+
+        while (s2 < e2)
+            *d2++ = *s2++;
+    } else if (align == 4) {
+        int *d4 = (int *)d;
+        const int *s4 = (const int *)s;
+        const int *e4 = s4 + size/4;
+
+        while (s4 < e4)
+            *d4++ = *s4++;
+    } else if (align == 8) {
+        long *d8 = (long *)d;
+        const long *s8 = (const long *)s;
+        const long *e8 = s8 + size/8;
+
+        while (s8 < e8)
+            *d8++ = *s8++;
+    } else if (align == 16) {
+        long2 *d16 = (long2 *)d;
+        const long2 *s16 = (const long2 *)s;
+        const long2 *e16 = s16 + size/16;
+
+        while (s16 < e16)
+            *d16++ = *s16++;
+    } else if (align == 32 || align == 64 || align == 128) { // all three use 32-byte long4 moves
+        long4 *d32 = (long4 *)d;
+        const long4 *s32 = (const long4 *)s;
+        const long4 *e32 = s32 + size/32;
+
+        while (s32 < e32)
+            *d32++ = *s32++;
+    } else { // any other alignment (including 1): byte-by-byte copy
+        char *d1 = (char *)d;
+        const char *s1 = (const char *)s;
+        const char *e1 = s1 + size;
+
+        while (s1 < e1)
+            *d1++ = *s1++;
+    }
+}
+
diff --git a/opencl/src/pipes/pipes.h b/opencl/src/pipes/pipes.h
new file mode 100644
index 00000000..16ab22fd
--- /dev/null
+++ b/opencl/src/pipes/pipes.h
@@ -0,0 +1,109 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "irif.h"
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+ 
+extern size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n);
+
+#define DO_PIPE_SIZE(F) \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+
+struct pipeimp {
+    atomic_size_t read_idx;
+    atomic_size_t write_idx;
+    size_t end_idx;
+    uchar pad[128 - 3*sizeof(size_t)];
+    uchar packets[1];
+};
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+
+static __attribute__((always_inline)) size_t // returns the first reserved index, or ~(size_t)0 on failure
+reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n) // advance *pi by n without exceeding lim
+{
+    size_t i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+    for (;;) {
+        if (i + n > lim) // not enough room below lim: give up without modifying *pi
+            return ~(size_t)0;
+        // No reload in the loop: relies on compare-exchange refreshing i with the current value when it fails.
+        if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+            break;
+    }
+
+    return i; // value of *pi before our successful update
+}
+
+static inline size_t // returns this lane's reserved index, or ~(size_t)0 if nothing could be reserved
+wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim) // reserve one slot per active lane, ideally with a single atomic update
+{
+    size_t n = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
+                        __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi())); // n = population count of the 64-bit exec mask
+    uint l = __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(),
+               __llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u)); // presumably the count of active lanes below this one - TODO confirm
+    size_t i = 0;
+
+    if (l == 0) { // only the lane with l == 0 performs the atomic reservation of n slots
+        i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device);
+
+        for (;;) {
+            if (i + n > lim) {
+                i = ~(size_t)0; // all-ones signals that n slots do not fit
+                break;
+            }
+
+            if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device))
+                break;
+        }
+    }
+
+    __llvm_amdgcn_wave_barrier();
+
+    // Broadcast the result; the ctz tells us which lane has active lane id 0
+    uint k = (uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec());
+    i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) |
+        (size_t)__llvm_amdgcn_readlane((uint)i, k); // the 64-bit value is moved as two 32-bit readlane halves
+
+    __llvm_amdgcn_wave_barrier();
+
+    if (i != ~(size_t)0)
+        i += l; // each lane takes the slot at base + l
+    else {
+        // The entire group didn't fit, have to handle one by one
+        i = reserve(pi, lim, (size_t)1);
+    }
+
+    return i;
+}
+
+static inline size_t // returns i reduced modulo n
+wrap(size_t i, size_t n)
+{
+    // Assume end_i < 2^32
+    size_t ret;
+    if (as_uint2(i).y == 0U) { // second 32-bit component of i is zero: use 32-bit arithmetic
+        uint j = (uint)i;
+        uint m = (uint)n; // truncation of n; relies on the assumption stated above
+        if (j < m)
+            ret = i; // already in range, no modulo needed
+        else
+            ret = (ulong)(j % m);
+    } else
+        ret = i % n; // general 64-bit modulo
+    return ret;
+}
+
diff --git a/opencl/src/pipes/readp.cl b/opencl/src/pipes/readp.cl
new file mode 100644
index 00000000..1808ad3a
--- /dev/null
+++ b/opencl/src/pipes/readp.cl
@@ -0,0 +1,75 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR int \
+__read_pipe_2_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ri = wave_reserve_1(&p->read_idx, wi); \
+    if (ri == ~(size_t)0) \
+        return -1; \
+ \
+    size_t pi = wrap(ri, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ \
+    if (ri == wi-1) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    }\
+\
+    return 0; \
+}
+
+DO_PIPE_SIZE(READ_PIPE_SIZE)
+
+ATTR int
+__read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align)
+{
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t ri = wave_reserve_1(&p->read_idx, wi);
+    if (ri == ~(size_t)0)
+        return -1;
+
+    size_t pi = wrap(ri, p->end_idx);
+    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
+
+    if (ri == wi-1) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return 0;
+}
+
+#define READ_PIPE_RESERVED_SIZE(SIZE, STYPE) \
+ATTR int \
+__read_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr)  \
+{ \
+    rid += i; \
+    size_t pi = wrap(rid, p->end_idx); \
+    *ptr = ((__global STYPE *)p->packets)[pi]; \
+ \
+    return 0; \
+}
+
+DO_PIPE_SIZE(READ_PIPE_RESERVED_SIZE)
+
+ATTR int
+__read_pipe_4(__global struct pipeimp* p, size_t rid, uint i, void *ptr, uint size, uint align)
+{
+    rid += i;
+    size_t pi = wrap(rid, p->end_idx);
+    __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align);
+
+    return 0;
+}
+
diff --git a/opencl/src/pipes/reservep.cl b/opencl/src/pipes/reservep.cl
new file mode 100644
index 00000000..18e073be
--- /dev/null
+++ b/opencl/src/pipes/reservep.cl
@@ -0,0 +1,219 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t /* first reserved read index, or ~(size_t)0 on failure */ \
+__reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); \
+    /* ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it before testing for a drained pipe. */ \
+    if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    } /* reservation reaches wi: both indices are reset to 0 */ \
+ \
+    return rid; \
+}
+
+// DO_PIPE_SIZE(RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t // first reserved read index, or ~(size_t)0 on failure
+__reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) // size and align are not used here
+{
+    size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets);
+    // ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it before testing for a drained pipe.
+    if (rid != ~(size_t)0 && rid + num_packets == wi) {
+        __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+        __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    } // reservation reaches wi: both indices are reset to 0
+
+    return rid;
+}
+
+#define RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets); \
+}
+
+// DO_PIPE_SIZE(RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    return __amd_wresvn(&p->write_idx, ri + ei, num_packets);
+}
+
+// Work group functions
+
+#define WORK_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t /* first reserved read index for the work-group, or ~(size_t)0 on failure */ \
+__work_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ \
+    if ((int)get_local_linear_id() == 0) { /* one work-item reserves on behalf of the group */ \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        size_t rid = reserve(&p->read_idx, wi, num_packets); \
+        /* ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it first. */ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+ \
+        *t = rid; /* publish the result through local scratch memory */ \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t // first reserved read index for the work-group, or ~(size_t)0 on failure
+__work_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) // size and align are not used here
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+
+    if ((int)get_local_linear_id() == 0) { // one work-item reserves on behalf of the group
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        size_t rid = reserve(&p->read_idx, wi, num_packets);
+        // ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it first.
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+
+        *t = rid; // publish the result through local scratch memory
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+#define WORK_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__work_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__get_scratch_lds(); \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        *t = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+// DO_PIPE_SIZE(WORK_GROUP_RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__work_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+    __local size_t *t = (__local size_t *)__get_scratch_lds();
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        *t = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+// sub group functions
+
+#define SUB_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t /* first reserved read index for the sub-group, or ~(size_t)0 on failure */ \
+__sub_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { /* one work-item reserves on behalf of the sub-group */ \
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        rid = reserve(&p->read_idx, wi, num_packets); \
+        /* ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it first. */ \
+        if (rid != ~(size_t)0 && rid + num_packets == wi) { \
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_READ_PIPE_SIZE)
+
+ATTR size_t // first reserved read index for the sub-group, or ~(size_t)0 on failure
+__sub_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) // size and align are not used here
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) { // one work-item reserves on behalf of the sub-group
+        size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        rid = reserve(&p->read_idx, wi, num_packets);
+        // ~0 means the reservation failed; ~0 + num_packets can wrap to wi, so exclude it first.
+        if (rid != ~(size_t)0 && rid + num_packets == wi) {
+            __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device);
+            __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
+#define SUB_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR size_t \
+__sub_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        rid = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+// DO_PIPE_SIZE(SUB_GROUP_RESERVE_WRITE_PIPE_SIZE)
+
+ATTR size_t
+__sub_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align)
+{
+     size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        rid = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
diff --git a/opencl/src/pipes/validp.cl b/opencl/src/pipes/validp.cl
new file mode 100644
index 00000000..5397dfce
--- /dev/null
+++ b/opencl/src/pipes/validp.cl
@@ -0,0 +1,14 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+
+__attribute__((overloadable, always_inline)) bool // true unless rid is the all-ones bit pattern
+is_valid_reserve_id(reserve_id_t rid)
+{
+    return as_ulong(rid) != ~(size_t)0; // all-ones is treated as the invalid reservation id
+}
+
diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl
new file mode 100644
index 00000000..2b4f2fa4
--- /dev/null
+++ b/opencl/src/pipes/wresvnp.cl
@@ -0,0 +1,148 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+size_t // returns this lane's first reserved index, or ~(size_t)0 if the reservation failed
+__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) // each active lane reserves n slots of *pidx, bounded by lim
+{
+    uint alc = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) +
+                        __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi())); // alc = population count of the exec mask
+    uint l = __llvm_amdgcn_mbcnt_hi(-1, __llvm_amdgcn_mbcnt_lo(-1, 0u)); // presumably this lane's index in the wavefront - TODO confirm
+    size_t rid;
+    // Lanes 0..alc-1 all active? 1UL << 64 does not give 0 (OpenCL masks the shift count), so alc == 64 is handled explicitly.
+    if (__llvm_amdgcn_read_exec() == (alc < 64u ? (1UL << alc) - 1UL : ~0UL)) {
+        // Handle fully active subgroup
+        uint sum = sub_group_scan_inclusive_add((uint)n);
+        size_t idx = 0;
+        if (l == alc-1) { // the highest lane holds the total and reserves for everyone
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        idx = sub_group_broadcast(idx, alc-1);
+        rid = idx + (size_t)(sum - (uint)n); // exclusive prefix = inclusive sum minus own n
+        rid = idx != ~(size_t)0 ? rid : idx;
+    } else {
+        // Inclusive add scan with not all lanes active
+        const ulong nomsb = 0x7fffffffffffffffUL; // masks off bit 63, which is what 0x1UL << slid yields when slid is -1
+
+        // Step 1: add n from the nearest lower active lane
+        ulong smask = __llvm_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL);
+        int slid = 63 - (int)clz(smask);
+        uint t = __llvm_amdgcn_ds_bpermute(slid << 2, n);
+        uint sum = n + (slid < 0 ? 0 : t);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 2: add the partial sum from the active lane 2 positions below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 3: add the partial sum from the active lane 4 positions below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 4: add the partial sum from the active lane 8 positions below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 5: add the partial sum from the active lane 16 positions below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+        slid = 63 - (int)clz(smask);
+        smask ^= (0x1UL << slid) & nomsb;
+
+        // Step 6: add the partial sum from the active lane 32 positions below
+        slid = 63 - (int)clz(smask);
+        t = __llvm_amdgcn_ds_bpermute(slid << 2, sum);
+        sum += slid < 0 ? 0 : t;
+        __llvm_amdgcn_wave_barrier();
+
+        size_t idx = 0;
+        if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) { // the highest active lane holds the total
+            idx = reserve(pidx, lim, (size_t)sum);
+        }
+        __llvm_amdgcn_wave_barrier();
+
+        // Broadcast
+        uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec());
+        idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) |
+              (size_t)__llvm_amdgcn_readlane((uint)idx, k);
+        __llvm_amdgcn_wave_barrier();
+
+        rid = idx + (size_t)(sum - (uint)n); // exclusive prefix = inclusive sum minus own n
+        rid = idx != ~(size_t)0 ? rid : idx;
+    }
+
+    if (rid == ~(size_t)0) {
+        // Try again one at a time
+        rid = reserve(pidx, lim, n);
+    }
+
+    return rid;
+}
+
diff --git a/opencl/src/pipes/writep.cl b/opencl/src/pipes/writep.cl
new file mode 100644
index 00000000..e07026cd
--- /dev/null
+++ b/opencl/src/pipes/writep.cl
@@ -0,0 +1,65 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "pipes.h"
+
+#define ATTR __attribute__((always_inline))
+
+#define WRITE_PIPE_SIZE(SIZE, STYPE) \
+ATTR int /* 0 on success, -1 when no slot could be reserved */ \
+__write_pipe_2_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    size_t wi = wave_reserve_1(&p->write_idx, ri+ei); /* write index may run at most ei ahead of ri */ \
+    if (wi == ~(size_t)0) \
+        return -1; \
+    /* Reduce the reserved index modulo ei to get the packet slot. */ \
+    size_t pi = wrap(wi, ei); \
+    ((__global STYPE *)p->packets)[pi] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_SIZE)
+
+ATTR int // 0 on success, -1 when no slot could be reserved
+__write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint align)
+{
+    size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    size_t wi = wave_reserve_1(&p->write_idx, ri+ei); // write index may run at most ei ahead of ri
+    if (wi == ~(size_t)0)
+        return -1;
+    // Reduce the reserved index modulo ei to get the packet slot, then copy size bytes into it.
+    size_t pi = wrap(wi, ei);
+    __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
+
+    return 0;
+}
+
+#define WRITE_PIPE_RESERVED_SIZE(SIZE, STYPE) \
+ATTR int \
+__write_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr)  \
+{ \
+    rid += i; \
+    size_t pi = wrap(rid, p->end_idx); \
+    ((__global STYPE *)p->packets)[pi] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_SIZE(WRITE_PIPE_RESERVED_SIZE)
+
+ATTR int
+__write_pipe_4(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, uint size, uint align)
+{
+    rid += i;
+    size_t pi = wrap(rid, p->end_idx);
+    __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align);
+
+    return 0;
+}
+

From a985d4abccdfcc749921b5c7d6dfaf48c3643dd1 Mon Sep 17 00:00:00 2001
From: Konstantin Zhuravlyov <kzhuravl_dev@outlook.com>
Date: Sat, 23 Sep 2017 03:21:31 -0400
Subject: [PATCH 24/25] Rename tool_output_file to ToolOutputFile, NFC

See r314050 for more details

Change-Id: I06a1e62d537abc6a40dd8bb9453059238416904b
---
 utils/prepare-builtins/prepare-builtins.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp
index b1145363..ce3596fe 100644
--- a/utils/prepare-builtins/prepare-builtins.cpp
+++ b/utils/prepare-builtins/prepare-builtins.cpp
@@ -114,8 +114,8 @@ int main(int argc, char **argv) {
   }
 
   std::error_code EC;
-  std::unique_ptr<tool_output_file> Out
-  (new tool_output_file(OutputFilename, EC, sys::fs::F_None));
+  std::unique_ptr<ToolOutputFile> Out
+  (new ToolOutputFile(OutputFilename, EC, sys::fs::F_None));
   if (EC) {
     errs() << EC.message() << '\n';
     exit(1);

From c6b7afea69ac51768dfd0c4ff180039c4bc08ee7 Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Wed, 27 Sep 2017 13:04:58 -0700
Subject: [PATCH 25/25] Initial documentation for OCKL

Change-Id: I3f61348dd6eb87479ed438c52da8c8a512044cc0
---
 doc/OCKL.md | 412 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 412 insertions(+)
 create mode 100644 doc/OCKL.md

diff --git a/doc/OCKL.md b/doc/OCKL.md
new file mode 100644
index 00000000..169f511e
--- /dev/null
+++ b/doc/OCKL.md
@@ -0,0 +1,412 @@
+# OCKL User Guide
+
+* [Introduction](#introduction)
+  * [What Is OCKL](#what-is-ockl)
+* [Using OCKL](#using-ockl)
+  * [Standard Usage](#standard-usage)
+  * [Controls](#controls)
+* [Versioning](#versioning)
+* [Naming convention](#naming-convention)
+* [Supported functions](#supported-functions)
+
+
+## Introduction
+### What Is OCKL
+
+OCKL is an LLVM-IR bitcode library designed to provide access to certain hardware
+and compiler capabilities needed by language runtimes.  It should rarely be necessary
+to call any of these functions directly from application code.  Consider this library
+a "detail" layer.
+
+## Using OCKL
+### Standard Usage
+
+OCKL is expected to be used in a standard LLVM compilation flow as follows:
+  * Compile source modules to LLVM-IR bitcode (clang)
+  * Link together program bitcode with library bitcode including OCKL and OCLC.
+  * Run generic optimizations (opt)
+  * Code generation (llc)
+
+### Controls
+
+OCKL supports a number of controls that are provided by linking in specifically named inline
+functions.  These functions are inlined at optimization time and result in specific paths
+taken with no control flow overhead.  These functions all have the form (in C)
+
+    __attribute__((always_inline, const)) int
+    __oclc_control(void)
+    { return 1; } // or 0 to disable
+
+The currently supported controls are
+  * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced
+  * `unsafe_math_opt` - lower accuracy results may be produced with higher performance
+  * `daz_opt` - subnormal values consumed and produced may be flushed to zero
+  * `correctly_rounded_sqrt32` - float square root must be correctly rounded
+  * `ISA_version` - an integer representation of the ISA version of the target device
+
+### Versioning
+
+OCKL usually ships as a single LLVM-IR bitcode file named
+
+    ockl-{LLVM rev}-{OCKL rev}.bc
+
+where `{LLVM rev}` is the version of LLVM used to create the file, of the
+form X.Y, e.g. 3.8, and `{OCKL rev}` is the OCKL library version of the form X.Y, currently 0.9.
+
+### Naming convention
+
+OCKL functions follow a simple naming convention:
+
+    __ockl_{function}_{type suffix}
+
+where {type suffix} generally indicates the type of the arguments and/or returned result using a type letter,
+e.g. "u" for unsigned integer, and a bit width, e.g. 32.
+
+### Supported functions
+
+The following table lists the available functions along with a brief description of each:
+
+| **function** | **Brief Description** |
+| :--- | :--- |
+| `uchar __ockl_clz_u8(uchar);` | Count leading zeroes |
+| `ushort __ockl_clz_u16(ushort);` | |
+| `uint __ockl_clz_u32(uint);` | |
+| `ulong __ockl_clz_u64(ulong);` | |
+| - | |
+| `uchar __ockl_ctz_u8(uchar);` | Count trailing zeroes |
+| `ushort __ockl_ctz_u16(ushort);` | |
+| `uint __ockl_ctz_u32(uint);` | |
+| `ulong __ockl_ctz_u64(ulong);` | |
+| - | |
+| `uint __ockl_popcount_u32(uint);` | Count nonzero bits |
+| `ulong __ockl_popcount_u64(ulong);` | |
+| - | |
+| `int __ockl_add_sat_i32(int,int);` | Add with saturation |
+| `uint __ockl_add_sat_u32(uint,uint);` | |
+| `long __ockl_add_sat_i64(long,long);` | |
+| `ulong __ockl_add_sat_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_sub_sat_i32(int,int);` | Subtract with saturation |
+| `uint __ockl_sub_sat_u32(uint,uint);` | |
+| `long __ockl_sub_sat_i64(long,long);` | |
+| `ulong __ockl_sub_sat_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_mul_hi_i32(int,int);` | High part of multiplication |
+| `uint __ockl_mul_hi_u32(uint,uint);` | |
+| `long __ockl_mul_hi_i64(long,long);` | |
+| `ulong __ockl_mul_hi_u64(ulong,ulong);` | |
+| - | |
+| `int __ockl_mul24_i32(int,int);` | Multiply assuming operands fit in 24 bits |
+| `uint __ockl_mul24_u32(uint,uint);` | |
+| - | |
+| `uint __ockl_activelane_u32(void);` | Index of current lane, counting only active lanes in wavefront |
+| - | |
+| `half __ockl_wfred_add_f16(half x);` | ADD reduction across wavefront |
+| `float __ockl_wfred_add_f32(float x);` | |
+| `double __ockl_wfred_add_f64(double x);` | |
+| `int __ockl_wfred_add_i32(int x);` | |
+| `long __ockl_wfred_add_i64(long x);` | |
+| `uint __ockl_wfred_add_u32(uint x);` | |
+| `ulong __ockl_wfred_add_u64(ulong x);` | |
+| `int __ockl_wfred_and_i32(int x);` | AND reduction across wavefront |
+| `long __ockl_wfred_and_i64(long x);` | |
+| `uint __ockl_wfred_and_u32(uint x);` | |
+| `ulong __ockl_wfred_and_u64(ulong x);` | |
+| `half __ockl_wfred_max_f16(half x);` | MAX reduction across wavefront |
+| `float __ockl_wfred_max_f32(float x);` | |
+| `double __ockl_wfred_max_f64(double x);` | |
+| `int __ockl_wfred_max_i32(int x);` | |
+| `long __ockl_wfred_max_i64(long x);` | |
+| `uint __ockl_wfred_max_u32(uint x);` | |
+| `ulong __ockl_wfred_max_u64(ulong x);` | |
+| `half __ockl_wfred_min_f16(half x);` | MIN reduction across wavefront |
+| `float __ockl_wfred_min_f32(float x);` | |
+| `double __ockl_wfred_min_f64(double x);` | |
+| `int __ockl_wfred_min_i32(int x);` | |
+| `long __ockl_wfred_min_i64(long x);` | |
+| `uint __ockl_wfred_min_u32(uint x);` | |
+| `ulong __ockl_wfred_min_u64(ulong x);` | |
+| `int __ockl_wfred_or_i32(int x);` | OR reduction across wavefront |
+| `long __ockl_wfred_or_i64(long x);` | |
+| `uint __ockl_wfred_or_u32(uint x);` | |
+| `ulong __ockl_wfred_or_u64(ulong x);` | |
+| `int __ockl_wfred_xor_i32(int x);` | XOR reduction across wavefront |
+| `long __ockl_wfred_xor_i64(long x);` | |
+| `uint __ockl_wfred_xor_u32(uint x);` | |
+| `ulong __ockl_wfred_xor_u64(ulong x);` | |
+| `half __ockl_wfscan_add_f16(half x, bool inclusive);` | ADD scan across wavefront |
+| `float __ockl_wfscan_add_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_add_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_add_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_add_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_add_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_add_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_and_i32(int x, bool inclusive);` | AND scan across wavefront |
+| `long __ockl_wfscan_and_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_and_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_and_u64(ulong x, bool inclusive);` | |
+| `half __ockl_wfscan_max_f16(half x, bool inclusive);` | MAX scan across wavefront |
+| `float __ockl_wfscan_max_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_max_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_max_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_max_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_max_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_max_u64(ulong x, bool inclusive);` | |
+| `half __ockl_wfscan_min_f16(half x, bool inclusive);` | MIN scan across wavefront |
+| `float __ockl_wfscan_min_f32(float x, bool inclusive);` | |
+| `double __ockl_wfscan_min_f64(double x, bool inclusive);` | |
+| `int __ockl_wfscan_min_i32(int x, bool inclusive);` | |
+| `long __ockl_wfscan_min_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_min_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_min_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_or_i32(int x, bool inclusive);` | OR scan across wavefront |
+| `long __ockl_wfscan_or_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_or_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_or_u64(ulong x, bool inclusive);` | |
+| `int __ockl_wfscan_xor_i32(int x, bool inclusive);` | XOR scan across wavefront |
+| `long __ockl_wfscan_xor_i64(long x, bool inclusive);` | |
+| `uint __ockl_wfscan_xor_u32(uint x, bool inclusive);` | |
+| `ulong __ockl_wfscan_xor_u64(ulong x, bool inclusive);` | |
+| `uint __ockl_wfbcast_u32(uint x, uint i);` | Broadcast to wavefront |
+| `ulong __ockl_wfbcast_u64(ulong x, uint i);` | |
+| - | |
+| `bool __ockl_wfany_i32(int e);` | Detect any nonzero across wavefront |
+| `bool __ockl_wfall_i32(int e);` | Detect all nonzero across wavefront |
+| `bool __ockl_wfsame_i32(int e);` | Detect same across wavefront  |
+| - | |
+| `uint __ockl_bfm_u32(uint,uint);` | Bit field mask |
+| `int __ockl_bfe_i32(int, uint, uint);` | Bit field extract |
+| `uint __ockl_bfe_u32(uint,uint,uint);` | |
+| `uint __ockl_bitalign_u32(uint,uint,uint);` | Align on bit boundary |
+| `uint __ockl_bytealign_u32(uint,uint,uint);` | Align on byte boundary |
+| `uint __ockl_lerp_u32(uint,uint,uint);` | Add each byte with prescribed carry |
+| `float __ockl_max3_f32(float,float,float);` | Max of 3 |
+| `half __ockl_max3_f16(half,half,half);` | |
+| `int __ockl_max3_i32(int,int,int);` | |
+| `uint __ockl_max3_u32(uint,uint,uint);` | |
+| `float __ockl_median3_f32(float,float,float);` | Median of 3 |
+| `half __ockl_median3_f16(half,half,half);` | |
+| `int __ockl_median3_i32(int,int,int);` | |
+| `uint __ockl_median3_u32(uint,uint,uint);` | |
+| `float __ockl_min3_f32(float,float,float);` | Min of 3 |
+| `half __ockl_min3_f16(half,half,half);` | |
+| `int __ockl_min3_i32(int,int,int);` | |
+| `uint __ockl_min3_u32(uint,uint,uint);` | |
+| `ulong __ockl_mqsad_u64(ulong, uint, ulong);` | Masked rolling SAD |
+| `uint __ockl_pack_u32(float4);` | Pack vector to bytes |
+| `ulong __ockl_qsad_u64(ulong, uint, ulong);` | Rolling SAD |
+| `uint __ockl_msad_u32(uint,uint,uint);` | Masked SAD |
+| `uint __ockl_sad_u32(uint,uint,uint);` | SAD |
+| `uint __ockl_sadd_u32(uint,uint,uint);` | 32-bit SAD |
+| `uint __ockl_sadhi_u32(uint,uint,uint);` | SAD accumulating to high half |
+| `uint __ockl_sadw_u32(uint,uint,uint);` | 16-bit SAD |
+| `float __ockl_unpack0_f32(uint);` | Extract byte and convert to float |
+| `float __ockl_unpack1_f32(uint);` | |
+| `float __ockl_unpack2_f32(uint);` | |
+| `float __ockl_unpack3_f32(uint);` | |
+| - | |
+| `float4 __ockl_image_load_1D(TSHARP i, int c);` | Load from 1D image |
+| `float4 __ockl_image_load_1Da(TSHARP i, int2 c);` | Load from 1D image array |
+| `float4 __ockl_image_load_1Db(TSHARP i, int c);` | Load from 1D buffered image |
+| `float4 __ockl_image_load_2D(TSHARP i, int2 c);` | Load from 2D image |
+| `float4 __ockl_image_load_2Da(TSHARP i, int4 c);` | Load from 2D image array |
+| `float __ockl_image_load_2Dad(TSHARP i, int4 c);` | Load from 2D depth image array |
+| `float __ockl_image_load_2Dd(TSHARP i, int2 c);` | Load from 2D depth image |
+| `float4 __ockl_image_load_3D(TSHARP i, int4 c);` | Load from 3D image |
+| `float4 __ockl_image_load_CM(TSHARP i, int2 c, int f);` | Load from cubemap |
+| `float4 __ockl_image_load_CMa(TSHARP i, int4 c, int f);` | Load from cubemap array |
+| - | |
+| `float4 __ockl_image_load_mip_1D(TSHARP i, int c, int l);` | Load from mipmapped image |
+| `float4 __ockl_image_load_mip_1Da(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_2D(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_2Da(TSHARP i, int4 c, int l);` | |
+| `float __ockl_image_load_mip_2Dad(TSHARP i, int4 c, int l);` | |
+| `float __ockl_image_load_mip_2Dd(TSHARP i, int2 c, int l);` | |
+| `float4 __ockl_image_load_mip_3D(TSHARP i, int4 c, int l);` | |
+| `float4 __ockl_image_load_mip_CM(TSHARP i, int2 c, int f, int l);` | |
+| `float4 __ockl_image_load_mip_CMa(TSHARP i, int4 c, int f, int l);` | |
+| - | |
+| `half4 __ockl_image_loadh_1D(TSHARP i, int c);` | Load from image returning half precision |
+| `half4 __ockl_image_loadh_1Da(TSHARP i, int2 c);` | |
+| `half4 __ockl_image_loadh_1Db(TSHARP i, int c);` | |
+| `half4 __ockl_image_loadh_2D(TSHARP i, int2 c);` | |
+| `half4 __ockl_image_loadh_2Da(TSHARP i, int4 c);` | |
+| `half4 __ockl_image_loadh_3D(TSHARP i, int4 c);` | |
+| `half4 __ockl_image_loadh_CM(TSHARP i, int2 c, int f);` | |
+| `half4 __ockl_image_loadh_CMa(TSHARP i, int4 c, int f);` | |
+| `half4 __ockl_image_loadh_mip_1D(TSHARP i, int c, int l);` | |
+| `half4 __ockl_image_loadh_mip_1Da(TSHARP i, int2 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_2D(TSHARP i, int2 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_2Da(TSHARP i, int4 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_3D(TSHARP i, int4 c, int l);` | |
+| `half4 __ockl_image_loadh_mip_CM(TSHARP i, int2 c, int f, int l);` | |
+| `half4 __ockl_image_loadh_mip_CMa(TSHARP i, int4 c, int f, int l);` | |
+| - | |
+| `void __ockl_image_store_1D(TSHARP i, int c, float4 p);` | Store to image |
+| `void __ockl_image_store_1Da(TSHARP i, int2 c, float4 p);` | |
+| `void __ockl_image_store_1Db(TSHARP i, int c, float4 p);` | |
+| `void __ockl_image_store_2D(TSHARP i, int2 c, float4 p);` | |
+| `void __ockl_image_store_2Da(TSHARP i, int4 c, float4 p);` | |
+| `void __ockl_image_store_2Dad(TSHARP i, int4 c, float p);` | |
+| `void __ockl_image_store_2Dd(TSHARP i, int2 c, float p);` | |
+| `void __ockl_image_store_3D(TSHARP i, int4 c, float4 p);` | |
+| `void __ockl_image_store_CM(TSHARP i, int2 c, int f, float4 p);` | |
+| `void __ockl_image_store_CMa(TSHARP i, int4 c, int f, float4 p);` | |
+| - | |
+| `void __ockl_image_store_lod_1D(TSHARP i, int c, int l, float4 p);` | Store to level of mipmapped image |
+| `void __ockl_image_store_lod_1Da(TSHARP i, int2 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2D(TSHARP i, int2 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2Da(TSHARP i, int4 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_2Dad(TSHARP i, int4 c, int l, float p);` | |
+| `void __ockl_image_store_lod_2Dd(TSHARP i, int2 c, int l, float p);` | |
+| `void __ockl_image_store_lod_3D(TSHARP i, int4 c, int l, float4 p);` | |
+| `void __ockl_image_store_lod_CM(TSHARP i, int2 c, int f, int l, float4 p);` | |
+| `void __ockl_image_store_lod_CMa(TSHARP i, int4 c, int f, int l, float4 p);` | |
+| - | |
+| `void __ockl_image_storeh_1D(TSHARP i, int c, half4 p);` | Store half precision pixel to image |
+| `void __ockl_image_storeh_1Da(TSHARP i, int2 c, half4 p);` | |
+| `void __ockl_image_storeh_1Db(TSHARP i, int c, half4 p);` | |
+| `void __ockl_image_storeh_2D(TSHARP i, int2 c, half4 p);` | |
+| `void __ockl_image_storeh_2Da(TSHARP i, int4 c, half4 p);` | |
+| `void __ockl_image_storeh_3D(TSHARP i, int4 c, half4 p);` | |
+| `void __ockl_image_storeh_CM(TSHARP i, int2 c, int f, half4 p);` | |
+| `void __ockl_image_storeh_CMa(TSHARP i, int4 c, int f, half4 p);` | |
+| - | |
+| `void __ockl_image_storeh_lod_1D(TSHARP i, int c, int l, half4 p);` | Store half precision pixel to level of mipmapped image |
+| `void __ockl_image_storeh_lod_1Da(TSHARP i, int2 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_2D(TSHARP i, int2 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_2Da(TSHARP i, int4 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_3D(TSHARP i, int4 c, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_CM(TSHARP i, int2 c, int f, int l, half4 p);` | |
+| `void __ockl_image_storeh_lod_CMa(TSHARP i, int4 c, int f, int l, half4 p);` | |
+| - | |
+| `float4 __ockl_image_sample_1D(TSHARP i, SSHARP s, float c);` | Sample image |
+| `float4 __ockl_image_sample_1Da(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_2Da(TSHARP i, SSHARP s, float4 c);` | |
+| `float __ockl_image_sample_2Dad(TSHARP i, SSHARP s, float4 c);` | |
+| `float __ockl_image_sample_2Dd(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_sample_3D(TSHARP i, SSHARP s, float4 c);` | |
+| `float4 __ockl_image_sample_CM(TSHARP i, SSHARP s, float4 c);` | |
+| `float4 __ockl_image_sample_CMa(TSHARP i, SSHARP s, float4 c);` | |
+| - | |
+| `float4 __ockl_image_sample_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient |
+| `float4 __ockl_image_sample_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | |
+| `float4 __ockl_image_sample_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `float4 __ockl_image_sample_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `float __ockl_image_sample_grad_2Dad(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `float __ockl_image_sample_grad_2Dd(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `float4 __ockl_image_sample_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | |
+| - | |
+| `float4 __ockl_image_sample_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD |
+| `float4 __ockl_image_sample_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float __ockl_image_sample_lod_2Dad(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float __ockl_image_sample_lod_2Dd(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `float4 __ockl_image_sample_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float4 __ockl_image_sample_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `float4 __ockl_image_sample_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | |
+| - | |
+| `half4 __ockl_image_sampleh_1D(TSHARP i, SSHARP s, float c);` | Sample image returning half precision |
+| `half4 __ockl_image_sampleh_1Da(TSHARP i, SSHARP s, float2 c);` | |
+| `half4 __ockl_image_sampleh_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `half4 __ockl_image_sampleh_2Da(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_3D(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_CM(TSHARP i, SSHARP s, float4 c);` | |
+| `half4 __ockl_image_sampleh_CMa(TSHARP i, SSHARP s, float4 c);` | |
+| - | |
+| `half4 __ockl_image_sampleh_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient returning half precision |
+| `half4 __ockl_image_sampleh_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | |
+| `half4 __ockl_image_sampleh_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | |
+| `half4 __ockl_image_sampleh_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | |
+| `half4 __ockl_image_sampleh_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | |
+| - | |
+| `half4 __ockl_image_sampleh_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD returning half precision |
+| `half4 __ockl_image_sampleh_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | |
+| `half4 __ockl_image_sampleh_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | |
+| - | |
+| `float4 __ockl_image_gather4r_2D(TSHARP i, SSHARP s, float2 c);` | Gather 2x2 channel from image |
+| `float4 __ockl_image_gather4g_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_gather4b_2D(TSHARP i, SSHARP s, float2 c);` | |
+| `float4 __ockl_image_gather4a_2D(TSHARP i, SSHARP s, float2 c);` | |
+| - | |
+| `int __ockl_image_array_size_1Da(TSHARP i);` | Get image array size |
+| `int __ockl_image_array_size_2Da(TSHARP i);` | |
+| `int __ockl_image_array_size_2Dad(TSHARP i);` | |
+| `int __ockl_image_array_size_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_channel_data_type_1D(TSHARP i);` | Get image channel data type |
+| `int __ockl_image_channel_data_type_1Da(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_1Db(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2D(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Da(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Dad(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_2Dd(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_3D(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_CM(TSHARP i);` | |
+| `int __ockl_image_channel_data_type_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_channel_order_1D(TSHARP i);` | Get image channel order |
+| `int __ockl_image_channel_order_1Da(TSHARP i);` | |
+| `int __ockl_image_channel_order_1Db(TSHARP i);` | |
+| `int __ockl_image_channel_order_2D(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Da(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Dad(TSHARP i);` | |
+| `int __ockl_image_channel_order_2Dd(TSHARP i);` | |
+| `int __ockl_image_channel_order_3D(TSHARP i);` | |
+| `int __ockl_image_channel_order_CM(TSHARP i);` | |
+| `int __ockl_image_channel_order_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_depth_3D(TSHARP i);` | Get 3D image depth |
+| - | |
+| `int __ockl_image_height_2D(TSHARP i);` | Get image height |
+| `int __ockl_image_height_2Da(TSHARP i);` | |
+| `int __ockl_image_height_2Dad(TSHARP i);` | |
+| `int __ockl_image_height_2Dd(TSHARP i);` | |
+| `int __ockl_image_height_3D(TSHARP i);` | |
+| `int __ockl_image_height_CM(TSHARP i);` | |
+| `int __ockl_image_height_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_num_mip_levels_1D(TSHARP i);` | Get number of levels in mipmapped image |
+| `int __ockl_image_num_mip_levels_1Da(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2D(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Da(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Dad(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_2Dd(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_3D(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_CM(TSHARP i);` | |
+| `int __ockl_image_num_mip_levels_CMa(TSHARP i);` | |
+| - | |
+| `int __ockl_image_width_1D(TSHARP i);` | Get image width |
+| `int __ockl_image_width_1Da(TSHARP i);` | |
+| `int __ockl_image_width_1Db(TSHARP i);` | |
+| `int __ockl_image_width_2D(TSHARP i);` | |
+| `int __ockl_image_width_2Da(TSHARP i);` | |
+| `int __ockl_image_width_2Dad(TSHARP i);` | |
+| `int __ockl_image_width_2Dd(TSHARP i);` | |
+| `int __ockl_image_width_3D(TSHARP i);` | |
+| `int __ockl_image_width_CM(TSHARP i);` | |
+| `int __ockl_image_width_CMa(TSHARP i);` | |
+| - | |
+| `size_t __ockl_get_global_offset(uint);` | Get grid global offset (OpenCL) of dimension |
+| `size_t __ockl_get_global_id(uint);` | Get workitem global ID of dimension |
+| `size_t __ockl_get_local_id(uint);` | Get workitem local ID of dimension |
+| `size_t __ockl_get_group_id(uint);` | Get ID of group workitem resides in of dimension |
+| `size_t __ockl_get_global_size(uint);` | Get global size of dimension |
+| `size_t __ockl_get_local_size(uint);` | Get local size of dimension |
+| `size_t __ockl_get_num_groups(uint);` | Get number of groups in dimension |
+| `uint __ockl_get_work_dim(void);` | Get grid number of dimensions |
+| `size_t __ockl_get_enqueued_local_size(uint);` | Get enqueued local size of dimension |
+| `size_t __ockl_get_global_linear_id(void);` | Get global linear ID of workitem |
+| `size_t __ockl_get_local_linear_id(void);` | Get local linear ID of workitem |
+| - | |
+| `bool __ockl_is_local_addr(const void *);` | Test if generic address is local |
+| `bool __ockl_is_private_addr(const void *);` | Test if generic address is private |
+| `__global void * __ockl_to_global(void *);` | Convert generic address to global address |
+| `__local void * __ockl_to_local(void *);` | Convert generic address to local address |
+| `__private void * __ockl_to_private(void *);` | Convert generic address to private address |