diff --git a/CMakeLists.txt b/CMakeLists.txt index f5b85f7d..1008f1a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,9 @@ if (GENERIC_IS_ZERO) set(AMDGPU_TARGET_TRIPLE "amdgcn--amdhsa-amdgizcl") # HCC will execute utils/change-addr-space.sh # and apply utils/add_amdgiz.sed on all .ll files in subdirectory hc/, irif/, opencl/ + if (CUDA_TRIPLE) + set(AMDGPU_TARGET_TRIPLE "amdgcn--cuda") + endif (CUDA_TRIPLE) endif (GENERIC_IS_ZERO) @@ -52,6 +55,9 @@ add_subdirectory(oclc) add_subdirectory(ocml) add_subdirectory(ockl) add_subdirectory(opencl) +if (CUDA_TRIPLE) + add_subdirectory(cuda2gcn) +endif (CUDA_TRIPLE) if(BUILD_HC_LIB) add_subdirectory(hc) diff --git a/cuda2gcn/CMakeLists.txt b/cuda2gcn/CMakeLists.txt new file mode 100644 index 00000000..c2ed32fe --- /dev/null +++ b/cuda2gcn/CMakeLists.txt @@ -0,0 +1,17 @@ +##===-------------------------------------------------------------------------- +## ROCm Device Libraries +## +## This file is distributed under the University of Illinois Open Source +## License. See LICENSE.TXT for details. +##===-------------------------------------------------------------------------- + +file(GLOB cl_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cl +) + +file(GLOB sources ${cl_sources}) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ocml/inc) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../ockl/inc) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../irif/inc) +opencl_bc_lib(cuda2gcn ${sources}) diff --git a/cuda2gcn/src/bitsbytes.cl b/cuda2gcn/src/bitsbytes.cl new file mode 100644 index 00000000..2df61c5a --- /dev/null +++ b/cuda2gcn/src/bitsbytes.cl @@ -0,0 +1,46 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#include "ockl.h" +#include "irif.h" + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_brev +ATTR int __nv_brev(int x) { return __llvm_bitreverse_i32(x); } + +//-------- T __nv_brevll +ATTR long __nv_brevll(long x) { return __llvm_bitreverse_i64(x); } + +//-------- T __nv_clz +ATTR int __nv_clz(int x) +{ + return (int)__ockl_clz_u32((uint)x); +} + +//-------- T __nv_clzll +ATTR int __nv_clzll(long x) +{ + uint xlo = (uint)x; + uint xhi = (uint)(x >> 32); + uint zlo = __ockl_clz_u32(xlo) + 32u; + uint zhi = __ockl_clz_u32(xhi); + return (int)(xhi == 0 ? zlo : zhi); +} + +//-------- T __nv_ffs +ATTR int __nv_ffs(int x) { return (32 - __nv_clz(x&(-x))); } + +//-------- T __nv_ffsll +ATTR int __nv_ffsll(long x) { return (int)(64 - __nv_clzll(x&(-x))); } + +//-------- T __nv_popc +ATTR int __nv_popc(int x) { return __llvm_ctpop_i32(x); } + +//-------- T __nv_popcll +ATTR int __nv_popcll(long x) { return (int)__llvm_ctpop_i64(x); } + diff --git a/cuda2gcn/src/convert.cl b/cuda2gcn/src/convert.cl new file mode 100644 index 00000000..43113915 --- /dev/null +++ b/cuda2gcn/src/convert.cl @@ -0,0 +1,150 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#define ATTR __attribute__((always_inline, const)) + +#define CONVERTM(A,B,m,n) ATTR B __nv_##A##2##B##_##m(A x) \ + { return convert_##B##_##n(x); } + +#define CONVERT(A,B) \ + CONVERTM(A, B, rd, rtn) \ + CONVERTM(A, B, rn, rte) \ + CONVERTM(A, B, ru, rtp) \ + CONVERTM(A, B, rz, rtz) + +//-------- T __nv_double2float_rd +//-------- T __nv_double2float_rn +//-------- T __nv_double2float_ru +//-------- T __nv_double2float_rz +CONVERT(double, float) + +//-------- T __nv_double2int_rd +//-------- T __nv_double2int_rn +//-------- T __nv_double2int_ru +//-------- T __nv_double2int_rz +CONVERT(double, int) + +//-------- T __nv_float2int_rd +//-------- T __nv_float2int_rn +//-------- T __nv_float2int_ru +//-------- T __nv_float2int_rz +CONVERT(float, int) + +//-------- T __nv_int2float_rd +//-------- T __nv_int2float_rn +//-------- T __nv_int2float_ru +//-------- T __nv_int2float_rz +CONVERT(int, float) + +//-------- T __nv_double2uint_rd +//-------- T __nv_double2uint_rn +//-------- T __nv_double2uint_ru +//-------- T __nv_double2uint_rz +CONVERT(double, uint) + +//-------- T __nv_float2uint_rd +//-------- T __nv_float2uint_rn +//-------- T __nv_float2uint_ru +//-------- T __nv_float2uint_rz +CONVERT(float, uint) + +//-------- T __nv_uint2double_rd +//-------- T __nv_uint2double_rn +//-------- T __nv_uint2double_ru +//-------- T __nv_uint2double_rz +CONVERT(uint, double) + +//-------- T __nv_uint2float_rd +//-------- T __nv_uint2float_rn +//-------- T __nv_uint2float_ru +//-------- T __nv_uint2float_rz +CONVERT(uint, float) + +#define CONVERT2LLM(A,B,m,n) ATTR long __nv_##A##2ll_##m(A x) \ + { return convert_long_##n(x); } + +#define CONVERT2LL(A) \ + CONVERT2LLM(A, long, rd, rtn) \ + CONVERT2LLM(A, long, rn, rte) \ + CONVERT2LLM(A, long, ru, rtp) \ + CONVERT2LLM(A, long, rz, rtz) + +//-------- T __nv_double2ll_rd +//-------- T __nv_double2ll_rn +//-------- T __nv_double2ll_ru +//-------- 
T __nv_double2ll_rz +CONVERT2LL(double) + +//-------- T __nv_float2ll_rd +//-------- T __nv_float2ll_rn +//-------- T __nv_float2ll_ru +//-------- T __nv_float2ll_rz +CONVERT2LL(float) + +#define CONVERT2ULLM(A,B,m,n) ATTR ulong __nv_##A##2ull_##m(A x) \ + { return convert_ulong_##n(x); } + +#define CONVERT2ULL(A) \ + CONVERT2ULLM(A, ulong, rd, rtn) \ + CONVERT2ULLM(A, ulong, rn, rte) \ + CONVERT2ULLM(A, ulong, ru, rtp) \ + CONVERT2ULLM(A, ulong, rz, rtz) + +//-------- T __nv_double2ull_rd +//-------- T __nv_double2ull_rn +//-------- T __nv_double2ull_ru +//-------- T __nv_double2ull_rz +CONVERT2ULL(double) + +//-------- T __nv_float2ull_rd +//-------- T __nv_float2ull_rn +//-------- T __nv_float2ull_ru +//-------- T __nv_float2ull_rz +CONVERT2ULL(float) + +#define CONVERT4LLM(A,B,m,n) ATTR B __nv_ll2##B##_##m(long x) \ + { return convert_##B##_##n(x); } + +#define CONVERT4LL(B) \ + CONVERT4LLM(long, B, rd, rtn) \ + CONVERT4LLM(long, B, rn, rte) \ + CONVERT4LLM(long, B, ru, rtp) \ + CONVERT4LLM(long, B, rz, rtz) + +//-------- T __nv_ll2double_rd +//-------- T __nv_ll2double_rn +//-------- T __nv_ll2double_ru +//-------- T __nv_ll2double_rz +CONVERT4LL(double) + +//-------- T __nv_ll2float_rd +//-------- T __nv_ll2float_rn +//-------- T __nv_ll2float_ru +//-------- T __nv_ll2float_rz +CONVERT4LL(float) + +#define CONVERT4ULLM(A,B,m,n) ATTR B __nv_ull2##B##_##m(ulong x) \ + { return convert_##B##_##n(x); } + +#define CONVERT4ULL(B) \ + CONVERT4ULLM(ulong, B, rd, rtn) \ + CONVERT4ULLM(ulong, B, rn, rte) \ + CONVERT4ULLM(ulong, B, ru, rtp) \ + CONVERT4ULLM(ulong, B, rz, rtz) + +//-------- T __nv_ull2double_rd +//-------- T __nv_ull2double_rn +//-------- T __nv_ull2double_ru +//-------- T __nv_ull2double_rz +CONVERT4ULL(double) + +//-------- T __nv_ull2float_rd +//-------- T __nv_ull2float_rn +//-------- T __nv_ull2float_ru +//-------- T __nv_ull2float_rz +CONVERT4ULL(float) + diff --git a/cuda2gcn/src/float.cl b/cuda2gcn/src/float.cl new file mode 100644 index 
00000000..58c8a00b --- /dev/null +++ b/cuda2gcn/src/float.cl @@ -0,0 +1,33 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_finitef +ATTR int __nv_finitef(float x) { return isfinite(x); } + +//-------- T __nv_isfinited +ATTR int __nv_isfinited(double x) { return isfinite(x); } + +//-------- T __nv_isinfd +ATTR int __nv_isinfd(double x) { return isinf(x); } + +//-------- T __nv_isinff +ATTR int __nv_isinff(float x) { return isinf(x); } + +//-------- T __nv_isnand +ATTR int __nv_isnand(double x) { return isnan(x); } + +//-------- T __nv_isnanf +ATTR int __nv_isnanf(float x) { return isnan(x); } + +//-------- T __nv_nan +__attribute__((always_inline, pure)) double __nv_nan(char *tagp) { return __builtin_nan(tagp); } /* pure, not const: the tag string is read through tagp */ + +//-------- T __nv_nanf +__attribute__((always_inline, pure)) float __nv_nanf(char *tagp) { return __builtin_nanf(tagp); } /* float builtin: build the NaN as float instead of narrowing a double NaN; pure because tagp is read */ + diff --git a/cuda2gcn/src/generic.cl b/cuda2gcn/src/generic.cl new file mode 100644 index 00000000..c2a232c9 --- /dev/null +++ b/cuda2gcn/src/generic.cl @@ -0,0 +1,54 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#define ATTR __attribute__((always_inline, const)) + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +//-------- T __nv_abs +ATTR int __nv_abs(int x) { return abs(x); } + +//-------- T __nv_llabs +ATTR long __nv_llabs(long x) { return abs(x); } + +//-------- T __nv_max +ATTR int __nv_max(int a, int b) { return MAX(a,b); } + +//-------- T __nv_llmax +ATTR long __nv_llmax(long a, long b) { return MAX(a,b); } + +//-------- T __nv_ullmax +ATTR ulong __nv_ullmax(ulong a, ulong b) { return MAX(a,b); } + +//-------- T __nv_umax +ATTR uint __nv_umax(uint a, uint b) { return MAX(a,b); } + +//-------- T __nv_min +ATTR int __nv_min(int a, int b) { return MIN(a,b); } + +//-------- T __nv_llmin +ATTR long __nv_llmin(long a, long b) { return MIN(a,b); } + +//-------- T __nv_ullmin +ATTR ulong __nv_ullmin(ulong a, ulong b) { return MIN(a,b); } + +//-------- T __nv_umin +ATTR uint __nv_umin(uint a, uint b) { return MIN(a,b); } + +//-------- T __nv_sad +ATTR uint __nv_sad(int x, int y, uint z) +{ + return (z+abs_diff(x,y)); /* z + |x-y|; abs_diff yields the difference as uint even when x-y would overflow int */ +} + +//-------- T __nv_usad +ATTR uint __nv_usad(uint x, uint y, uint z) +{ + return (z+abs_diff(x,y)); /* z + |x-y|; x-y wraps when x<y and abs() of a uint changes nothing, so use abs_diff */ +} + diff --git a/cuda2gcn/src/half.cl b/cuda2gcn/src/half.cl new file mode 100644 index 00000000..02a26529 --- /dev/null +++ b/cuda2gcn/src/half.cl @@ -0,0 +1,23 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#pragma OPENCL EXTENSION cl_khr_fp16 : enable + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_float2half_rn +ATTR half __nv_float2half_rn(float x) /* ATTR was defined in this file but never applied */ +{ + return (half)x; +} + +//-------- T __nv_half2float +ATTR float __nv_half2float(half x) /* ATTR was defined in this file but never applied */ +{ + return (float)x; +} + diff --git a/cuda2gcn/src/integer.cl b/cuda2gcn/src/integer.cl new file mode 100644 index 00000000..58b8bf5a --- /dev/null +++ b/cuda2gcn/src/integer.cl @@ -0,0 +1,29 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "ockl.h" + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_mul24 +ATTR int __nv_mul24(int x, int y) { return __ockl_mul24_i32(x, y); } + +//-------- T __nv_umul24 +ATTR uint __nv_umul24(uint x, uint y) { return __ockl_mul24_u32(x, y); } + +//-------- T __nv_mul64hi +ATTR long __nv_mul64hi(long x, long y) { return __ockl_mul_hi_i64(x,y); } + +//-------- T __nv_mulhi +ATTR int __nv_mulhi(int x, int y) { return __ockl_mul_hi_i32(x,y); } + +//-------- T __nv_umul64hi +ATTR ulong __nv_umul64hi(ulong x, ulong y) { return __ockl_mul_hi_u64(x,y); } + +//-------- T __nv_umulhi +ATTR uint __nv_umulhi(uint x, uint y) { return __ockl_mul_hi_u32(x,y); } + diff --git a/cuda2gcn/src/math.cl b/cuda2gcn/src/math.cl new file mode 100644 index 00000000..2c4eaf55 --- /dev/null +++ b/cuda2gcn/src/math.cl @@ -0,0 +1,354 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#include "ocml.h" + +#define ATTR __attribute__((always_inline)) + +#define FUNC1D(root) \ + ATTR double __nv_##root(double x) { return __ocml_##root##_f64(x); } +#define FUNC1F(root) \ + ATTR float __nv_##root##f(float x) { return __ocml_##root##_f32(x); } +#define FUNC1(root) FUNC1D(root) FUNC1F(root) + +#define FUNC2D(root) \ + ATTR double __nv_##root(double x, double y) { return __ocml_##root##_f64(x, y); } +#define FUNC2F(root) \ + ATTR float __nv_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); } +#define FUNC2(root) FUNC2D(root) FUNC2F(root) + +#define FUNC3D(root) \ + ATTR double __nv_##root(double x, double y, double z) { return __ocml_##root##_f64(x, y, z); } +#define FUNC3F(root) \ + ATTR float __nv_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); } +#define FUNC3(root) FUNC3D(root) FUNC3F(root) + +//-------- T __nv_acos +//-------- T __nv_acosf +FUNC1(acos) + +//-------- T __nv_acosh +//-------- T __nv_acoshf +FUNC1(acosh) + +//-------- T __nv_asin +//-------- T __nv_asinf +FUNC1(asin) + +//-------- T __nv_asinh +//-------- T __nv_asinhf +FUNC1(asinh) + +//-------- T __nv_atan +//-------- T __nv_atanf +FUNC1(atan) + +//-------- T __nv_atan2 +//-------- T __nv_atan2f +FUNC2(atan2) + +//-------- T __nv_atanh +//-------- T __nv_atanhf +FUNC1(atanh) + +//-------- T __nv_cbrt +//-------- T __nv_cbrtf +FUNC1(cbrt) + +//-------- T __nv_ceil +//-------- T __nv_ceilf +FUNC1(ceil) + +//-------- T __nv_copysign +//-------- T __nv_copysignf +FUNC2(copysign) + +//-------- T __nv_cos +//-------- T __nv_cosf +FUNC1(cos) + +//-------- T __nv_cosh +//-------- T __nv_coshf +FUNC1(cosh) + +//-------- T __nv_cospi +//-------- T __nv_cospif +FUNC1(cospi) + +//-------- T __nv_erf +//-------- T __nv_erff +FUNC1(erf) + +//-------- T __nv_erfc +//-------- T __nv_erfcf +FUNC1(erfc) + +//-------- T __nv_erfcinv +//-------- T __nv_erfcinvf +FUNC1(erfcinv) + 
+//-------- T __nv_erfcx +//-------- T __nv_erfcxf +FUNC1(erfcx) + +//-------- T __nv_erfinv +//-------- T __nv_erfinvf +FUNC1(erfinv) + +//-------- T __nv_exp +//-------- T __nv_expf +FUNC1(exp) + +//-------- T __nv_exp10 +//-------- T __nv_exp10f +FUNC1(exp10) + +//-------- T __nv_exp2 +//-------- T __nv_exp2f +FUNC1(exp2) + +//-------- T __nv_expm1 +//-------- T __nv_expm1f +FUNC1(expm1) + +//-------- T __nv_fabs +//-------- T __nv_fabsf +FUNC1(fabs) + +//-------- T __nv_fdim +//-------- T __nv_fdimf +FUNC2(fdim) + +//-------- T __nv_floor +//-------- T __nv_floorf +FUNC1(floor) + +//-------- T __nv_fma +//-------- T __nv_fmaf +FUNC3(fma) + +//-------- T __nv_fmax +//-------- T __nv_fmaxf +FUNC2(fmax) + +//-------- T __nv_fmin +//-------- T __nv_fminf +FUNC2(fmin) + +//-------- T __nv_fmod +//-------- T __nv_fmodf +FUNC2(fmod) + +//-------- T __nv_hypot +//-------- T __nv_hypotf +FUNC2(hypot) + +//-------- T __nv_j0 +//-------- T __nv_j0f +FUNC1(j0) + +//-------- T __nv_j1 +//-------- T __nv_j1f +FUNC1(j1) + +//-------- T __nv_lgamma +//-------- T __nv_lgammaf +FUNC1(lgamma) + +//-------- T __nv_log +//-------- T __nv_logf +FUNC1(log) + +//-------- T __nv_log10 +//-------- T __nv_log10f +FUNC1(log10) + +//-------- T __nv_log1p +//-------- T __nv_log1pf +FUNC1(log1p) + +//-------- T __nv_log2 +//-------- T __nv_log2f +FUNC1(log2) + +//-------- T __nv_logb +//-------- T __nv_logbf +FUNC1(logb) + +//-------- T __nv_pow +//-------- T __nv_powf +FUNC2(pow) + +//-------- T __nv_rcbrt +//-------- T __nv_rcbrtf +FUNC1(rcbrt) + +//-------- T __nv_remainder +//-------- T __nv_remainderf +FUNC2(remainder) + +//-------- T __nv_rhypot +//-------- T __nv_rhypotf +FUNC2(rhypot) + +//-------- T __nv_nearbyint +//-------- T __nv_nearbyintf +FUNC1(nearbyint) + +//-------- T __nv_nextafter +//-------- T __nv_nextafterf +FUNC2(nextafter) + +//-------- T __nv_rint +//-------- T __nv_rintf +FUNC1(rint) + +//-------- T __nv_round +//-------- T __nv_roundf +FUNC1(round) + +//-------- T 
__nv_rsqrt +//-------- T __nv_rsqrtf +FUNC1(rsqrt) + +//-------- T __nv_scalbn +//-------- T __nv_scalbnf +FUNC2(scalbn) + +//-------- T __nv_sin +//-------- T __nv_sinf +FUNC1(sin) + +//-------- T __nv_sinh +//-------- T __nv_sinhf +FUNC1(sinh) + +//-------- T __nv_sinpi +//-------- T __nv_sinpif +FUNC1(sinpi) + +//-------- T __nv_sqrt +//-------- T __nv_sqrtf +FUNC1(sqrt) + +//-------- T __nv_tan +//-------- T __nv_tanf +FUNC1(tan) + +//-------- T __nv_tanh +//-------- T __nv_tanhf +FUNC1(tanh) + +//-------- T __nv_tgamma +//-------- T __nv_tgammaf +FUNC1(tgamma) + +//-------- T __nv_trunc +//-------- T __nv_truncf +FUNC1(trunc) + +//-------- T __nv_y0 +//-------- T __nv_y0f +FUNC1(y0) + +//-------- T __nv_y1 +//-------- T __nv_y1f +FUNC1(y1) + +//-------- T __nv_cyl_bessel_i0 +ATTR double __nv_cyl_bessel_i0(double x) { return __ocml_i0_f64(x); } + +//-------- T __nv_cyl_bessel_i0f +ATTR float __nv_cyl_bessel_i0f(float x) { return __ocml_i0_f32(x); } + +//-------- T __nv_cyl_bessel_i1 +ATTR double __nv_cyl_bessel_i1(double x) { return __ocml_i1_f64(x); } + +//-------- T __nv_cyl_bessel_i1f +ATTR float __nv_cyl_bessel_i1f(float x) { return __ocml_i1_f32(x); } + +//-------- T __nv_frexp +ATTR double __nv_frexp(double x, __private int *ptr) { return __ocml_frexp_f64(x, ptr); } + +//-------- T __nv_frexpf +ATTR float __nv_frexpf(float x, __private int *ptr) { return __ocml_frexp_f32(x, ptr); } + +//-------- T __nv_ilogb +ATTR int __nv_ilogb(double x) { return __ocml_ilogb_f64(x); } + +//-------- T __nv_ilogbf +ATTR int __nv_ilogbf(float x) { return __ocml_ilogb_f32(x); } + +//-------- T __nv_ldexp +ATTR double __nv_ldexp(double x, int i) { return __ocml_ldexp_f64(x, i); } + +//-------- T __nv_ldexpf +ATTR float __nv_ldexpf(float x, int i) { return __ocml_ldexp_f32(x, i); } + +//-------- T __nv_modf +ATTR double __nv_modf(double x, __private double *ptr) { return __ocml_modf_f64(x, ptr); } + +//-------- T __nv_modff +ATTR float __nv_modff(float x, __private float 
*ptr) { return __ocml_modf_f32(x, ptr); } + +//-------- T __nv_norm3d +ATTR double __nv_norm3d(double x, double y, double z) { return __ocml_len3_f64(x,y,z); } + +//-------- T __nv_norm3df +ATTR float __nv_norm3df(float x, float y, float z) { return __ocml_len3_f32(x,y,z); } + +//-------- T __nv_norm4d +ATTR double __nv_norm4d(double a, double b, double c, double d) { return __ocml_len4_f64(a,b,c,d); } + +//-------- T __nv_norm4df +ATTR float __nv_norm4df(float a, float b, float c, float d) { return __ocml_len4_f32(a,b,c,d); } + +//-------- T __nv_normcdf +ATTR double __nv_normcdf(double x) { return __ocml_ncdf_f64(x); } + +//-------- T __nv_normcdff +ATTR float __nv_normcdff(float x) { return __ocml_ncdf_f32(x); } + +//-------- T __nv_normcdfinv +ATTR double __nv_normcdfinv(double x) { return __ocml_ncdfinv_f64(x); } + +//-------- T __nv_normcdfinvf +ATTR float __nv_normcdfinvf(float x) { return __ocml_ncdfinv_f32(x); } + +//-------- T __nv_powi +ATTR double __nv_powi(double x, int n) { return __ocml_pown_f64(x, n); } + +//-------- T __nv_powif +ATTR float __nv_powif(float x, int n) { return __ocml_pown_f32(x, n); } + +//-------- T __nv_remquo +ATTR double __nv_remquo(double x, double y, __private int *ptr) { return __ocml_remquo_f64(x, y, ptr); } + +//-------- T __nv_remquof +ATTR float __nv_remquof(float x, float y, __private int *ptr) { return __ocml_remquo_f32(x, y, ptr); } + +//-------- T __nv_saturatef +ATTR float __nv_saturatef(float x) { return __ocml_min_f32(__ocml_max_f32(x, 0.0f), 1.0f); } + +//-------- T __nv_signbitd +ATTR int __nv_signbitd(double x) { return __ocml_signbit_f64(x); } + +//-------- T __nv_signbitf +ATTR int __nv_signbitf(float x) { return __ocml_signbit_f32(x); } + +//-------- T __nv_sincos +ATTR void __nv_sincos(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincos_f64(x, cptr); } + +//-------- T __nv_sincosf +ATTR void __nv_sincosf(float x, __private float * sptr, __private float *cptr) { 
(*sptr)=__ocml_sincos_f32(x, cptr); } + +//-------- T __nv_sincospi +ATTR void __nv_sincospi(double x, __private double * sptr, __private double *cptr) { (*sptr)=__ocml_sincospi_f64(x, cptr); } + +//-------- T __nv_sincospif +ATTR void __nv_sincospif(float x, __private float * sptr, __private float *cptr) { (*sptr)=__ocml_sincospi_f32(x, cptr); } /* sine is stored through sptr, cosine through cptr; name was misspelled __nv_sincosfpif and did not match the marker above */ + diff --git a/cuda2gcn/src/precision.cl b/cuda2gcn/src/precision.cl new file mode 100644 index 00000000..21a13d6e --- /dev/null +++ b/cuda2gcn/src/precision.cl @@ -0,0 +1,56 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "ocml.h" + +#define ATTR __attribute__((always_inline)) + +#define FUNC1F(root) \ + ATTR float __nv_fast_##root##f(float x) { return __ocml_##root##_f32(x); } +#define FUNC1(root) FUNC1F(root) + +#define FUNC2F(root) \ + ATTR float __nv_fast_##root##f(float x, float y) { return __ocml_##root##_f32(x, y); } +#define FUNC2(root) FUNC2F(root) + +#define FUNC3F(root) \ + ATTR float __nv_fast_##root##f(float x, float y, float z) { return __ocml_##root##_f32(x, y, z); } +#define FUNC3(root) FUNC3F(root) + +//-------- T __nv_fast_cosf +FUNC1(cos) + +//-------- T __nv_fast_exp10f +FUNC1(exp10) + +//-------- T __nv_fast_expf +FUNC1(exp) + +//-------- T __nv_fast_log10f +FUNC1(log10) + +//-------- T __nv_fast_log2f +FUNC1(log2) + +//-------- T __nv_fast_logf +FUNC1(log) + +//-------- T __nv_fast_powf +FUNC2(pow) + +//-------- T __nv_fast_sinf +FUNC1(sin) + +//-------- T __nv_fast_tanf +FUNC1(tan) + +//-------- T __nv_fast_fdividef +ATTR float __nv_fast_fdividef(float x, float y) { return native_divide(x, y); } + +//-------- T __nv_fast_sincosf +ATTR void __nv_fast_sincosf(float x, __private float * sptr, __private float *cptr) { 
(*sptr)=__ocml_sincos_f32(x, cptr); } + diff --git a/cuda2gcn/src/reinterpret.cl b/cuda2gcn/src/reinterpret.cl new file mode 100644 index 00000000..0d55cded --- /dev/null +++ b/cuda2gcn/src/reinterpret.cl @@ -0,0 +1,63 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_double_as_longlong +ATTR long __nv_double_as_longlong(double x) +{ + return as_long(x); +} + +//-------- T __nv_float_as_int +ATTR int __nv_float_as_int(float x) +{ + return as_int(x); +} + +//-------- T __nv_float_as_uint +ATTR unsigned int __nv_float_as_uint(float x) +{ + return as_uint(x); +} + +//-------- T __nv_int_as_float +ATTR float __nv_int_as_float(int x) +{ + return as_float(x); +} + +//-------- T __nv_longlong_as_double +ATTR double __nv_longlong_as_double(long x) +{ + return as_double(x); +} + +//-------- T __nv_uint_as_float +ATTR float __nv_uint_as_float(unsigned int x) +{ + return as_float(x); +} + +//-------- T __nv_double2hiint +ATTR int __nv_double2hiint(double x) +{ + return (int)(as_long(x) >> 32); /* upper 32 bits of the double; shift before narrowing, because the cast binds tighter than >> and the old form shifted a 32-bit int by 32 */ +} + +//-------- T __nv_double2loint +ATTR int __nv_double2loint(double x) +{ + return (int) as_long(x); /* lower 32 bits of the double */ +} + +//-------- T __nv_hiloint2double +ATTR double __nv_hiloint2double(int x, int y) +{ + return as_double(((ulong)(uint)x << 32) | (uint)y); /* x is the high word, y the low word; y is zero-extended so a negative low word cannot smear sign bits over x */ +} + diff --git a/cuda2gcn/src/rounding.cl b/cuda2gcn/src/rounding.cl new file mode 100644 index 00000000..a377e39d --- /dev/null +++ b/cuda2gcn/src/rounding.cl @@ -0,0 +1,23 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#include "ocml.h" + +#define ATTR __attribute__((always_inline, const)) + +//-------- T __nv_llrint +ATTR long __nv_llrint(double x) { return (long)__ocml_rint_f64(x); } + +//-------- T __nv_llrintf +ATTR long __nv_llrintf(float x) { return (long)__ocml_rint_f32(x); } + +//-------- T __nv_llround +ATTR long __nv_llround(double x) { return (long)__ocml_round_f64(x); } + +//-------- T __nv_llroundf +ATTR long __nv_llroundf(float x) { return (long)__ocml_round_f32(x); } + diff --git a/doc/OCKL.md b/doc/OCKL.md new file mode 100644 index 00000000..169f511e --- /dev/null +++ b/doc/OCKL.md @@ -0,0 +1,412 @@ +# OCKL User Guide + +* [Introduction](#introduction) + * [What Is OCKL](#what-is-ockl) +* [Using OCKL](#using-ockl) + * [Standard Usage](#standard-usage) + * [Controls](#controls) +* [Versioning](#versioning) +* [Naming convention](#naming-convention) +* [Supported functions](#supported-functions) + + +## Introduction +### What Is OCKL + +OCKL is an LLVM-IR bitcode library designed to provide access to certain hardware +and compiler capabilities needed by language runtimes. It should rarely be necessary +to call any of these functions directly from application code. Consider this library +a "detail" layer. + +## Using OCKL +### Standard Usage + +OCKL is expected to be used in a standard LLVM compilation flow as follows: + * Compile source modules to LLVM-IR bitcode (clang) + * Link together program bitcode with library bitcode including OCKL and OCLC. + * Run generic optimizations (opt) + * Code generation (llc) + +### Controls + +OCKL supports a number of controls that are provided by linking in specifically named inline +functions. These functions are inlined at optimization time and result in specific paths +taken with no control flow overhead. 
These functions all have the form (in C) + + __attribute__((always_inline, const)) int + __oclc_control(void) + { return 1; } // or 0 to disable + +The currently supported controls are + * `finite_only_opt` - floating point Inf and NaN are never expected to be consumed or produced + * `unsafe_math_opt` - lower accuracy results may be produced with higher performance + * `daz_opt` - subnormal values consumed and produced may be flushed to zero + * `correctly_rounded_sqrt32` - float square root must be correctly rounded + * `ISA_version` - an integer representation of the ISA version of the target device + +### Versioning + +OCKL usually ships as a single LLVM-IR bitcode file named + + ockl-{LLVM rev}-{OCKL rev}.bc + +where `{LLVM rev}` is the version of LLVM used to create the file, of the +form X.Y, e.g. 3.8, and `{OCKL rev}` is the OCKL library version of the form X.Y, currently 0.9. + +### Naming convention + +OCKL functions follow a simple naming convention: + + __ockl_{function}_{type suffix} + +where {type suffix} generally indicates the type of the arguments and/or returned result using a type letter, +e.g. "u" for unsigned integer, and a bit width, e.g. 32. 
+ +### Supported functions + +The following table lists the available functions along with a brief description of each: + +| **function** | **Brief Description** | +| :--- | :--- | +| `uchar __ockl_clz_u8(uchar);` | Count leading zeroes | +| `ushort __ockl_clz_u16(ushort);` | | +| `uint __ockl_clz_u32(uint);` | | +| `ulong __ockl_clz_u64(ulong);` | | +| - | | +| `uchar __ockl_ctz_u8(uchar);` | Count trailing zeroes | +| `ushort __ockl_ctz_u16(ushort);` | | +| `uint __ockl_ctz_u32(uint);` | | +| `ulong __ockl_ctz_u64(ulong);` | | +| - | | +| `uint __ockl_popcount_u32(uint);` | Count nonzero bits | +| `ulong __ockl_popcount_u64(ulong);` | | +| - | | +| `int __ockl_add_sat_i32(int,int);` | Add with saturation | +| `uint __ockl_add_sat_u32(uint,uint);` | | +| `long __ockl_add_sat_i64(long,long);` | | +| `ulong __ockl_add_sat_u64(ulong,ulong);` | | +| - | | +| `int __ockl_sub_sat_i32(int,int);` | Subtract with saturation | +| `uint __ockl_sub_sat_u32(uint,uint);` | | +| `long __ockl_sub_sat_i64(long,long);` | | +| `ulong __ockl_sub_sat_u64(ulong,ulong);` | | +| - | | +| `int __ockl_mul_hi_i32(int,int);` | High part of multiplication | +| `uint __ockl_mul_hi_u32(uint,uint);` | | +| `long __ockl_mul_hi_i64(long,long);` | | +| `ulong __ockl_mul_hi_u64(ulong,ulong);` | | +| - | | +| `int __ockl_mul24_i32(int,int);` | Multiply assuming operands fit in 24 bits | +| `uint __ockl_mul24_u32(uint,uint);` | | +| - | | +| `uint __ockl_activelane_u32(void);` | Index of current lane counting only active lanes in wavefront | +| - | | +| `half __ockl_wfred_add_f16(half x);` | ADD reduction across wavefront | +| `float __ockl_wfred_add_f32(float x);` | | +| `double __ockl_wfred_add_f64(double x);` | | +| `int __ockl_wfred_add_i32(int x);` | | +| `long __ockl_wfred_add_i64(long x);` | | +| `uint __ockl_wfred_add_u32(uint x);` | | +| `ulong __ockl_wfred_add_u64(ulong x);` | | +| `int __ockl_wfred_and_i32(int x);` | AND reduction across wavefront | +| `long __ockl_wfred_and_i64(long x);` 
| | +| `uint __ockl_wfred_and_u32(uint x);` | | +| `ulong __ockl_wfred_and_u64(ulong x);` | | +| `half __ockl_wfred_max_f16(half x);` | MAX reduction across wavefront | +| `float __ockl_wfred_max_f32(float x);` | | +| `double __ockl_wfred_max_f64(double x);` | | +| `int __ockl_wfred_max_i32(int x);` | | +| `long __ockl_wfred_max_i64(long x);` | | +| `uint __ockl_wfred_max_u32(uint x);` | | +| `ulong __ockl_wfred_max_u64(ulong x);` | | +| `half __ockl_wfred_min_f16(half x);` | MIN reduction across wavefront | +| `float __ockl_wfred_min_f32(float x);` | | +| `double __ockl_wfred_min_f64(double x);` | | +| `int __ockl_wfred_min_i32(int x);` | | +| `long __ockl_wfred_min_i64(long x);` | | +| `uint __ockl_wfred_min_u32(uint x);` | | +| `ulong __ockl_wfred_min_u64(ulong x);` | | +| `int __ockl_wfred_or_i32(int x);` | OR reduction across wavefront | +| `long __ockl_wfred_or_i64(long x);` | | +| `uint __ockl_wfred_or_u32(uint x);` | | +| `ulong __ockl_wfred_or_u64(ulong x);` | | +| `int __ockl_wfred_xor_i32(int x);` | XOR reduction across wavefront | +| `long __ockl_wfred_xor_i64(long x);` | | +| `uint __ockl_wfred_xor_u32(uint x);` | | +| `ulong __ockl_wfred_xor_u64(ulong x);` | | +| `half __ockl_wfscan_add_f16(half x, bool inclusive);` | ADD scan across wavefront | +| `float __ockl_wfscan_add_f32(float x, bool inclusive);` | | +| `double __ockl_wfscan_add_f64(double x, bool inclusive);` | | +| `int __ockl_wfscan_add_i32(int x, bool inclusive);` | | +| `long __ockl_wfscan_add_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_add_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_add_u64(ulong x, bool inclusive);` | | +| `int __ockl_wfscan_and_i32(int x, bool inclusive);` | AND scan across wavefront | +| `long __ockl_wfscan_and_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_and_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_and_u64(ulong x, bool inclusive);` | | +| `half __ockl_wfscan_max_f16(half x, bool inclusive);` | MAX scan across 
wavefront | +| `float __ockl_wfscan_max_f32(float x, bool inclusive);` | | +| `double __ockl_wfscan_max_f64(double x, bool inclusive);` | | +| `int __ockl_wfscan_max_i32(int x, bool inclusive);` | | +| `long __ockl_wfscan_max_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_max_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_max_u64(ulong x, bool inclusive);` | | +| `half __ockl_wfscan_min_f16(half x, bool inclusive);` | MIN scan across wavefront | +| `float __ockl_wfscan_min_f32(float x, bool inclusive);` | | +| `double __ockl_wfscan_min_f64(double x, bool inclusive);` | | +| `int __ockl_wfscan_min_i32(int x, bool inclusive);` | | +| `long __ockl_wfscan_min_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_min_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_min_u64(ulong x, bool inclusive);` | | +| `int __ockl_wfscan_or_i32(int x, bool inclusive);` | OR scan across wavefront | +| `long __ockl_wfscan_or_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_or_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_or_u64(ulong x, bool inclusive);` | | +| `int __ockl_wfscan_xor_i32(int x, bool inclusive);` | XOR scan across wavefront | +| `long __ockl_wfscan_xor_i64(long x, bool inclusive);` | | +| `uint __ockl_wfscan_xor_u32(uint x, bool inclusive);` | | +| `ulong __ockl_wfscan_xor_u64(ulong x, bool inclusive);` | | +| `uint __ockl_wfbcast_u32(uint x, uint i);` | Broadcast to wavefront | +| `ulong __ockl_wfbcast_u64(ulong x, uint i);` | | +| - | | +| `bool __ockl_wfany_i32(int e);` | Detect any nonzero across wavefront | +| `bool __ockl_wfall_i32(int e);` | Detect all nonzero across wavefront | +| `bool __ockl_wfsame_i32(int e);` | Detect same across wavefront | +| - | | +| `uint __ockl_bfm_u32(uint,uint);` | Bit field mask | +| `int __ockl_bfe_i32(int, uint, uint);` | Bit field extract | +| `uint __ockl_bfe_u32(uint,uint,uint);` | | +| `uint __ockl_bitalign_u32(uint,uint,uint);` | Align on bit boundary | +| `uint 
__ockl_bytealign_u32(uint,uint,uint);` | Align on byte boundary | +| `uint __ockl_lerp_u32(uint,uint,uint);` | Add each byte with prescribed carry | +| `float __ockl_max3_f32(float,float,float);` | Max of 3 | +| `half __ockl_max3_f16(half,half,half);` | | +| `int __ockl_max3_i32(int,int,int);` | | +| `uint __ockl_max3_u32(uint,uint,uint);` | | +| `float __ockl_median3_f32(float,float,float);` | Median of 3 | +| `half __ockl_median3_f16(half,half,half);` | | +| `int __ockl_median3_i32(int,int,int);` | | +| `uint __ockl_median3_u32(uint,uint,uint);` | | +| `float __ockl_min3_f32(float,float,float);` | Min of 3 | +| `half __ockl_min3_f16(half,half,half);` | | +| `int __ockl_min3_i32(int,int,int);` | | +| `uint __ockl_min3_u32(uint,uint,uint);` | | +| `ulong __ockl_mqsad_u64(ulong, uint, ulong);` | Masked rolling SAD | +| `uint __ockl_pack_u32(float4);` | Pack vector to bytes | +| `ulong __ockl_qsad_u64(ulong, uint, ulong);` | Rolling SAD | +| `uint __ockl_msad_u32(uint,uint,uint);` | Masked SAD | +| `uint __ockl_sad_u32(uint,uint,uint);` | SAD | +| `uint __ockl_sadd_u32(uint,uint,uint);` | 32-bit SAD | +| `uint __ockl_sadhi_u32(uint,uint,uint);` | SAD accumulating to high half | +| `uint __ockl_sadw_u32(uint,uint,uint);` | 16-bit SAD | +| `float __ockl_unpack0_f32(uint);` | Extract byte and convert to float | +| `float __ockl_unpack1_f32(uint);` | | +| `float __ockl_unpack2_f32(uint);` | | +| `float __ockl_unpack3_f32(uint);` | | +| - | | +| `float4 __ockl_image_load_1D(TSHARP i, int c);` | Load from 1D image | +| `float4 __ockl_image_load_1Da(TSHARP i, int2 c);` | Load from 1D image array | +| `float4 __ockl_image_load_1Db(TSHARP i, int c);` | Load from 1D buffered image | +| `float4 __ockl_image_load_2D(TSHARP i, int2 c);` | Load from 2D image | +| `float4 __ockl_image_load_2Da(TSHARP i, int4 c);` | Load from 2D image array | +| `float __ockl_image_load_2Dad(TSHARP i, int4 c);` | Load from 2D depth image array | +| `float __ockl_image_load_2Dd(TSHARP i, int2 c);` |
Load from 2D depth image | +| `float4 __ockl_image_load_3D(TSHARP i, int4 c);` | Load from 3D image | +| `float4 __ockl_image_load_CM(TSHARP i, int2 c, int f);` | Load from cubemap | +| `float4 __ockl_image_load_CMa(TSHARP i, int4 c, int f);` | Load from cubemap array | +| - | | +| `float4 __ockl_image_load_mip_1D(TSHARP i, int c, int l);` | Load from mipmapped image | +| `float4 __ockl_image_load_mip_1Da(TSHARP i, int2 c, int l);` | | +| `float4 __ockl_image_load_mip_2D(TSHARP i, int2 c, int l);` | | +| `float4 __ockl_image_load_mip_2Da(TSHARP i, int4 c, int l);` | | +| `float __ockl_image_load_mip_2Dad(TSHARP i, int4 c, int l);` | | +| `float __ockl_image_load_mip_2Dd(TSHARP i, int2 c, int l);` | | +| `float4 __ockl_image_load_mip_3D(TSHARP i, int4 c, int l);` | | +| `float4 __ockl_image_load_mip_CM(TSHARP i, int2 c, int f, int l);` | | +| `float4 __ockl_image_load_mip_CMa(TSHARP i, int4 c, int f, int l);` | | +| - | | +| `half4 __ockl_image_loadh_1D(TSHARP i, int c);` | Load from image returning half precision | +| `half4 __ockl_image_loadh_1Da(TSHARP i, int2 c);` | | +| `half4 __ockl_image_loadh_1Db(TSHARP i, int c);` | | +| `half4 __ockl_image_loadh_2D(TSHARP i, int2 c);` | | +| `half4 __ockl_image_loadh_2Da(TSHARP i, int4 c);` | | +| `half4 __ockl_image_loadh_3D(TSHARP i, int4 c);` | | +| `half4 __ockl_image_loadh_CM(TSHARP i, int2 c, int f);` | | +| `half4 __ockl_image_loadh_CMa(TSHARP i, int4 c, int f);` | | +| `half4 __ockl_image_loadh_mip_1D(TSHARP i, int c, int l);` | | +| `half4 __ockl_image_loadh_mip_1Da(TSHARP i, int2 c, int l);` | | +| `half4 __ockl_image_loadh_mip_2D(TSHARP i, int2 c, int l);` | | +| `half4 __ockl_image_loadh_mip_2Da(TSHARP i, int4 c, int l);` | | +| `half4 __ockl_image_loadh_mip_3D(TSHARP i, int4 c, int l);` | | +| `half4 __ockl_image_loadh_mip_CM(TSHARP i, int2 c, int f, int l);` | | +| `half4 __ockl_image_loadh_mip_CMa(TSHARP i, int4 c, int f, int l);` | | +| - | | +| `void __ockl_image_store_1D(TSHARP i, int c, float4 p);` | 
Store to image | +| `void __ockl_image_store_1Da(TSHARP i, int2 c, float4 p);` | | +| `void __ockl_image_store_1Db(TSHARP i, int c, float4 p);` | | +| `void __ockl_image_store_2D(TSHARP i, int2 c, float4 p);` | | +| `void __ockl_image_store_2Da(TSHARP i, int4 c, float4 p);` | | +| `void __ockl_image_store_2Dad(TSHARP i, int4 c, float p);` | | +| `void __ockl_image_store_2Dd(TSHARP i, int2 c, float p);` | | +| `void __ockl_image_store_3D(TSHARP i, int4 c, float4 p);` | | +| `void __ockl_image_store_CM(TSHARP i, int2 c, int f, float4 p);` | | +| `void __ockl_image_store_CMa(TSHARP i, int4 c, int f, float4 p);` | | +| `void __ockl_image_store_lod_1D(TSHARP i, int c, int l, float4 p);` | Store to level of mipmapped image | +| - | | +| `void __ockl_image_store_lod_1Da(TSHARP i, int2 c, int l, float4 p);` | | +| `void __ockl_image_store_lod_2D(TSHARP i, int2 c, int l, float4 p);` | | +| `void __ockl_image_store_lod_2Da(TSHARP i, int4 c, int l, float4 p);` | | +| `void __ockl_image_store_lod_2Dad(TSHARP i, int4 c, int l, float p);` | | +| `void __ockl_image_store_lod_2Dd(TSHARP i, int2 c, int l, float p);` | | +| `void __ockl_image_store_lod_3D(TSHARP i, int4 c, int l, float4 p);` | | +| `void __ockl_image_store_lod_CM(TSHARP i, int2 c, int f, int l, float4 p);` | | +| `void __ockl_image_store_lod_CMa(TSHARP i, int4 c, int f, int l, float4 p);` | | +| - | | +| `void __ockl_image_storeh_1D(TSHARP i, int c, half4 p);` | Store half precision pixel to image| +| `void __ockl_image_storeh_1Da(TSHARP i, int2 c, half4 p);` | | +| `void __ockl_image_storeh_1Db(TSHARP i, int c, half4 p);` | | +| `void __ockl_image_storeh_2D(TSHARP i, int2 c, half4 p);` | | +| `void __ockl_image_storeh_2Da(TSHARP i, int4 c, half4 p);` | | +| `void __ockl_image_storeh_3D(TSHARP i, int4 c, half4 p);` | | +| `void __ockl_image_storeh_CM(TSHARP i, int2 c, int f, half4 p);` | | +| `void __ockl_image_storeh_CMa(TSHARP i, int4 c, int f, half4 p);` | | +| - | | +| `void __ockl_image_storeh_lod_1D(TSHARP i, 
int c, int l, half4 p);` | Store half precision pixel to level of mipmapped image | +| `void __ockl_image_storeh_lod_1Da(TSHARP i, int2 c, int l, half4 p);` | | +| `void __ockl_image_storeh_lod_2D(TSHARP i, int2 c, int l, half4 p);` | | +| `void __ockl_image_storeh_lod_2Da(TSHARP i, int4 c, int l, half4 p);` | | +| `void __ockl_image_storeh_lod_3D(TSHARP i, int4 c, int l, half4 p);` | | +| `void __ockl_image_storeh_lod_CM(TSHARP i, int2 c, int f, int l, half4 p);` | | +| `void __ockl_image_storeh_lod_CMa(TSHARP i, int4 c, int f, int l, half4 p);` | | +| - | | +| `float4 __ockl_image_sample_1D(TSHARP i, SSHARP s, float c);` | Sample image | +| `float4 __ockl_image_sample_1Da(TSHARP i, SSHARP s, float2 c);` | | +| `float4 __ockl_image_sample_2D(TSHARP i, SSHARP s, float2 c);` | | +| `float4 __ockl_image_sample_2Da(TSHARP i, SSHARP s, float4 c);` | | +| `float __ockl_image_sample_2Dad(TSHARP i, SSHARP s, float4 c);` | | +| `float __ockl_image_sample_2Dd(TSHARP i, SSHARP s, float2 c);` | | +| `float4 __ockl_image_sample_3D(TSHARP i, SSHARP s, float4 c);` | | +| `float4 __ockl_image_sample_CM(TSHARP i, SSHARP s, float4 c);` | | +| `float4 __ockl_image_sample_CMa(TSHARP i, SSHARP s, float4 c);` | | +| - | | +| `float4 __ockl_image_sample_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient | +| `float4 __ockl_image_sample_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | | +| `float4 __ockl_image_sample_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | +| `float4 __ockl_image_sample_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | +| `float __ockl_image_sample_grad_2Dad(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | +| `float __ockl_image_sample_grad_2Dd(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | +| `float4 __ockl_image_sample_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | | +| - | | +| `float4 __ockl_image_sample_lod_1D(TSHARP i, SSHARP 
s, float c, float l);` | Sample mipmapped image using LOD | +| `float4 __ockl_image_sample_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | | +| `float4 __ockl_image_sample_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | | +| `float4 __ockl_image_sample_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | | +| `float __ockl_image_sample_lod_2Dad(TSHARP i, SSHARP s, float4 c, float l);` | | +| `float __ockl_image_sample_lod_2Dd(TSHARP i, SSHARP s, float2 c, float l);` | | +| `float4 __ockl_image_sample_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | | +| `float4 __ockl_image_sample_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | | +| `float4 __ockl_image_sample_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | | +| - | | +| `half4 __ockl_image_sampleh_1D(TSHARP i, SSHARP s, float c);` | Sample image returning half precision | +| `half4 __ockl_image_sampleh_1Da(TSHARP i, SSHARP s, float2 c);` | | +| `half4 __ockl_image_sampleh_2D(TSHARP i, SSHARP s, float2 c);` | | +| `half4 __ockl_image_sampleh_2Da(TSHARP i, SSHARP s, float4 c);` | | +| `half4 __ockl_image_sampleh_3D(TSHARP i, SSHARP s, float4 c);` | | +| `half4 __ockl_image_sampleh_CM(TSHARP i, SSHARP s, float4 c);` | | +| `half4 __ockl_image_sampleh_CMa(TSHARP i, SSHARP s, float4 c);` | | +| - | | +| `half4 __ockl_image_sampleh_grad_1D(TSHARP i, SSHARP s, float c, float dx, float dy);` | Sample mipmapped image using gradient returning half precision | +| `half4 __ockl_image_sampleh_grad_1Da(TSHARP i, SSHARP s, float2 c, float dx, float dy);` | | +| `half4 __ockl_image_sampleh_grad_2D(TSHARP i, SSHARP s, float2 c, float2 dx, float2 dy);` | | +| `half4 __ockl_image_sampleh_grad_2Da(TSHARP i, SSHARP s, float4 c, float2 dx, float2 dy);` | | +| `half4 __ockl_image_sampleh_grad_3D(TSHARP i, SSHARP s, float4 c, float4 dx, float4 dy);` | | +| - | | +| `half4 __ockl_image_sampleh_lod_1D(TSHARP i, SSHARP s, float c, float l);` | Sample mipmapped image using LOD returning half precision | +| `half4 
__ockl_image_sampleh_lod_1Da(TSHARP i, SSHARP s, float2 c, float l);` | | +| `half4 __ockl_image_sampleh_lod_2D(TSHARP i, SSHARP s, float2 c, float l);` | | +| `half4 __ockl_image_sampleh_lod_2Da(TSHARP i, SSHARP s, float4 c, float l);` | | +| `half4 __ockl_image_sampleh_lod_3D(TSHARP i, SSHARP s, float4 c, float l);` | | +| `half4 __ockl_image_sampleh_lod_CM(TSHARP i, SSHARP s, float4 c, float l);` | | +| `half4 __ockl_image_sampleh_lod_CMa(TSHARP i, SSHARP s, float4 c, float l);` | | +| - | | +| `float4 __ockl_image_gather4r_2D(TSHARP i, SSHARP s, float2 c);` | Gather 2x2 channel from image | +| `float4 __ockl_image_gather4g_2D(TSHARP i, SSHARP s, float2 c);` | | +| `float4 __ockl_image_gather4b_2D(TSHARP i, SSHARP s, float2 c);` | | +| `float4 __ockl_image_gather4a_2D(TSHARP i, SSHARP s, float2 c);` | | +| - | | +| `int __ockl_image_array_size_1Da(TSHARP i);` | Get image array size | +| `int __ockl_image_array_size_2Da(TSHARP i);` | | +| `int __ockl_image_array_size_2Dad(TSHARP i);` | | +| `int __ockl_image_array_size_CMa(TSHARP i);` | | +| - | | +| `int __ockl_image_channel_data_type_1D(TSHARP i);` | Get image channel data type | +| `int __ockl_image_channel_data_type_1Da(TSHARP i);` | | +| `int __ockl_image_channel_data_type_1Db(TSHARP i);` | | +| `int __ockl_image_channel_data_type_2D(TSHARP i);` | | +| `int __ockl_image_channel_data_type_2Da(TSHARP i);` | | +| `int __ockl_image_channel_data_type_2Dad(TSHARP i);` | | +| `int __ockl_image_channel_data_type_2Dd(TSHARP i);` | | +| `int __ockl_image_channel_data_type_3D(TSHARP i);` | | +| `int __ockl_image_channel_data_type_CM(TSHARP i);` | | +| `int __ockl_image_channel_data_type_CMa(TSHARP i);` | | +| - | | +| `int __ockl_image_channel_order_1D(TSHARP i);` | Get image channel order | +| `int __ockl_image_channel_order_1Da(TSHARP i);` | | +| `int __ockl_image_channel_order_1Db(TSHARP i);` | | +| `int __ockl_image_channel_order_2D(TSHARP i);` | | +| `int __ockl_image_channel_order_2Da(TSHARP i);` | | +| `int 
__ockl_image_channel_order_2Dad(TSHARP i);` | | +| `int __ockl_image_channel_order_2Dd(TSHARP i);` | | +| `int __ockl_image_channel_order_3D(TSHARP i);` | | +| `int __ockl_image_channel_order_CM(TSHARP i);` | | +| `int __ockl_image_channel_order_CMa(TSHARP i);` | | +| - | | +| `int __ockl_image_depth_3D(TSHARP i);` | Get 3D image depth | +| - | | +| `int __ockl_image_height_2D(TSHARP i);` | Get image height | +| `int __ockl_image_height_2Da(TSHARP i);` | | +| `int __ockl_image_height_2Dad(TSHARP i);` | | +| `int __ockl_image_height_2Dd(TSHARP i);` | | +| `int __ockl_image_height_3D(TSHARP i);` | | +| `int __ockl_image_height_CM(TSHARP i);` | | +| `int __ockl_image_height_CMa(TSHARP i);` | | +| - | | +| `int __ockl_image_num_mip_levels_1D(TSHARP i);` | Get number of levels in mipmapped image | +| `int __ockl_image_num_mip_levels_1Da(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_2D(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_2Da(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_2Dad(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_2Dd(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_3D(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_CM(TSHARP i);` | | +| `int __ockl_image_num_mip_levels_CMa(TSHARP i);` | | +| - | | +| `int __ockl_image_width_1D(TSHARP i);` | Get image width | +| `int __ockl_image_width_1Da(TSHARP i);` | | +| `int __ockl_image_width_1Db(TSHARP i);` | | +| `int __ockl_image_width_2D(TSHARP i);` | | +| `int __ockl_image_width_2Da(TSHARP i);` | | +| `int __ockl_image_width_2Dad(TSHARP i);` | | +| `int __ockl_image_width_2Dd(TSHARP i);` | | +| `int __ockl_image_width_3D(TSHARP i);` | | +| `int __ockl_image_width_CM(TSHARP i);` | | +| `int __ockl_image_width_CMa(TSHARP i);` | | +| - | | +| `size_t __ockl_get_global_offset(uint);` | Get grid global offset (OpenCL) of dimension | +| `size_t __ockl_get_global_id(uint);` | Get workitem global ID of dimension | +| `size_t __ockl_get_local_id(uint);` | Get workitem local ID of dimension 
| +| `size_t __ockl_get_group_id(uint);` | Get ID of group workitem resides in of dimension | +| `size_t __ockl_get_global_size(uint);` | Get global size of dimension | +| `size_t __ockl_get_local_size(uint);` | Get local size of dimension | +| `size_t __ockl_get_num_groups(uint);` | Get number of groups in dimension | +| `uint __ockl_get_work_dim(void);` | Get grid number of dimensions | +| `size_t __ockl_get_enqueued_local_size(uint);` | Get enqueued local size of dimension | +| `size_t __ockl_get_global_linear_id(void);` | Get global linear ID of workitem| +| `size_t __ockl_get_local_linear_id(void);` | Get local linear ID of workitem | +| - | | +| `bool __ockl_is_local_addr(const void *);` | Test if generic address is local | +| `bool __ockl_is_private_addr(const void *);` | Test if generic address is private | +| `__global void * __ockl_to_global(void *);` | Convert generic address to global address | +| `__local void * __ockl_to_local(void *);` | Convert generic address to local address | +| `__private void * __ockl_to_private(void *);` | Convert generic address to private address | diff --git a/hc/CMakeLists.txt b/hc/CMakeLists.txt index 6c4eb4e7..e7440e4f 100644 --- a/hc/CMakeLists.txt +++ b/hc/CMakeLists.txt @@ -24,7 +24,7 @@ if (GENERIC_IS_ZERO) endforeach(f) # Perform transformation - execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils" + execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) file(GLOB ll_srcs diff --git a/hc/src/hc_kernel.cl b/hc/src/hc_kernel.cl index 50e8e07d..3bbd1c77 100644 --- a/hc/src/hc_kernel.cl +++ b/hc/src/hc_kernel.cl @@ -5,7 +5,7 @@ #define ATTR __attribute__((always_inline, const)) #define ATTR2 __attribute__((always_inline)) -ATTR long +ATTR uint amp_get_global_id(int dim) { __constant hsa_kernel_dispatch_packet_t *p = 
__llvm_amdgcn_dispatch_ptr(); @@ -37,70 +37,138 @@ amp_get_global_id(int dim) return (g*s + l); } -ATTR long +ATTR uint amp_get_global_size(int dim) { - return __ockl_get_global_size(dim); + __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); + + switch(dim) { + case 0: + return p->grid_size_x; + case 1: + return p->grid_size_y; + case 2: + return p->grid_size_z; + default: + return 1; + } } -ATTR long +ATTR uint amp_get_local_id(int dim) { - return __ockl_get_local_id(dim); + switch(dim) { + case 0: + return __llvm_amdgcn_workitem_id_x(); + case 1: + return __llvm_amdgcn_workitem_id_y(); + case 2: + return __llvm_amdgcn_workitem_id_z(); + default: + return 0; + } } -ATTR long +ATTR uint amp_get_num_groups(int dim) { - return __ockl_get_num_groups(dim); + __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); + + uint n, d; + switch(dim) { + case 0: + n = p->grid_size_x; + d = p->workgroup_size_x; + break; + case 1: + n = p->grid_size_y; + d = p->workgroup_size_y; + break; + case 2: + n = p->grid_size_z; + d = p->workgroup_size_z; + break; + default: + n = 1; + d = 1; + break; + } + + return n / d; } -ATTR long +ATTR uint amp_get_group_id(int dim) { - return __ockl_get_group_id(dim); + switch(dim) { + case 0: + return __llvm_amdgcn_workgroup_id_x(); + case 1: + return __llvm_amdgcn_workgroup_id_y(); + case 2: + return __llvm_amdgcn_workgroup_id_z(); + default: + return 0; + } } -ATTR long +ATTR uint amp_get_local_size(int dim) { - return __ockl_get_local_size(dim); + __constant hsa_kernel_dispatch_packet_t *p = __llvm_amdgcn_dispatch_ptr(); + uint d; + + switch(dim) { + case 0: + d = p->workgroup_size_x; + break; + case 1: + d = p->workgroup_size_y; + break; + case 2: + d = p->workgroup_size_z; + break; + default: + d = 1; + break; + } + return d; } -ATTR long +ATTR uint hc_get_grid_size(int dim) { - return __ockl_get_global_size(dim); + return amp_get_global_size(dim); } -ATTR long +ATTR uint 
hc_get_workitem_absolute_id(int dim) { - return amp_get_global_id(dim); + return amp_get_global_id(dim); } -ATTR long +ATTR uint hc_get_workitem_id(int dim) { - return __ockl_get_local_id(dim); + return amp_get_local_id(dim); } -ATTR long +ATTR uint hc_get_num_groups(int dim) { - return __ockl_get_num_groups(dim); + return amp_get_num_groups(dim); } -ATTR long +ATTR uint hc_get_group_id(int dim) { - return __ockl_get_group_id(dim); + return amp_get_group_id(dim); } -ATTR long +ATTR uint hc_get_group_size(int dim) { - return __ockl_get_local_size(dim); + return amp_get_local_size(dim); } ATTR2 void diff --git a/irif/CMakeLists.txt b/irif/CMakeLists.txt index 37e89dca..883240f6 100644 --- a/irif/CMakeLists.txt +++ b/irif/CMakeLists.txt @@ -20,7 +20,7 @@ if (GENERIC_IS_ZERO) endforeach(f) # Perform transformation - execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils" + execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_CURRENT_SOURCE_DIR}/../utils" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) file(GLOB srcs diff --git a/irif/inc/irif.h b/irif/inc/irif.h index ec91d9f1..9121bb39 100644 --- a/irif/inc/irif.h +++ b/irif/inc/irif.h @@ -120,11 +120,15 @@ extern bool __llvm_umul_with_overflow_i32(uint, uint, __private uint*); extern bool __llvm_smul_with_overflow_i64(long, long, __private long*); extern bool __llvm_umul_with_overflow_i64(ulong, ulong, __private ulong*); -extern __attribute__((const)) int __llvm_ctlz_i32(int); -extern __attribute__((const)) int __llvm_ctlz_i64(long); +extern __attribute__((const)) uchar __llvm_ctlz_i8(uchar); +extern __attribute__((const)) ushort __llvm_ctlz_i16(ushort); +extern __attribute__((const)) uint __llvm_ctlz_i32(uint); +extern __attribute__((const)) ulong __llvm_ctlz_i64(ulong); -extern __attribute__((const)) int __llvm_cttz_i32(int); -extern __attribute__((const)) int __llvm_cttz_i64(long); +extern 
__attribute__((const)) uchar __llvm_cttz_i8(uchar); +extern __attribute__((const)) ushort __llvm_cttz_i16(ushort); +extern __attribute__((const)) uint __llvm_cttz_i32(uint); +extern __attribute__((const)) ulong __llvm_cttz_i64(ulong); // Fence intrinsics extern void __llvm_fence_acq_wi(void); @@ -197,6 +201,80 @@ extern ulong __llvm_cmpxchg_a1_x_x_dev_i64(__global ulong *, ulong, ulong); extern uint __llvm_cmpxchg_a3_x_x_wg_i32(__local uint *, uint, uint); extern ulong __llvm_cmpxchg_a3_x_x_wg_i64(__local ulong *, ulong, ulong); +// Constrained floating point +extern __attribute__((const)) half __llvm_add_rte_f16(half, half); +extern __attribute__((const)) half __llvm_add_rtn_f16(half, half); +extern __attribute__((const)) half __llvm_add_rtp_f16(half, half); +extern __attribute__((const)) half __llvm_add_rtz_f16(half, half); +extern __attribute__((const)) float __llvm_add_rte_f32(float, float); +extern __attribute__((const)) float __llvm_add_rtn_f32(float, float); +extern __attribute__((const)) float __llvm_add_rtp_f32(float, float); +extern __attribute__((const)) float __llvm_add_rtz_f32(float, float); +extern __attribute__((const)) double __llvm_add_rte_f64(double, double); +extern __attribute__((const)) double __llvm_add_rtn_f64(double, double); +extern __attribute__((const)) double __llvm_add_rtp_f64(double, double); +extern __attribute__((const)) double __llvm_add_rtz_f64(double, double); +extern __attribute__((const)) half __llvm_sub_rte_f16(half, half); +extern __attribute__((const)) half __llvm_sub_rtn_f16(half, half); +extern __attribute__((const)) half __llvm_sub_rtp_f16(half, half); +extern __attribute__((const)) half __llvm_sub_rtz_f16(half, half); +extern __attribute__((const)) float __llvm_sub_rte_f32(float, float); +extern __attribute__((const)) float __llvm_sub_rtn_f32(float, float); +extern __attribute__((const)) float __llvm_sub_rtp_f32(float, float); +extern __attribute__((const)) float __llvm_sub_rtz_f32(float, float); +extern 
__attribute__((const)) double __llvm_sub_rte_f64(double, double); +extern __attribute__((const)) double __llvm_sub_rtn_f64(double, double); +extern __attribute__((const)) double __llvm_sub_rtp_f64(double, double); +extern __attribute__((const)) double __llvm_sub_rtz_f64(double, double); +extern __attribute__((const)) half __llvm_mul_rte_f16(half, half); +extern __attribute__((const)) half __llvm_mul_rtn_f16(half, half); +extern __attribute__((const)) half __llvm_mul_rtp_f16(half, half); +extern __attribute__((const)) half __llvm_mul_rtz_f16(half, half); +extern __attribute__((const)) float __llvm_mul_rte_f32(float, float); +extern __attribute__((const)) float __llvm_mul_rtn_f32(float, float); +extern __attribute__((const)) float __llvm_mul_rtp_f32(float, float); +extern __attribute__((const)) float __llvm_mul_rtz_f32(float, float); +extern __attribute__((const)) double __llvm_mul_rte_f64(double, double); +extern __attribute__((const)) double __llvm_mul_rtn_f64(double, double); +extern __attribute__((const)) double __llvm_mul_rtp_f64(double, double); +extern __attribute__((const)) double __llvm_mul_rtz_f64(double, double); +extern __attribute__((const)) half __llvm_div_rte_f16(half, half); +extern __attribute__((const)) half __llvm_div_rtn_f16(half, half); +extern __attribute__((const)) half __llvm_div_rtp_f16(half, half); +extern __attribute__((const)) half __llvm_div_rtz_f16(half, half); +extern __attribute__((const)) float __llvm_div_rte_f32(float, float); +extern __attribute__((const)) float __llvm_div_rtn_f32(float, float); +extern __attribute__((const)) float __llvm_div_rtp_f32(float, float); +extern __attribute__((const)) float __llvm_div_rtz_f32(float, float); +extern __attribute__((const)) double __llvm_div_rte_f64(double, double); +extern __attribute__((const)) double __llvm_div_rtn_f64(double, double); +extern __attribute__((const)) double __llvm_div_rtp_f64(double, double); +extern __attribute__((const)) double __llvm_div_rtz_f64(double, double); +extern 
__attribute__((const)) half __llvm_sqrt_rte_f16(half); +extern __attribute__((const)) half __llvm_sqrt_rtn_f16(half); +extern __attribute__((const)) half __llvm_sqrt_rtp_f16(half); +extern __attribute__((const)) half __llvm_sqrt_rtz_f16(half); +extern __attribute__((const)) float __llvm_sqrt_rte_f32(float); +extern __attribute__((const)) float __llvm_sqrt_rtn_f32(float); +extern __attribute__((const)) float __llvm_sqrt_rtp_f32(float); +extern __attribute__((const)) float __llvm_sqrt_rtz_f32(float); +extern __attribute__((const)) double __llvm_sqrt_rte_f64(double); +extern __attribute__((const)) double __llvm_sqrt_rtn_f64(double); +extern __attribute__((const)) double __llvm_sqrt_rtp_f64(double); +extern __attribute__((const)) double __llvm_sqrt_rtz_f64(double); +extern __attribute__((const)) half __llvm_fma_rte_f16(half, half, half); +extern __attribute__((const)) half __llvm_fma_rtn_f16(half, half, half); +extern __attribute__((const)) half __llvm_fma_rtp_f16(half, half, half); +extern __attribute__((const)) half __llvm_fma_rtz_f16(half, half, half); +extern __attribute__((const)) float __llvm_fma_rte_f32(float, float, float); +extern __attribute__((const)) float __llvm_fma_rtn_f32(float, float, float); +extern __attribute__((const)) float __llvm_fma_rtp_f32(float, float, float); +extern __attribute__((const)) float __llvm_fma_rtz_f32(float, float, float); +extern __attribute__((const)) double __llvm_fma_rte_f64(double, double, double); +extern __attribute__((const)) double __llvm_fma_rtn_f64(double, double, double); +extern __attribute__((const)) double __llvm_fma_rtp_f64(double, double, double); +extern __attribute__((const)) double __llvm_fma_rtz_f64(double, double, double); + // AMDGPU intrinsics extern __attribute__((const)) bool __llvm_amdgcn_class_f16(half, int) __asm("llvm.amdgcn.class.f16"); extern __attribute__((const)) bool __llvm_amdgcn_class_f32(float, int) __asm("llvm.amdgcn.class.f32"); @@ -275,9 +353,9 @@ extern void 
__llvm_amdcgn_buffer_wbinvl1_vol(void) __asm("llvm.amdgcn.buffer.wbi extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_lo(uint, uint) __asm("llvm.amdgcn.mbcnt.lo"); extern __attribute__((const)) uint __llvm_amdgcn_mbcnt_hi(uint, uint) __asm("llvm.amdgcn.mbcnt.hi"); -extern ulong __llvm_amdgcn_read_exec(void); -extern uint __llvm_amdgcn_read_exec_lo(void); -extern uint __llvm_amdgcn_read_exec_hi(void); +extern __attribute__((convergent)) ulong __llvm_amdgcn_read_exec(void); +extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_lo(void); +extern __attribute__((convergent)) uint __llvm_amdgcn_read_exec_hi(void); extern uint __llvm_amdgcn_s_getreg(uint) __asm("llvm.amdgcn.s.getreg"); @@ -387,12 +465,12 @@ extern void __llvm_amdgcn_image_store_mip_f32_v4i32(float p, int4 c, uint8 t, ui __asm("llvm.amdgcn.image.store.mip.f32.v4i32.v8i32"); // Image Sample: Only expose 8 word T# and a few of the other combinations -extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f32.f32.v8i32"); -extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f32.v2f32.v8i32"); -extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f32.v4f32.v8i32"); +extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.v4f32.f32.v8i32"); +extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + 
__asm("llvm.amdgcn.image.sample.lz.v4f32.v2f32.v8i32"); +extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_lz_v4f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32"); extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_l_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) __asm("llvm.amdgcn.image.sample.l.v4f32.v2f32.v8i32"); @@ -406,12 +484,12 @@ extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v8f32(flo extern __attribute__((pure)) float4 __llvm_amdgcn_image_sample_d_v4f32_v16f32(float16 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) __asm("llvm.amdgcn.image.sample.l.v4f32.v16f32.v8i32"); -extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f16.f32.v8i32"); -extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f16.v2f32.v8i32"); -extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.v4f16.v4f32.v8i32"); +extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_f32(float c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.v4f16.f32.v8i32"); +extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_lz_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.v4f16.v2f32.v8i32"); +extern __attribute__((pure)) half4 
__llvm_amdgcn_image_sample_lz_v4f16_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.v4f16.v4f32.v8i32"); extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_l_v4f16_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) __asm("llvm.amdgcn.image.sample.l.v4f16.v2f32.v8i32"); @@ -426,10 +504,10 @@ extern __attribute__((pure)) half4 __llvm_amdgcn_image_sample_d_v4f16_v16f32(flo __asm("llvm.amdgcn.image.sample.l.v4f16.v16f32.v8i32"); // depth image sample -extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.f32.v2f32.v8i32"); -extern __attribute__((pure)) float __llvm_amdgcn_image_sample_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.sample.f32.v4f32.v8i32"); +extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.f32.v2f32.v8i32"); +extern __attribute__((pure)) float __llvm_amdgcn_image_sample_lz_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.sample.lz.f32.v4f32.v8i32"); extern __attribute__((pure)) float __llvm_amdgcn_image_sample_l_f32_v4f32(float4 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) __asm("llvm.amdgcn.image.sample.l.f32.v4f32.v8i32"); @@ -440,8 +518,8 @@ extern __attribute__((pure)) float __llvm_amdgcn_image_sample_d_f32_v16f32(float __asm("llvm.amdgcn.image.sample.l.f32.v16f32.v8i32"); // image fetch -extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool 
glc, bool slc, bool lwe, bool da) - __asm("llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32"); +extern __attribute__((pure)) float4 __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(float2 c, uint8 t, uint4 s, uint dmask, bool unorm, bool glc, bool slc, bool lwe, bool da) + __asm("llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32"); #pragma OPENCL EXTENSION cl_khr_fp16 : disable #endif // IRIF_H diff --git a/irif/src/cz.ll b/irif/src/cz.ll index af16d51c..bd9194c7 100644 --- a/irif/src/cz.ll +++ b/irif/src/cz.ll @@ -18,25 +18,45 @@ declare i16 @llvm.cttz.i16(i16, i1) declare i32 @llvm.cttz.i32(i32, i1) declare i64 @llvm.cttz.i64(i64, i1) +define i8 @__llvm_ctlz_i8(i8) #0 { + %2 = call i8 @llvm.ctlz.i8(i8 %0, i1 0) + ret i8 %2 +} + +define i16 @__llvm_ctlz_i16(i16) #0 { + %2 = call i16 @llvm.ctlz.i16(i16 %0, i1 0) + ret i16 %2 +} + define i32 @__llvm_ctlz_i32(i32) #0 { - %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 1) + %2 = call i32 @llvm.ctlz.i32(i32 %0, i1 0) ret i32 %2 } define i64 @__llvm_ctlz_i64(i64) #0 { - %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 1) + %2 = call i64 @llvm.ctlz.i64(i64 %0, i1 0) ret i64 %2 } +define i8 @__llvm_cttz_i8(i8) #0 { + %2 = call i8 @llvm.cttz.i8(i8 %0, i1 0) + ret i8 %2 +} + +define i16 @__llvm_cttz_i16(i16) #0 { + %2 = call i16 @llvm.cttz.i16(i16 %0, i1 0) + ret i16 %2 +} + define i32 @__llvm_cttz_i32(i32) #0 { - %2 = call i32 @llvm.cttz.i32(i32 %0, i1 1) + %2 = call i32 @llvm.cttz.i32(i32 %0, i1 0) ret i32 %2 } define i64 @__llvm_cttz_i64(i64) #0 { - %2 = call i64 @llvm.cttz.i64(i64 %0, i1 1) + %2 = call i64 @llvm.cttz.i64(i64 %0, i1 0) ret i64 %2 } -attributes #0 = { alwaysinline argmemonly norecurse nounwind readnone } +attributes #0 = { alwaysinline norecurse nounwind readnone } diff --git a/irif/src/fence.ll b/irif/src/fence.ll index 14f04b03..0bcaaaa9 100644 --- a/irif/src/fence.ll +++ b/irif/src/fence.ll @@ -1,27 +1,23 @@ target datalayout = 
"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" target triple = "amdgcn--amdhsa" -;; -;; syncscope number mapping is in llvm/target/AMDGPU/AMDGPU.h class AMDGPUSynchronizationScope -;; - define void @__llvm_fence_acq_wi() local_unnamed_addr #0 { - fence syncscope(5) acquire + fence syncscope("singlethread") acquire ret void } define void @__llvm_fence_acq_sg() local_unnamed_addr #0 { - fence syncscope(4) acquire + fence syncscope("wavefront") acquire ret void } define void @__llvm_fence_acq_wg() local_unnamed_addr #0 { - fence syncscope(3) acquire + fence syncscope("workgroup") acquire ret void } define void @__llvm_fence_acq_dev() local_unnamed_addr #0 { - fence syncscope(2) acquire + fence syncscope("agent") acquire ret void } @@ -31,22 +27,22 @@ define void @__llvm_fence_acq_sys() local_unnamed_addr #0 { } define void @__llvm_fence_rel_wi() local_unnamed_addr #0 { - fence syncscope(5) release + fence syncscope("singlethread") release ret void } define void @__llvm_fence_rel_sg() local_unnamed_addr #0 { - fence syncscope(4) release + fence syncscope("wavefront") release ret void } define void @__llvm_fence_rel_wg() local_unnamed_addr #0 { - fence syncscope(3) release + fence syncscope("workgroup") release ret void } define void @__llvm_fence_rel_dev() local_unnamed_addr #0 { - fence syncscope(2) release + fence syncscope("agent") release ret void } @@ -56,22 +52,22 @@ define void @__llvm_fence_rel_sys() local_unnamed_addr #0 { } define void @__llvm_fence_ar_wi() local_unnamed_addr #0 { - fence syncscope(5) acq_rel + fence syncscope("singlethread") acq_rel ret void } define void @__llvm_fence_ar_sg() local_unnamed_addr #0 { - fence syncscope(4) acq_rel + fence syncscope("wavefront") acq_rel ret void } define void @__llvm_fence_ar_wg() local_unnamed_addr #0 { - fence syncscope(3) acq_rel + fence syncscope("workgroup") acq_rel ret void } define void 
@__llvm_fence_ar_dev() local_unnamed_addr #0 { - fence syncscope(2) acq_rel + fence syncscope("agent") acq_rel ret void } @@ -81,22 +77,22 @@ define void @__llvm_fence_ar_sys() local_unnamed_addr #0 { } define void @__llvm_fence_sc_wi() local_unnamed_addr #0 { - fence syncscope(5) seq_cst + fence syncscope("singlethread") seq_cst ret void } define void @__llvm_fence_sc_sg() local_unnamed_addr #0 { - fence syncscope(4) seq_cst + fence syncscope("wavefront") seq_cst ret void } define void @__llvm_fence_sc_wg() local_unnamed_addr #0 { - fence syncscope(3) seq_cst + fence syncscope("workgroup") seq_cst ret void } define void @__llvm_fence_sc_dev() local_unnamed_addr #0 { - fence syncscope(2) seq_cst + fence syncscope("agent") seq_cst ret void } diff --git a/irif/src/reg.ll b/irif/src/reg.ll index 2fa2ab65..43bf238c 100644 --- a/irif/src/reg.ll +++ b/irif/src/reg.ll @@ -12,23 +12,22 @@ declare i32 @llvm.read_register.i32(metadata) #0 declare i64 @llvm.read_register.i64(metadata) #0 define i64 @__llvm_amdgcn_read_exec() #1 { - %1 = call i64 @llvm.read_register.i64(metadata !0) #2 + %1 = call i64 @llvm.read_register.i64(metadata !0) #0 ret i64 %1 } define i32 @__llvm_amdgcn_read_exec_lo() #1 { - %1 = call i32 @llvm.read_register.i32(metadata !1) #2 + %1 = call i32 @llvm.read_register.i32(metadata !1) #0 ret i32 %1 } define i32 @__llvm_amdgcn_read_exec_hi() #1 { - %1 = call i32 @llvm.read_register.i32(metadata !2) #2 + %1 = call i32 @llvm.read_register.i32(metadata !2) #0 ret i32 %1 } -attributes #0 = { nounwind } -attributes #1 = { alwaysinline nounwind } -attributes #2 = { nounwind convergent } +attributes #0 = { nounwind convergent } +attributes #1 = { alwaysinline nounwind convergent } !0 = !{!"exec"} !1 = !{!"exec_lo"} diff --git a/irif/src/rounded.ll b/irif/src/rounded.ll new file mode 100644 index 00000000..80b0082b --- /dev/null +++ b/irif/src/rounded.ll @@ -0,0 +1,393 @@ +target datalayout = 
"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" +target triple = "amdgcn-amd-amdhsa-opencl" + +;;;;; Add ;;;;; +define half @__llvm_add_rte_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_add_rtn_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_add_rtp_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_add_rtz_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fadd.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define float @__llvm_add_rte_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_add_rtn_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_add_rtp_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_add_rtz_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fadd.f32(float %0, float %1, metadata 
!"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define double @__llvm_add_rte_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_add_rtn_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_add_rtp_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_add_rtz_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fadd.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %3 +} + +;;;;; Sub ;;;;; +define half @__llvm_sub_rte_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_sub_rtn_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_sub_rtp_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_sub_rtz_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fsub.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define float 
@__llvm_sub_rte_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_sub_rtn_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_sub_rtp_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_sub_rtz_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fsub.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define double @__llvm_sub_rte_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_sub_rtn_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_sub_rtp_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_sub_rtz_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fsub.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %3 +} + +;;;;; Mul ;;;;; +define half @__llvm_mul_rte_f16(half, half) local_unnamed_addr #0 { + %3 = tail call 
half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_mul_rtn_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_mul_rtp_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_mul_rtz_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fmul.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define float @__llvm_mul_rte_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_mul_rtn_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_mul_rtp_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_mul_rtz_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fmul.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define double @__llvm_mul_rte_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret 
double %3 +} + +define double @__llvm_mul_rtn_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_mul_rtp_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_mul_rtz_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fmul.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %3 +} + +;;;;; Div ;;;;; +define half @__llvm_div_rte_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_div_rtn_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_div_rtp_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define half @__llvm_div_rtz_f16(half, half) local_unnamed_addr #0 { + %3 = tail call half @llvm.experimental.constrained.fdiv.f16(half %0, half %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %3 +} + +define float @__llvm_div_rte_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_div_rtn_f32(float, float) local_unnamed_addr #0 { + %3 = tail call 
float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_div_rtp_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define float @__llvm_div_rtz_f32(float, float) local_unnamed_addr #0 { + %3 = tail call float @llvm.experimental.constrained.fdiv.f32(float %0, float %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %3 +} + +define double @__llvm_div_rte_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_div_rtn_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_div_rtp_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret double %3 +} + +define double @__llvm_div_rtz_f64(double, double) local_unnamed_addr #0 { + %3 = tail call double @llvm.experimental.constrained.fdiv.f64(double %0, double %1, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %3 +} + +;;;;; Sqrt ;;;;; +define half @__llvm_sqrt_rte_f16(half) local_unnamed_addr #0 { + %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %2 +} + +define half @__llvm_sqrt_rtn_f16(half) local_unnamed_addr #0 { + %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0, metadata !"round.downward", metadata 
!"fpexcept.strict") #1 + ret half %2 +} + +define half @__llvm_sqrt_rtp_f16(half) local_unnamed_addr #0 { + %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %2 +} + +define half @__llvm_sqrt_rtz_f16(half) local_unnamed_addr #0 { + %2 = tail call half @llvm.experimental.constrained.sqrt.f16(half %0, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %2 +} + +define float @__llvm_sqrt_rte_f32(float) local_unnamed_addr #0 { + %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %2 +} + +define float @__llvm_sqrt_rtn_f32(float) local_unnamed_addr #0 { + %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %2 +} + +define float @__llvm_sqrt_rtp_f32(float) local_unnamed_addr #0 { + %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %2 +} + +define float @__llvm_sqrt_rtz_f32(float) local_unnamed_addr #0 { + %2 = tail call float @llvm.experimental.constrained.sqrt.f32(float %0, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %2 +} + +define double @__llvm_sqrt_rte_f64(double) local_unnamed_addr #0 { + %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret double %2 +} + +define double @__llvm_sqrt_rtn_f64(double) local_unnamed_addr #0 { + %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %2 +} + +define double @__llvm_sqrt_rtp_f64(double) local_unnamed_addr #0 { + %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0, metadata !"round.upward", metadata !"fpexcept.strict") #1 + 
ret double %2 +} + +define double @__llvm_sqrt_rtz_f64(double) local_unnamed_addr #0 { + %2 = tail call double @llvm.experimental.constrained.sqrt.f64(double %0, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %2 +} + +;;;;; Fma ;;;;; +define half @__llvm_fma_rte_f16(half, half, half) local_unnamed_addr #0 { + %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret half %4 +} + +define half @__llvm_fma_rtn_f16(half, half, half) local_unnamed_addr #0 { + %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret half %4 +} + +define half @__llvm_fma_rtp_f16(half, half, half) local_unnamed_addr #0 { + %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret half %4 +} + +define half @__llvm_fma_rtz_f16(half, half, half) local_unnamed_addr #0 { + %4 = tail call half @llvm.experimental.constrained.fma.f16(half %0, half %1, half %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret half %4 +} + +define float @__llvm_fma_rte_f32(float, float, float) local_unnamed_addr #0 { + %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret float %4 +} + +define float @__llvm_fma_rtn_f32(float, float, float) local_unnamed_addr #0 { + %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret float %4 +} + +define float @__llvm_fma_rtp_f32(float, float, float) local_unnamed_addr #0 { + %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret float %4 +} + +define float 
@__llvm_fma_rtz_f32(float, float, float) local_unnamed_addr #0 { + %4 = tail call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret float %4 +} + +define double @__llvm_fma_rte_f64(double, double, double) local_unnamed_addr #0 { + %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.tonearest", metadata !"fpexcept.strict") #1 + ret double %4 +} + +define double @__llvm_fma_rtn_f64(double, double, double) local_unnamed_addr #0 { + %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.downward", metadata !"fpexcept.strict") #1 + ret double %4 +} + +define double @__llvm_fma_rtp_f64(double, double, double) local_unnamed_addr #0 { + %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.upward", metadata !"fpexcept.strict") #1 + ret double %4 +} + +define double @__llvm_fma_rtz_f64(double, double, double) local_unnamed_addr #0 { + %4 = tail call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2, metadata !"round.towardzero", metadata !"fpexcept.strict") #1 + ret double %4 +} + +declare half @llvm.experimental.constrained.fdiv.f16(half, half, metadata, metadata) local_unnamed_addr #1 +declare half @llvm.experimental.constrained.fmul.f16(half, half, metadata, metadata) local_unnamed_addr #1 +declare half @llvm.experimental.constrained.fadd.f16(half, half, metadata, metadata) local_unnamed_addr #1 +declare half @llvm.experimental.constrained.fsub.f16(half, half, metadata, metadata) local_unnamed_addr #1 +declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata) local_unnamed_addr #1 +declare half @llvm.experimental.constrained.fma.f16(half, half, half, metadata, metadata) local_unnamed_addr #1 + +declare float @llvm.experimental.constrained.fdiv.f32(float, float, 
metadata, metadata) local_unnamed_addr #1 +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) local_unnamed_addr #1 +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) local_unnamed_addr #1 +declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) local_unnamed_addr #1 +declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) local_unnamed_addr #1 +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) local_unnamed_addr #1 + +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) local_unnamed_addr #1 +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) local_unnamed_addr #1 +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) local_unnamed_addr #1 +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) local_unnamed_addr #1 +declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) local_unnamed_addr #1 +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) local_unnamed_addr #1 + +attributes #0 = { alwaysinline nounwind readnone } +attributes #1 = { nounwind readnone } + diff --git a/ockl/CMakeLists.txt b/ockl/CMakeLists.txt index cb9bb25a..7fa87ef4 100644 --- a/ockl/CMakeLists.txt +++ b/ockl/CMakeLists.txt @@ -15,4 +15,14 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/inc) opencl_bc_lib(ockl ${sources}) -install(FILES inc/ockl.h DESTINATION include COMPONENT OpenCL) +install(FILES + inc/amd_hsa_common.h + inc/amd_hsa_elf.h + inc/amd_hsa_kernel_code.h + inc/amd_hsa_queue.h + inc/amd_hsa_signal.h + inc/device_amd_hsa.h + inc/hsa.h + inc/ockl_hsa.h + inc/ockl.h + DESTINATION include COMPONENT OpenCL) diff --git a/ockl/inc/hsa.h b/ockl/inc/hsa.h index 0252b009..85365882 100644 --- 
a/ockl/inc/hsa.h +++ b/ockl/inc/hsa.h @@ -1502,7 +1502,7 @@ typedef struct hsa_queue_s { #ifdef HSA_LARGE_MODEL #ifdef DEVICE_COMPILER - __constant + __global #endif void *base_address; #elif defined HSA_LITTLE_ENDIAN @@ -1511,7 +1511,7 @@ typedef struct hsa_queue_s { * packets. Must be aligned to the size of an AQL packet. */ #ifdef DEVICE_COMPILER - __constant + __global #endif void *base_address; /** @@ -1521,7 +1521,7 @@ typedef struct hsa_queue_s { #else uint32_t reserved0; #ifdef DEVICE_COMPILER - __constant + __global #endif void *base_address; #endif @@ -2129,7 +2129,7 @@ typedef struct hsa_kernel_dispatch_packet_s { #ifdef HSA_LARGE_MODEL #ifdef DEVICE_COMPILER - __constant + __global #endif void *kernarg_address; #elif defined HSA_LITTLE_ENDIAN @@ -2141,7 +2141,7 @@ typedef struct hsa_kernel_dispatch_packet_s { * completed execution. */ #ifdef DEVICE_COMPILER - __constant + __global #endif void *kernarg_address; /** @@ -2151,7 +2151,7 @@ typedef struct hsa_kernel_dispatch_packet_s { #else uint32_t reserved1; #ifdef DEVICE_COMPILER - __constant + __global #endif void *kernarg_address; #endif diff --git a/ockl/inc/ockl.h b/ockl/inc/ockl.h index 0cba8e0f..bceacdaa 100644 --- a/ockl/inc/ockl.h +++ b/ockl/inc/ockl.h @@ -102,9 +102,18 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +extern __attribute__((const)) uchar OCKL_MANGLE_T(clz,u8)(uchar); +extern __attribute__((const)) ushort OCKL_MANGLE_T(clz,u16)(ushort); DECL_CONST_OCKL_UNARY_U32(clz) +DECL_CONST_OCKL_UNARY_U64(clz) + +extern __attribute__((const)) uchar OCKL_MANGLE_T(ctz,u8)(uchar); +extern __attribute__((const)) ushort OCKL_MANGLE_T(ctz,u16)(ushort); DECL_CONST_OCKL_UNARY_U32(ctz) +DECL_CONST_OCKL_UNARY_U64(ctz) + DECL_CONST_OCKL_UNARY_U32(popcount) +DECL_CONST_OCKL_UNARY_U64(popcount) DECL_CONST_OCKL_BINARY_I32(add_sat) DECL_CONST_OCKL_BINARY_U32(add_sat) diff --git a/ockl/inc/ockl_hsa.h b/ockl/inc/ockl_hsa.h index 111116b7..1a53d9e1 100644 --- a/ockl/inc/ockl_hsa.h +++ b/ockl/inc/ockl_hsa.h @@ 
-12,11 +12,11 @@ #include "device_amd_hsa.h" typedef enum __ockl_memory_order_e { - __ockl_memory_order_relaxed, - __ockl_memory_order_acquire, - __ockl_memory_order_release, - __ockl_memory_order_acq_rel, - __ockl_memory_order_seq_cst, + __ockl_memory_order_relaxed = __ATOMIC_RELAXED, + __ockl_memory_order_acquire = __ATOMIC_ACQUIRE, + __ockl_memory_order_release = __ATOMIC_RELEASE, + __ockl_memory_order_acq_rel = __ATOMIC_ACQ_REL, + __ockl_memory_order_seq_cst = __ATOMIC_SEQ_CST, } __ockl_memory_order; extern ulong OCKL_MANGLE_T(hsa_queue,load_write_index)(const __global hsa_queue_t *queue, __ockl_memory_order mem_order); diff --git a/ockl/src/clz.cl b/ockl/src/clz.cl index 593b4fcf..e93edbee 100644 --- a/ockl/src/clz.cl +++ b/ockl/src/clz.cl @@ -8,10 +8,27 @@ #include "irif.h" #include "ockl.h" +__attribute__((always_inline, const)) uchar +OCKL_MANGLE_T(clz,u8)(uchar i) +{ + return __llvm_ctlz_i8(i); +} + +__attribute__((always_inline, const)) ushort +OCKL_MANGLE_T(clz,u16)(ushort i) +{ + return __llvm_ctlz_i16(i); +} + __attribute__((always_inline, const)) uint OCKL_MANGLE_U32(clz)(uint i) { - uint r = (uint)__llvm_ctlz_i32((int)i); - return i ? r : 32u; + return __llvm_ctlz_i32(i); +} + +__attribute__((always_inline, const)) ulong +OCKL_MANGLE_U64(clz)(ulong i) +{ + return __llvm_ctlz_i64(i); } diff --git a/ockl/src/ctz.cl b/ockl/src/ctz.cl index 72de58cf..a7ad76e9 100644 --- a/ockl/src/ctz.cl +++ b/ockl/src/ctz.cl @@ -8,10 +8,27 @@ #include "irif.h" #include "ockl.h" +__attribute__((always_inline, const)) uchar +OCKL_MANGLE_T(ctz,u8)(uchar i) +{ + return __llvm_cttz_i8(i); +} + +__attribute__((always_inline, const)) ushort +OCKL_MANGLE_T(ctz,u16)(ushort i) +{ + return __llvm_cttz_i16(i); +} + __attribute__((always_inline, const)) uint OCKL_MANGLE_U32(ctz)(uint i) { - uint r = (uint)__llvm_cttz_i32((int)i); - return i ? 
r : 32u; + return __llvm_cttz_i32(i); +} + +__attribute__((always_inline, const)) ulong +OCKL_MANGLE_U64(ctz)(ulong i) +{ + return __llvm_cttz_i64(i); } diff --git a/ockl/src/hsaqs.cl b/ockl/src/hsaqs.cl index 426d2c6b..a39fc495 100644 --- a/ockl/src/hsaqs.cl +++ b/ockl/src/hsaqs.cl @@ -15,103 +15,11 @@ #define ATTR __attribute__((always_inline)) -// TODO Remove this workaround when the compiler is ready - -#define AL(T,P,O,S) ({ \ - T __l; \ - switch (O) { \ - case __ockl_memory_order_acquire: \ - __l = atomic_load_explicit(P, memory_order_acquire, S); \ - break; \ - case __ockl_memory_order_seq_cst: \ - __l = atomic_load_explicit(P, memory_order_seq_cst, S); \ - break; \ - default: \ - __l = atomic_load_explicit(P, memory_order_relaxed, S); \ - break; \ - } \ - __l; \ -}) - -#define AS(P,V,O,S) ({ \ - switch (O) { \ - case __ockl_memory_order_release: \ - atomic_store_explicit(P, V, memory_order_release, S); \ - break; \ - case __ockl_memory_order_seq_cst: \ - atomic_store_explicit(P, V, memory_order_seq_cst, S); \ - break; \ - default: \ - atomic_store_explicit(P, V, memory_order_relaxed, S); \ - break; \ - } \ -}) - -#define AF(T,K,P,V,O,S) ({ \ - T __f; \ - switch (O) { \ - case __ockl_memory_order_acquire: \ - __f = atomic_fetch_##K##_explicit(P, V, memory_order_acquire, S); \ - break; \ - case __ockl_memory_order_release: \ - __f = atomic_fetch_##K##_explicit(P, V, memory_order_release, S); \ - break; \ - case __ockl_memory_order_acq_rel: \ - __f = atomic_fetch_##K##_explicit(P, V, memory_order_acq_rel, S); \ - break; \ - case __ockl_memory_order_seq_cst: \ - __f = atomic_fetch_##K##_explicit(P, V, memory_order_seq_cst, S); \ - break; \ - default: \ - __f = atomic_fetch_##K##_explicit(P, V, memory_order_relaxed, S); \ - break; \ - } \ - __f; \ -}) - -#define AX(T,P,V,O,S) ({ \ - T __e; \ - switch (O) { \ - case __ockl_memory_order_acquire: \ - __e = atomic_exchange_explicit(P, V, memory_order_acquire, S); \ - break; \ - case __ockl_memory_order_release: \ - 
__e = atomic_exchange_explicit(P, V, memory_order_release, S); \ - break; \ - case __ockl_memory_order_acq_rel: \ - __e = atomic_exchange_explicit(P, V, memory_order_acq_rel, S); \ - break; \ - case __ockl_memory_order_seq_cst: \ - __e = atomic_exchange_explicit(P, V, memory_order_seq_cst, S); \ - break; \ - default: \ - __e = atomic_exchange_explicit(P, V, memory_order_relaxed, S); \ - break; \ - } \ - __e; \ -}) - -#define AC(P,E,V,O,R,S) ({ \ - bool __c; \ - switch (O) { \ - case __ockl_memory_order_acquire: \ - __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acquire, R, S); \ - break; \ - case __ockl_memory_order_release: \ - __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_release, R, S); \ - break; \ - case __ockl_memory_order_acq_rel: \ - __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_acq_rel, R, S); \ - break; \ - case __ockl_memory_order_seq_cst: \ - __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_seq_cst, R, S); \ - break; \ - default: \ - __c = atomic_compare_exchange_strong_explicit(P, E, V, memory_order_relaxed, R, S); \ - break; \ - } \ - __c; \ -}) +#define AL(T,P,O,S) __opencl_atomic_load(P,O,S) +#define AS(P,V,O,S) __opencl_atomic_store(P,V,O,S) +#define AF(T,K,P,V,O,S) __opencl_atomic_fetch_##K(P,V,O,S) +#define AX(T,P,V,O,S) __opencl_atomic_exchange(P,V,O,S) +#define AC(P,E,V,O,R,S) __opencl_atomic_compare_exchange_strong(P,E,V,O,R,S) // // HSA queue ops @@ -235,6 +143,9 @@ OCKL_MANGLE_T(hsa_signal,store)(hsa_signal_t sig, long value, __ockl_memory_orde if (s->kind == AMD_SIGNAL_KIND_USER) { AS((__global atomic_long *)&s->value, value, mem_order, memory_scope_all_svm_devices); update_mbox(s); + } else if (__oclc_ISA_version() >= 900) { + // Hardware doorbell supports AQL semantics. 
+ atomic_store_explicit((__global atomic_ulong *)s->hardware_doorbell_ptr, (ulong)value, memory_order_release, memory_scope_all_svm_devices); } else { { diff --git a/ockl/src/image.cl b/ockl/src/image.cl index b1752c4a..1d3ee450 100644 --- a/ockl/src/image.cl +++ b/ockl/src/image.cl @@ -497,7 +497,7 @@ RATTR float4 OCKL_MANGLE_T(image_sample,1D)(TSHARP i, SSHARP s, float c) { ADJUST_X(c, i, s); - return __llvm_amdgcn_image_sample_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f32_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR float4 @@ -505,14 +505,14 @@ OCKL_MANGLE_T(image_sample,1Da)(TSHARP i, SSHARP s, float2 c) { ADJUST_X(c.x, i, s); c.y = __llvm_rint_f32(c.y); - return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); + return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); } RATTR float4 OCKL_MANGLE_T(image_sample,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_sample_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR float4 @@ -520,7 +520,7 @@ OCKL_MANGLE_T(image_sample,2Da)(TSHARP i, SSHARP s, float4 c) { ADJUST_XY(c, i, s); c.z = __llvm_rint_f32(c.z); - return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); + return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); } RATTR float @@ -528,28 +528,28 @@ OCKL_MANGLE_T(image_sample,2Dad)(TSHARP i, SSHARP s, float4 c) { ADJUST_XY(c, i, s); c.z = __llvm_rint_f32(c.z); - return __llvm_amdgcn_image_sample_f32_v4f32(c, 
LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true); + return __llvm_amdgcn_image_sample_lz_f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, true); } RATTR float OCKL_MANGLE_T(image_sample,2Dd)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_sample_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false); } RATTR float4 OCKL_MANGLE_T(image_sample,3D)(TSHARP i, SSHARP s, float4 c) { ADJUST_XYZ(c, i, s); - return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR float4 OCKL_MANGLE_T(image_sample,CM)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); - return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR float4 @@ -557,7 +557,7 @@ OCKL_MANGLE_T(image_sample,CMa)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); c.z = SAMPLE_ARRAY_FACE(c.w, c.z); - return __llvm_amdgcn_image_sample_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f32_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR float4 @@ -685,7 +685,7 @@ RATTR half4 OCKL_MANGLE_T(image_sampleh,1D)(TSHARP i, SSHARP s, float c) { ADJUST_X(c, i, s); - return __llvm_amdgcn_image_sample_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f16_f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, 
false, false); } RATTR half4 @@ -693,14 +693,14 @@ OCKL_MANGLE_T(image_sampleh,1Da)(TSHARP i, SSHARP s, float2 c) { ADJUST_X(c.x, i, s); c.y = __llvm_rint_f32(c.y); - return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); + return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); } RATTR half4 OCKL_MANGLE_T(image_sampleh,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_sample_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f16_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR half4 @@ -708,21 +708,21 @@ OCKL_MANGLE_T(image_sampleh,2Da)(TSHARP i, SSHARP s, float4 c) { ADJUST_XY(c, i, s); c.z = __llvm_rint_f32(c.z); - return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); + return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, true); } RATTR half4 OCKL_MANGLE_T(image_sampleh,3D)(TSHARP i, SSHARP s, float4 c) { ADJUST_XYZ(c, i, s); - return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR half4 OCKL_MANGLE_T(image_sampleh,CM)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); - return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR half4 @@ -730,7 +730,7 @@ OCKL_MANGLE_T(image_sampleh,CMa)(TSHARP i, SSHARP s, float4 c) { CUBE_PREP(c); c.z = SAMPLE_ARRAY_FACE(c.w, 
c.z); - return __llvm_amdgcn_image_sample_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); + return __llvm_amdgcn_image_sample_lz_v4f16_v4f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0xf, false, false, false, false, false); } RATTR half4 @@ -828,28 +828,28 @@ RATTR float4 OCKL_MANGLE_T(image_gather4r,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false); + return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x1, false, false, false, false, false); } RATTR float4 OCKL_MANGLE_T(image_gather4g,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false); + return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x2, false, false, false, false, false); } RATTR float4 OCKL_MANGLE_T(image_gather4b,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false); + return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x4, false, false, false, false, false); } RATTR float4 OCKL_MANGLE_T(image_gather4a,2D)(TSHARP i, SSHARP s, float2 c) { ADJUST_XY(c, i, s); - return __llvm_amdgcn_image_gather4_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false); + return __llvm_amdgcn_image_gather4_lz_v4f32_v2f32(c, LOAD_TSHARP(i), LOAD_SSHARP(s), 0x8, false, false, false, false, false); } // We rely on the fact that the runtime allocates 12 words for the T# or V# diff --git a/ockl/src/popcount.cl b/ockl/src/popcount.cl index 77212f17..b6404022 100644 --- a/ockl/src/popcount.cl +++ b/ockl/src/popcount.cl @@ -14,3 +14,9 @@ OCKL_MANGLE_U32(popcount)(uint i) return 
(uint)__llvm_ctpop_i32((int)i); } +__attribute__((always_inline, const)) ulong +OCKL_MANGLE_U64(popcount)(ulong i) +{ + return (ulong)__llvm_ctpop_i64((long)i); +} + diff --git a/ocml/src/acoshD.cl b/ocml/src/acoshD.cl index 619f35a5..064897ff 100644 --- a/ocml/src/acoshD.cl +++ b/ocml/src/acoshD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(acosh)(double x) { bool b = x >= 0x1.0p+512; diff --git a/ocml/src/acoshF.cl b/ocml/src/acoshF.cl index 179b413d..962e0e39 100644 --- a/ocml/src/acoshF.cl +++ b/ocml/src/acoshF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(acosh)(float x) { bool b = x >= 0x1.0p+64f; diff --git a/ocml/src/acoshH.cl b/ocml/src/acoshH.cl index 074a7166..a8dc827e 100644 --- a/ocml/src/acoshH.cl +++ b/ocml/src/acoshH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(acosh) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(acosh)(half hx) { half ret; diff --git a/ocml/src/addD.cl b/ocml/src/addD.cl index 9e85367d..7a96f339 100644 --- a/ocml/src/addD.cl +++ b/ocml/src/addD.cl @@ -7,21 +7,15 @@ #include "mathD.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double x, double y) \ +MATH_MANGLE(LN)(double x, double y) \ { \ - return BUILTIN_FULL_BINARY(fadd, false, ROUND, x, y); \ + return BUILTIN_##UN##_F64(x, y); \ } -GEN(add_rte, ROUND_TO_NEAREST_EVEN) -GEN(add_rtp, ROUND_TO_POSINF) -GEN(add_rtn, ROUND_TO_NEGINF) -GEN(add_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(add_rte,ADD_RTE) +GEN(add_rtn,ADD_RTN) +GEN(add_rtp,ADD_RTP) +GEN(add_rtz,ADD_RTZ) diff --git a/ocml/src/addF.cl b/ocml/src/addF.cl index 8e676725..95debe18 100644 --- a/ocml/src/addF.cl +++ b/ocml/src/addF.cl @@ -7,27 +7,15 @@ #include "mathF.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD 
- -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float x, float y) \ +MATH_MANGLE(LN)(float x, float y) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_BINARY(faddf, true, ROUND, x, y); \ - } else { \ - ret = BUILTIN_FULL_BINARY(faddf, false, ROUND, x, y); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(x, y); \ } -GEN(add_rte, ROUND_TO_NEAREST_EVEN) -GEN(add_rtp, ROUND_TO_POSINF) -GEN(add_rtn, ROUND_TO_NEGINF) -GEN(add_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(add_rte,ADD_RTE) +GEN(add_rtn,ADD_RTN) +GEN(add_rtp,ADD_RTP) +GEN(add_rtz,ADD_RTZ) diff --git a/ocml/src/addH.cl b/ocml/src/addH.cl index b540fdfe..e77e7a0a 100644 --- a/ocml/src/addH.cl +++ b/ocml/src/addH.cl @@ -7,21 +7,15 @@ #include "mathH.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half x, half y) \ +MATH_MANGLE(LN)(half x, half y) \ { \ - return BUILTIN_FULL_BINARY(faddh, false, ROUND, x, y); \ + return BUILTIN_##UN##_F16(x, y); \ } -GEN(add_rte, ROUND_TO_NEAREST_EVEN) -GEN(add_rtp, ROUND_TO_POSINF) -GEN(add_rtn, ROUND_TO_NEGINF) -GEN(add_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(add_rte,ADD_RTE) +GEN(add_rtn,ADD_RTN) +GEN(add_rtp,ADD_RTP) +GEN(add_rtz,ADD_RTZ) diff --git a/ocml/src/asinhD.cl b/ocml/src/asinhD.cl index 75c3408a..09957fcc 100644 --- a/ocml/src/asinhD.cl +++ b/ocml/src/asinhD.cl @@ -13,7 +13,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(asinh)(double x) { double y = BUILTIN_ABS_F64(x); diff --git a/ocml/src/asinhF.cl b/ocml/src/asinhF.cl index 407d9545..f5eeaf04 100644 --- a/ocml/src/asinhF.cl +++ b/ocml/src/asinhF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(asinh)(float x) { float y = 
BUILTIN_ABS_F32(x); diff --git a/ocml/src/asinhH.cl b/ocml/src/asinhH.cl index 027aed99..ae994c76 100644 --- a/ocml/src/asinhH.cl +++ b/ocml/src/asinhH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(asinh) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(asinh)(half hx) { half ret; diff --git a/ocml/src/atanF.cl b/ocml/src/atanF.cl index ad3cdb03..08a7b1b1 100644 --- a/ocml/src/atanF.cl +++ b/ocml/src/atanF.cl @@ -9,7 +9,7 @@ extern CONSTATTR float MATH_PRIVATE(atanred)(float); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(atan)(float x) { float v = BUILTIN_ABS_F32(x); diff --git a/ocml/src/atanH.cl b/ocml/src/atanH.cl index 9fe95d8c..42ba6898 100644 --- a/ocml/src/atanH.cl +++ b/ocml/src/atanH.cl @@ -11,7 +11,7 @@ extern CONSTATTR half MATH_PRIVATE(atanred)(half); CONSTATTR UGEN(atan) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(atan)(half x) { half v = BUILTIN_ABS_F16(x); diff --git a/ocml/src/atanhD.cl b/ocml/src/atanhD.cl index c044f71a..62d9ddb1 100644 --- a/ocml/src/atanhD.cl +++ b/ocml/src/atanhD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(atanh)(double x) { double y = BUILTIN_ABS_F64(x); diff --git a/ocml/src/atanhF.cl b/ocml/src/atanhF.cl index 82a5c3ab..817ed41a 100644 --- a/ocml/src/atanhF.cl +++ b/ocml/src/atanhF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(atanh)(float x) { float y = BUILTIN_ABS_F32(x); diff --git a/ocml/src/atanhH.cl b/ocml/src/atanhH.cl index 46c30ff8..c86722cd 100644 --- a/ocml/src/atanhH.cl +++ b/ocml/src/atanhH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(atanh) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(atanh)(half hx) { half ret; diff --git a/ocml/src/atanpiF.cl b/ocml/src/atanpiF.cl index f9af9b72..1c46c155 100644 --- a/ocml/src/atanpiF.cl +++ b/ocml/src/atanpiF.cl @@ -9,7 +9,7 @@ extern CONSTATTR float MATH_PRIVATE(atanpired)(float); -CONSTATTR 
INLINEATTR float +CONSTATTR float MATH_MANGLE(atanpi)(float x) { float v = BUILTIN_ABS_F32(x); diff --git a/ocml/src/atanpiH.cl b/ocml/src/atanpiH.cl index d85fe700..44cb201c 100644 --- a/ocml/src/atanpiH.cl +++ b/ocml/src/atanpiH.cl @@ -12,7 +12,7 @@ extern CONSTATTR half MATH_PRIVATE(atanpired)(half); CONSTATTR UGEN(atanpi) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(atanpi)(half x) { half v = BUILTIN_ABS_F16(x); diff --git a/ocml/src/atanpiredF.cl b/ocml/src/atanpiredF.cl index d982869a..63af0f76 100644 --- a/ocml/src/atanpiredF.cl +++ b/ocml/src/atanpiredF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(atanpired)(float v) { float t = v * v; diff --git a/ocml/src/atanpiredH.cl b/ocml/src/atanpiredH.cl index 121d304f..3eabd599 100644 --- a/ocml/src/atanpiredH.cl +++ b/ocml/src/atanpiredH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_PRIVATE(atanpired)(half v) { half t = v * v; diff --git a/ocml/src/atanredF.cl b/ocml/src/atanredF.cl index 10b5c5c1..a0895928 100644 --- a/ocml/src/atanredF.cl +++ b/ocml/src/atanredF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(atanred)(float v) { float t = v * v; diff --git a/ocml/src/atanredH.cl b/ocml/src/atanredH.cl index dd2d1ba9..d721edb3 100644 --- a/ocml/src/atanredH.cl +++ b/ocml/src/atanredH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_PRIVATE(atanred)(half v) { half t = v * v; diff --git a/ocml/src/ba0D.cl b/ocml/src/ba0D.cl index c21d308b..e87226bc 100644 --- a/ocml/src/ba0D.cl +++ b/ocml/src/ba0D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_PRIVATE(ba0)(double t) { return diff --git a/ocml/src/ba0F.cl b/ocml/src/ba0F.cl index fc18577f..309ad267 100644 --- a/ocml/src/ba0F.cl +++ b/ocml/src/ba0F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float 
MATH_PRIVATE(ba0)(float t) { return diff --git a/ocml/src/ba1D.cl b/ocml/src/ba1D.cl index c735f595..d4453e00 100644 --- a/ocml/src/ba1D.cl +++ b/ocml/src/ba1D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_PRIVATE(ba1)(double t) { return diff --git a/ocml/src/ba1F.cl b/ocml/src/ba1F.cl index 2b974a39..5dd1ea96 100644 --- a/ocml/src/ba1F.cl +++ b/ocml/src/ba1F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(ba1)(float t) { return diff --git a/ocml/src/bp0D.cl b/ocml/src/bp0D.cl index 0e08cb4b..9014ae9e 100644 --- a/ocml/src/bp0D.cl +++ b/ocml/src/bp0D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_PRIVATE(bp0)(double t) { return diff --git a/ocml/src/bp0F.cl b/ocml/src/bp0F.cl index da6b9b4f..c0c27a1f 100644 --- a/ocml/src/bp0F.cl +++ b/ocml/src/bp0F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(bp0)(float t) { return diff --git a/ocml/src/bp1D.cl b/ocml/src/bp1D.cl index 55ace155..c9239c95 100644 --- a/ocml/src/bp1D.cl +++ b/ocml/src/bp1D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_PRIVATE(bp1)(double t) { return diff --git a/ocml/src/bp1F.cl b/ocml/src/bp1F.cl index 3267c7f4..18569cb6 100644 --- a/ocml/src/bp1F.cl +++ b/ocml/src/bp1F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(bp1)(float t) { return diff --git a/ocml/src/builtins.h b/ocml/src/builtins.h index 337c1d0a..2d5f6565 100644 --- a/ocml/src/builtins.h +++ b/ocml/src/builtins.h @@ -222,3 +222,81 @@ #define BUILTIN_CLAMP_F32(X,L,H) __llvm_amdgcn_fmed3_f32(X,L,H) #define BUILTIN_CLAMP_F16(X,L,H) __llvm_amdgcn_fmed3_f16(X,L,H) +#define BUILTIN_ADD_RTE_F32 __llvm_add_rte_f32 +#define BUILTIN_ADD_RTE_F64 __llvm_add_rte_f64 +#define BUILTIN_ADD_RTE_F16 __llvm_add_rte_f16 +#define BUILTIN_ADD_RTN_F32 __llvm_add_rtn_f32 +#define 
BUILTIN_ADD_RTN_F64 __llvm_add_rtn_f64 +#define BUILTIN_ADD_RTN_F16 __llvm_add_rtn_f16 +#define BUILTIN_ADD_RTP_F32 __llvm_add_rtp_f32 +#define BUILTIN_ADD_RTP_F64 __llvm_add_rtp_f64 +#define BUILTIN_ADD_RTP_F16 __llvm_add_rtp_f16 +#define BUILTIN_ADD_RTZ_F32 __llvm_add_rtz_f32 +#define BUILTIN_ADD_RTZ_F64 __llvm_add_rtz_f64 +#define BUILTIN_ADD_RTZ_F16 __llvm_add_rtz_f16 + +#define BUILTIN_SUB_RTE_F32 __llvm_sub_rte_f32 +#define BUILTIN_SUB_RTE_F64 __llvm_sub_rte_f64 +#define BUILTIN_SUB_RTE_F16 __llvm_sub_rte_f16 +#define BUILTIN_SUB_RTN_F32 __llvm_sub_rtn_f32 +#define BUILTIN_SUB_RTN_F64 __llvm_sub_rtn_f64 +#define BUILTIN_SUB_RTN_F16 __llvm_sub_rtn_f16 +#define BUILTIN_SUB_RTP_F32 __llvm_sub_rtp_f32 +#define BUILTIN_SUB_RTP_F64 __llvm_sub_rtp_f64 +#define BUILTIN_SUB_RTP_F16 __llvm_sub_rtp_f16 +#define BUILTIN_SUB_RTZ_F32 __llvm_sub_rtz_f32 +#define BUILTIN_SUB_RTZ_F64 __llvm_sub_rtz_f64 +#define BUILTIN_SUB_RTZ_F16 __llvm_sub_rtz_f16 + +#define BUILTIN_MUL_RTE_F32 __llvm_mul_rte_f32 +#define BUILTIN_MUL_RTE_F64 __llvm_mul_rte_f64 +#define BUILTIN_MUL_RTE_F16 __llvm_mul_rte_f16 +#define BUILTIN_MUL_RTN_F32 __llvm_mul_rtn_f32 +#define BUILTIN_MUL_RTN_F64 __llvm_mul_rtn_f64 +#define BUILTIN_MUL_RTN_F16 __llvm_mul_rtn_f16 +#define BUILTIN_MUL_RTP_F32 __llvm_mul_rtp_f32 +#define BUILTIN_MUL_RTP_F64 __llvm_mul_rtp_f64 +#define BUILTIN_MUL_RTP_F16 __llvm_mul_rtp_f16 +#define BUILTIN_MUL_RTZ_F32 __llvm_mul_rtz_f32 +#define BUILTIN_MUL_RTZ_F64 __llvm_mul_rtz_f64 +#define BUILTIN_MUL_RTZ_F16 __llvm_mul_rtz_f16 + +#define BUILTIN_DIV_RTE_F32 __llvm_div_rte_f32 +#define BUILTIN_DIV_RTE_F64 __llvm_div_rte_f64 +#define BUILTIN_DIV_RTE_F16 __llvm_div_rte_f16 +#define BUILTIN_DIV_RTN_F32 __llvm_div_rtn_f32 +#define BUILTIN_DIV_RTN_F64 __llvm_div_rtn_f64 +#define BUILTIN_DIV_RTN_F16 __llvm_div_rtn_f16 +#define BUILTIN_DIV_RTP_F32 __llvm_div_rtp_f32 +#define BUILTIN_DIV_RTP_F64 __llvm_div_rtp_f64 +#define BUILTIN_DIV_RTP_F16 __llvm_div_rtp_f16 +#define BUILTIN_DIV_RTZ_F32 
__llvm_div_rtz_f32 +#define BUILTIN_DIV_RTZ_F64 __llvm_div_rtz_f64 +#define BUILTIN_DIV_RTZ_F16 __llvm_div_rtz_f16 + +#define BUILTIN_SQRT_RTE_F32 __llvm_sqrt_rte_f32 +#define BUILTIN_SQRT_RTE_F64 __llvm_sqrt_rte_f64 +#define BUILTIN_SQRT_RTE_F16 __llvm_sqrt_rte_f16 +#define BUILTIN_SQRT_RTN_F32 __llvm_sqrt_rtn_f32 +#define BUILTIN_SQRT_RTN_F64 __llvm_sqrt_rtn_f64 +#define BUILTIN_SQRT_RTN_F16 __llvm_sqrt_rtn_f16 +#define BUILTIN_SQRT_RTP_F32 __llvm_sqrt_rtp_f32 +#define BUILTIN_SQRT_RTP_F64 __llvm_sqrt_rtp_f64 +#define BUILTIN_SQRT_RTP_F16 __llvm_sqrt_rtp_f16 +#define BUILTIN_SQRT_RTZ_F32 __llvm_sqrt_rtz_f32 +#define BUILTIN_SQRT_RTZ_F64 __llvm_sqrt_rtz_f64 +#define BUILTIN_SQRT_RTZ_F16 __llvm_sqrt_rtz_f16 + +#define BUILTIN_FMA_RTE_F32 __llvm_fma_rte_f32 +#define BUILTIN_FMA_RTE_F64 __llvm_fma_rte_f64 +#define BUILTIN_FMA_RTE_F16 __llvm_fma_rte_f16 +#define BUILTIN_FMA_RTN_F32 __llvm_fma_rtn_f32 +#define BUILTIN_FMA_RTN_F64 __llvm_fma_rtn_f64 +#define BUILTIN_FMA_RTN_F16 __llvm_fma_rtn_f16 +#define BUILTIN_FMA_RTP_F32 __llvm_fma_rtp_f32 +#define BUILTIN_FMA_RTP_F64 __llvm_fma_rtp_f64 +#define BUILTIN_FMA_RTP_F16 __llvm_fma_rtp_f16 +#define BUILTIN_FMA_RTZ_F32 __llvm_fma_rtz_f32 +#define BUILTIN_FMA_RTZ_F64 __llvm_fma_rtz_f64 +#define BUILTIN_FMA_RTZ_F16 __llvm_fma_rtz_f16 + diff --git a/ocml/src/cbrtD.cl b/ocml/src/cbrtD.cl index 67cd2628..fd83a2fb 100644 --- a/ocml/src/cbrtD.cl +++ b/ocml/src/cbrtD.cl @@ -1,7 +1,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(cbrt)(double x) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/cbrtF.cl b/ocml/src/cbrtF.cl index cab2df26..5e436900 100644 --- a/ocml/src/cbrtF.cl +++ b/ocml/src/cbrtF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(cbrt)(float x) { if (DAZ_OPT()) { diff --git a/ocml/src/ceilD.cl b/ocml/src/ceilD.cl index dc2eb8dc..654226cc 100644 --- a/ocml/src/ceilD.cl +++ b/ocml/src/ceilD.cl @@ -7,7 +7,7 @@ #include "mathD.h" 
-CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(ceil)(double x) { return BUILTIN_CEIL_F64(x); diff --git a/ocml/src/ceilF.cl b/ocml/src/ceilF.cl index 2a563cdf..8b1600c8 100644 --- a/ocml/src/ceilF.cl +++ b/ocml/src/ceilF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(ceil)(float x) { return BUILTIN_CEIL_F32(x); diff --git a/ocml/src/ceilH.cl b/ocml/src/ceilH.cl index 2db7385c..5b9804cb 100644 --- a/ocml/src/ceilH.cl +++ b/ocml/src/ceilH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(ceil)(half2 x) { return BUILTIN_CEIL_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(ceil)(half x) { return BUILTIN_CEIL_F16(x); diff --git a/ocml/src/copysignD.cl b/ocml/src/copysignD.cl index 5c2eb066..b239b793 100644 --- a/ocml/src/copysignD.cl +++ b/ocml/src/copysignD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(copysign)(double x, double y) { return BUILTIN_COPYSIGN_F64(x, y); diff --git a/ocml/src/copysignF.cl b/ocml/src/copysignF.cl index 87bc68d7..f2fac4ab 100644 --- a/ocml/src/copysignF.cl +++ b/ocml/src/copysignF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(copysign)(float x, float y) { return BUILTIN_COPYSIGN_F32(x, y); diff --git a/ocml/src/copysignH.cl b/ocml/src/copysignH.cl index f89c061c..7897b1e3 100644 --- a/ocml/src/copysignH.cl +++ b/ocml/src/copysignH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(copysign)(half2 x, half2 y) { return BUILTIN_COPYSIGN_2F16(x, y); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(copysign)(half x, half y) { return BUILTIN_COPYSIGN_F16(x, y); diff --git a/ocml/src/cosD.cl b/ocml/src/cosD.cl index fcb55925..b76adff9 100644 --- a/ocml/src/cosD.cl +++ b/ocml/src/cosD.cl @@ -8,17 +8,15 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR double +double 
MATH_MANGLE(cos)(double x) { - double r, rr; - int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); + sc.s = -sc.s; - double cc; - double ss = -MATH_PRIVATE(sincosred2)(r, rr, &cc); - - int2 c = AS_INT2((regn & 1) != 0 ? ss : cc); - c.hi ^= regn > 1 ? (int)0x80000000 : 0; + int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c); + c.hi ^= r.i > 1 ? (int)0x80000000 : 0; if (!FINITE_ONLY_OPT()) { c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? AS_INT2(QNANBITPATT_DP64) : c; diff --git a/ocml/src/cosF.cl b/ocml/src/cosF.cl index 63da099e..60c57195 100644 --- a/ocml/src/cosF.cl +++ b/ocml/src/cosF.cl @@ -8,28 +8,23 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR float +float MATH_MANGLE(cos)(float x) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; -#if defined EXTRA_PRECISION - float r0, r1; - int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax)); + struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax)); - float cc; - float ss = -MATH_PRIVATE(sincosred2)(r0, r1, &cc); +#if defined EXTRA_PRECISION + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); #else - float r; - int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax)); - - float cc; - float ss = -MATH_PRIVATE(sincosred)(r, &cc); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif + sc.s = -sc.s; - float c = (regn & 1) != 0 ? ss : cc; - c = AS_FLOAT(AS_INT(c) ^ (regn > 1 ? 0x80000000 : 0)); + float c = (r.i & 1) != 0 ? sc.s : sc.c; + c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0)); if (!FINITE_ONLY_OPT()) { c = ax >= PINFBITPATT_SP32 ? 
AS_FLOAT(QNANBITPATT_SP32) : c; diff --git a/ocml/src/cosH.cl b/ocml/src/cosH.cl index 00df3a62..e4edc273 100644 --- a/ocml/src/cosH.cl +++ b/ocml/src/cosH.cl @@ -10,17 +10,15 @@ UGEN(cos) -INLINEATTR half +half MATH_MANGLE(cos)(half x) { - half r; - short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); + sc.s = -sc.s; - half cc; - half ss = -MATH_PRIVATE(sincosred)(r, &cc); - - short c = AS_SHORT((i & 1) == 0 ? cc : ss); - c ^= i > 1 ? (short)0x8000 : (short)0; + short c = AS_SHORT((r.i & 1) == (short)0 ? sc.c : sc.s); + c ^= r.i > 1 ? (short)0x8000 : (short)0; if (!FINITE_ONLY_OPT()) { c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? (short)QNANBITPATT_HP16 : c; diff --git a/ocml/src/cosbD.cl b/ocml/src/cosbD.cl index 36b4f178..0838b618 100644 --- a/ocml/src/cosbD.cl +++ b/ocml/src/cosbD.cl @@ -24,31 +24,31 @@ L = __e; \ } while (0) -INLINEATTR double +double MATH_PRIVATE(cosb)(double x, int n, double p) { - double ph, pl, rh, rl, sh, sl; - int i = MATH_PRIVATE(trigred)(&rh, &rl, x); - bool b = rh < p; - i = (i - b - n) & 3; + struct redret r = MATH_PRIVATE(trigred)(x); + bool b = r.hi < p; + r.i = (r.i - b - n) & 3; // This is a properly signed extra precise pi/4 - ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0))); - pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0))); + double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0))); + double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0))); + double sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); - FSUM2(ph, rh, sh, sl); - sl += pl + rl; + FSUM2(ph, r.hi, sh, sl); + sl += pl + r.lo; FSUM2(sh, sl, sh, sl); - double cc; - double ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc); + struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); + sc.s = -sc.s; - int2 c = AS_INT2((i & 1) != 0 ? 
ss : cc); - c.hi ^= i > 1 ? 0x80000000 : 0; + int2 c = AS_INT2((r.i & 1) != 0 ? sc.s : sc.c); + c.hi ^= r.i > 1 ? 0x80000000 : 0; return AS_DOUBLE(c); } diff --git a/ocml/src/cosbF.cl b/ocml/src/cosbF.cl index 10aab950..60e1f415 100644 --- a/ocml/src/cosbF.cl +++ b/ocml/src/cosbF.cl @@ -24,42 +24,37 @@ L = __e; \ } while (0) -INLINEATTR float +float MATH_PRIVATE(cosb)(float x, int n, float p) { + struct redret r = MATH_PRIVATE(trigred)(x); + bool b = r.hi < p; + r.i = (r.i - b - n) & 3; #if defined EXTRA_PRECISION - float ph, pl, rh, rl, sh, sl; - int i = MATH_PRIVATE(trigred)(&rh, &rl, x); - bool b = rh < p; - i = (i - b - n) & 3; + float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); + float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0)); - ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); - pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0)); + float sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); - FSUM2(ph, rh, sh, sl); - sl += pl + rl; + FSUM2(ph, r.hi, sh, sl); + sl += pl + r.lo; FSUM2(sh, sl, sh, sl); - float cc; - float ss = -MATH_PRIVATE(sincosred2)(sh, sl, &cc); + struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); #else - float r; - int i = MATH_PRIVATE(trigred)(&r, x); - bool b = r < p; - i = (i - b - n) & 3; - r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); + r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); - float cc; - float ss = -MATH_PRIVATE(sincosred)(r, &cc); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif + sc.s = -sc.s; - float c = (i & 1) != 0 ? ss : cc; - c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0)); + float c = (r.i & 1) != 0 ? sc.s : sc.c; + c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 
0x80000000 : 0)); return c; } diff --git a/ocml/src/coshD.cl b/ocml/src/coshD.cl index fe1a676d..da1c54a4 100644 --- a/ocml/src/coshD.cl +++ b/ocml/src/coshD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(cosh)(double x) { x = BUILTIN_ABS_F64(x); diff --git a/ocml/src/coshF.cl b/ocml/src/coshF.cl index 425bea9d..ef4c46da 100644 --- a/ocml/src/coshF.cl +++ b/ocml/src/coshF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(cosh)(float x) { x = BUILTIN_ABS_F64(x); diff --git a/ocml/src/coshH.cl b/ocml/src/coshH.cl index 232b8f67..3ddea219 100644 --- a/ocml/src/coshH.cl +++ b/ocml/src/coshH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(cosh) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(cosh)(half hx) { float x = (float)hx * 0x1.715476p+0f; diff --git a/ocml/src/cospiD.cl b/ocml/src/cospiD.cl index 57686b03..fab3bc4d 100644 --- a/ocml/src/cospiD.cl +++ b/ocml/src/cospiD.cl @@ -8,17 +8,15 @@ #include "mathD.h" #include "trigpiredD.h" -INLINEATTR double +double MATH_MANGLE(cospi)(double x) { - double t; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); + sc.s = -sc.s; - double cc; - double ss = -MATH_PRIVATE(sincospired)(t, &cc); - - int2 c = AS_INT2((i & 1) == 0 ? cc : ss); - c.hi ^= i > 1 ? (int)0x80000000 : 0; + int2 c = AS_INT2((r.i & 1) == 0 ? sc.c : sc.s); + c.hi ^= r.i > 1 ? (int)0x80000000 : 0; if (!FINITE_ONLY_OPT()) { c = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
AS_INT2(QNANBITPATT_DP64) : c; diff --git a/ocml/src/cospiF.cl b/ocml/src/cospiF.cl index 1d9ed3ee..90d360d0 100644 --- a/ocml/src/cospiF.cl +++ b/ocml/src/cospiF.cl @@ -8,19 +8,16 @@ #include "mathF.h" #include "trigpiredF.h" -INLINEATTR float +CONSTATTR float MATH_MANGLE(cospi)(float x) { int ax = AS_INT(x) & 0x7fffffff; + struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); + sc.s = -sc.s; - float r; - int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r); - - float cc; - float ss = -MATH_PRIVATE(sincospired)(r, &cc); - - float c = (i & 1) != 0 ? ss : cc; - c = AS_FLOAT(AS_INT(c) ^ (i > 1 ? 0x80000000 : 0)); + float c = (r.i & 1) != 0 ? sc.s : sc.c; + c = AS_FLOAT(AS_INT(c) ^ (r.i > 1 ? 0x80000000 : 0)); if (!FINITE_ONLY_OPT()) { c = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : c; diff --git a/ocml/src/cospiH.cl b/ocml/src/cospiH.cl index 830bc239..3f55c79e 100644 --- a/ocml/src/cospiH.cl +++ b/ocml/src/cospiH.cl @@ -10,17 +10,15 @@ UGEN(cospi) -INLINEATTR half +half MATH_MANGLE(cospi)(half x) { - half t; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); + sc.s = -sc.s; - half cc; - half ss = -MATH_PRIVATE(sincospired)(t, &cc); - - short c = AS_SHORT((i & (short)1) == (short)0 ? cc : ss); - c ^= i > (short)1 ? (short)0x8000 : (short)0; + short c = AS_SHORT((r.i & (short)1) == (short)0 ? sc.c : sc.s); + c ^= r.i > (short)1 ? (short)0x8000 : (short)0; if (!FINITE_ONLY_OPT()) { c = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
(short)QNANBITPATT_HP16 : c; diff --git a/ocml/src/divD.cl b/ocml/src/divD.cl index 27ae4318..ad7af822 100644 --- a/ocml/src/divD.cl +++ b/ocml/src/divD.cl @@ -7,21 +7,15 @@ #include "mathD.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double x, double y) \ +MATH_MANGLE(LN)(double x, double y) \ { \ - return BUILTIN_FULL_BINARY(fdiv, false, ROUND, x, y); \ + return BUILTIN_##UN##_F64(x, y); \ } -GEN(div_rte, ROUND_TO_NEAREST_EVEN) -GEN(div_rtp, ROUND_TO_POSINF) -GEN(div_rtn, ROUND_TO_NEGINF) -GEN(div_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(div_rte,DIV_RTE) +GEN(div_rtn,DIV_RTN) +GEN(div_rtp,DIV_RTP) +GEN(div_rtz,DIV_RTZ) diff --git a/ocml/src/divF.cl b/ocml/src/divF.cl index 9dcfb511..ce9519ab 100644 --- a/ocml/src/divF.cl +++ b/ocml/src/divF.cl @@ -7,27 +7,15 @@ #include "mathF.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float x, float y) \ +MATH_MANGLE(LN)(float x, float y) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_BINARY(fdivf, true, ROUND, x, y); \ - } else { \ - ret = BUILTIN_FULL_BINARY(fdivf, false, ROUND, x, y); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(x, y); \ } -GEN(div_rte, ROUND_TO_NEAREST_EVEN) -GEN(div_rtp, ROUND_TO_POSINF) -GEN(div_rtn, ROUND_TO_NEGINF) -GEN(div_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(div_rte,DIV_RTE) +GEN(div_rtn,DIV_RTN) +GEN(div_rtp,DIV_RTP) +GEN(div_rtz,DIV_RTZ) diff --git a/ocml/src/divH.cl b/ocml/src/divH.cl index 7ac66449..3a7d17d3 100644 --- a/ocml/src/divH.cl +++ b/ocml/src/divH.cl @@ -7,21 +7,15 @@ #include "mathH.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half x, half y) \ +MATH_MANGLE(LN)(half x, 
half y) \ { \ - return BUILTIN_FULL_BINARY(fdivh, false, ROUND, x, y); \ + return BUILTIN_##UN##_F16(x, y); \ } -GEN(div_rte, ROUND_TO_NEAREST_EVEN) -GEN(div_rtp, ROUND_TO_POSINF) -GEN(div_rtn, ROUND_TO_NEGINF) -GEN(div_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(div_rte,DIV_RTE) +GEN(div_rtn,DIV_RTN) +GEN(div_rtp,DIV_RTP) +GEN(div_rtz,DIV_RTZ) diff --git a/ocml/src/epexpepD.cl b/ocml/src/epexpepD.cl index 292a61b9..f6340e15 100644 --- a/ocml/src/epexpepD.cl +++ b/ocml/src/epexpepD.cl @@ -10,7 +10,7 @@ #define DOUBLE_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR double2 +CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x) { double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0); diff --git a/ocml/src/epexpepF.cl b/ocml/src/epexpepF.cl index abeecc70..1ba48e10 100644 --- a/ocml/src/epexpepF.cl +++ b/ocml/src/epexpepF.cl @@ -10,7 +10,7 @@ #define FLOAT_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR float2 +CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x) { float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f); diff --git a/ocml/src/eplnD.cl b/ocml/src/eplnD.cl index f16b4071..7540e5b9 100644 --- a/ocml/src/eplnD.cl +++ b/ocml/src/eplnD.cl @@ -10,7 +10,7 @@ #define DOUBLE_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR double2 +CONSTATTR double2 MATH_PRIVATE(epln)(double a) { double m = BUILTIN_FREXP_MANT_F64(a); diff --git a/ocml/src/eplnF.cl b/ocml/src/eplnF.cl index 9063d677..b7fef2be 100644 --- a/ocml/src/eplnF.cl +++ b/ocml/src/eplnF.cl @@ -10,7 +10,7 @@ #define FLOAT_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR float2 +CONSTATTR float2 MATH_PRIVATE(epln)(float a) { float m = BUILTIN_FREXP_MANT_F32(a); diff --git a/ocml/src/erfH.cl b/ocml/src/erfH.cl index 883509d8..47c3c353 100644 --- a/ocml/src/erfH.cl +++ b/ocml/src/erfH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(erf) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(erf)(half x) { return (half)MATH_UPMANGLE(erf)((float)x); diff --git a/ocml/src/erfcH.cl 
b/ocml/src/erfcH.cl index 2adc0236..ec7c7b04 100644 --- a/ocml/src/erfcH.cl +++ b/ocml/src/erfcH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(erfc) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(erfc)(half x) { return (half)MATH_UPMANGLE(erfc)((float)x); diff --git a/ocml/src/erfcinvH.cl b/ocml/src/erfcinvH.cl index 8050709b..6258a9b9 100644 --- a/ocml/src/erfcinvH.cl +++ b/ocml/src/erfcinvH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(erfcinv) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(erfcinv)(half x) { return (half)MATH_UPMANGLE(erfcinv)((float)x); diff --git a/ocml/src/erfcxH.cl b/ocml/src/erfcxH.cl index eb064e47..9fa79b5f 100644 --- a/ocml/src/erfcxH.cl +++ b/ocml/src/erfcxH.cl @@ -3,7 +3,7 @@ PUREATTR UGEN(erfcx) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(erfcx)(half x) { return (half)MATH_UPMANGLE(erfcx)((float)x); diff --git a/ocml/src/erfinvH.cl b/ocml/src/erfinvH.cl index 60238709..18317b51 100644 --- a/ocml/src/erfinvH.cl +++ b/ocml/src/erfinvH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(erfinv) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(erfinv)(half x) { return (half)MATH_UPMANGLE(erfinv)((float)x); diff --git a/ocml/src/exp10H.cl b/ocml/src/exp10H.cl index d376414e..94a50ce2 100644 --- a/ocml/src/exp10H.cl +++ b/ocml/src/exp10H.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(exp10) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(exp10)(half x) { return (half)BUILTIN_EXP2_F32((float)x * 0x1.a934f0p+1f); diff --git a/ocml/src/exp2H.cl b/ocml/src/exp2H.cl index a8b72ff3..b6afa724 100644 --- a/ocml/src/exp2H.cl +++ b/ocml/src/exp2H.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(exp2) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(exp2)(half x) { return BUILTIN_EXP2_F16(x); diff --git a/ocml/src/expF_base.h b/ocml/src/expF_base.h index 9c42d5fe..08bde388 100644 --- a/ocml/src/expF_base.h +++ b/ocml/src/expF_base.h @@ -32,7 +32,7 @@ // // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) ) -PUREATTR INLINEATTR float +PUREATTR float #if defined COMPILING_EXP2 MATH_MANGLE(exp2)(float 
x) #elif defined COMPILING_EXP10 diff --git a/ocml/src/expH.cl b/ocml/src/expH.cl index 1ff4a024..caa3a4ac 100644 --- a/ocml/src/expH.cl +++ b/ocml/src/expH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(exp) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(exp)(half x) { return (half)BUILTIN_EXP2_F32((float)x * 0x1.715476p+0f); diff --git a/ocml/src/expepD.cl b/ocml/src/expepD.cl index f8d4fd95..859a023d 100644 --- a/ocml/src/expepD.cl +++ b/ocml/src/expepD.cl @@ -10,7 +10,7 @@ #define DOUBLE_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR double +CONSTATTR double MATH_PRIVATE(expep)(double2 x) { double dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0); diff --git a/ocml/src/expepF.cl b/ocml/src/expepF.cl index bf585b47..657267a2 100644 --- a/ocml/src/expepF.cl +++ b/ocml/src/expepF.cl @@ -10,7 +10,7 @@ #define FLOAT_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR float +CONSTATTR float MATH_PRIVATE(expep)(float2 x) { float fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f); diff --git a/ocml/src/expm1F.cl b/ocml/src/expm1F.cl index 583a7a11..31ac3b89 100644 --- a/ocml/src/expm1F.cl +++ b/ocml/src/expm1F.cl @@ -12,7 +12,7 @@ extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(expm1)(float x) { float2 e = sub(MATH_PRIVATE(epexpep)(con(x, 0.0f)), 1.0f); diff --git a/ocml/src/expm1H.cl b/ocml/src/expm1H.cl index c04c6c84..79498be4 100644 --- a/ocml/src/expm1H.cl +++ b/ocml/src/expm1H.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(expm1) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(expm1)(half x) { half ret; diff --git a/ocml/src/fabsD.cl b/ocml/src/fabsD.cl index 2c5332c6..9052cd01 100644 --- a/ocml/src/fabsD.cl +++ b/ocml/src/fabsD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(fabs)(double x) { return BUILTIN_ABS_F64(x); diff --git a/ocml/src/fabsF.cl b/ocml/src/fabsF.cl index 444e9075..957cb79f 100644 --- a/ocml/src/fabsF.cl +++ b/ocml/src/fabsF.cl @@ -7,7 +7,7 @@ 
#include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(fabs)(float x) { return BUILTIN_ABS_F32(x); diff --git a/ocml/src/fabsH.cl b/ocml/src/fabsH.cl index 9cd7dbbd..1504bb6a 100644 --- a/ocml/src/fabsH.cl +++ b/ocml/src/fabsH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(fabs)(half2 x) { return BUILTIN_ABS_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(fabs)(half x) { return BUILTIN_ABS_F16(x); diff --git a/ocml/src/fdimD.cl b/ocml/src/fdimD.cl index 8214203e..cc7255c3 100644 --- a/ocml/src/fdimD.cl +++ b/ocml/src/fdimD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(fdim)(double x, double y) { long d = AS_LONG(x - y); diff --git a/ocml/src/fdimF.cl b/ocml/src/fdimF.cl index 9e418b24..968eb908 100644 --- a/ocml/src/fdimF.cl +++ b/ocml/src/fdimF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(fdim)(float x, float y) { if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/fdimH.cl b/ocml/src/fdimH.cl index 304c96ab..989f8213 100644 --- a/ocml/src/fdimH.cl +++ b/ocml/src/fdimH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(fdim) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(fdim)(half x, half y) { if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/floorD.cl b/ocml/src/floorD.cl index 8fd637da..2fc2375d 100644 --- a/ocml/src/floorD.cl +++ b/ocml/src/floorD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(floor)(double x) { return BUILTIN_FLOOR_F64(x); diff --git a/ocml/src/floorF.cl b/ocml/src/floorF.cl index 3364960a..e8b6d3ef 100644 --- a/ocml/src/floorF.cl +++ b/ocml/src/floorF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(floor)(float x) { return BUILTIN_FLOOR_F32(x); diff --git a/ocml/src/floorH.cl b/ocml/src/floorH.cl index 16c84eee..f563e648 100644 --- a/ocml/src/floorH.cl +++ 
b/ocml/src/floorH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(floor)(half2 x) { return BUILTIN_FLOOR_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(floor)(half x) { return BUILTIN_FLOOR_F16(x); diff --git a/ocml/src/fmaD.cl b/ocml/src/fmaD.cl index 15d596dc..0a526fe8 100644 --- a/ocml/src/fmaD.cl +++ b/ocml/src/fmaD.cl @@ -13,21 +13,15 @@ MATH_MANGLE(fma)(double a, double b, double c) return BUILTIN_FMA_F64(a, b, c); } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double a, double b, double c) \ +MATH_MANGLE(LN)(double a, double b, double c) \ { \ - return BUILTIN_FULL_TERNARY(ffma, false, ROUND, a, b, c); \ + return BUILTIN_##UN##_F64(a, b, c); \ } -GEN(fma_rte, ROUND_TO_NEAREST_EVEN) -GEN(fma_rtp, ROUND_TO_POSINF) -GEN(fma_rtn, ROUND_TO_NEGINF) -GEN(fma_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(fma_rte,FMA_RTE) +GEN(fma_rtn,FMA_RTN) +GEN(fma_rtp,FMA_RTP) +GEN(fma_rtz,FMA_RTZ) diff --git a/ocml/src/fmaF.cl b/ocml/src/fmaF.cl index 3974f317..052acae0 100644 --- a/ocml/src/fmaF.cl +++ b/ocml/src/fmaF.cl @@ -5,6 +5,7 @@ * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ +#include "irif.h" #include "mathF.h" CONSTATTR float @@ -13,27 +14,15 @@ MATH_MANGLE(fma)(float a, float b, float c) return BUILTIN_FMA_F32(a, b, c); } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float a, float b, float c) \ +MATH_MANGLE(LN)(float a, float b, float c) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_TERNARY(ffmaf, true, ROUND, a, b, c); \ - } else { \ - ret = BUILTIN_FULL_TERNARY(ffmaf, false, ROUND, a, b, c); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(a, b, c); \ } -GEN(fma_rte, ROUND_TO_NEAREST_EVEN) -GEN(fma_rtp, ROUND_TO_POSINF) -GEN(fma_rtn, ROUND_TO_NEGINF) -GEN(fma_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(fma_rte,FMA_RTE) +GEN(fma_rtn,FMA_RTN) +GEN(fma_rtp,FMA_RTP) +GEN(fma_rtz,FMA_RTZ) diff --git a/ocml/src/fmaH.cl b/ocml/src/fmaH.cl index c34f1781..03bacf72 100644 --- a/ocml/src/fmaH.cl +++ b/ocml/src/fmaH.cl @@ -19,21 +19,15 @@ MATH_MANGLE(fma)(half a, half b, half c) return BUILTIN_FMA_F16(a, b, c); } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half a, half b, half c) \ +MATH_MANGLE(LN)(half a, half b, half c) \ { \ - return BUILTIN_FULL_TERNARY(ffmah, false, ROUND, a, b, c); \ + return BUILTIN_##UN##_F16(a, b, c); \ } -GEN(fma_rte, ROUND_TO_NEAREST_EVEN) -GEN(fma_rtp, ROUND_TO_POSINF) -GEN(fma_rtn, ROUND_TO_NEGINF) -GEN(fma_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(fma_rte,FMA_RTE) +GEN(fma_rtn,FMA_RTN) +GEN(fma_rtp,FMA_RTP) +GEN(fma_rtz,FMA_RTZ) diff --git a/ocml/src/fmaxD.cl b/ocml/src/fmaxD.cl index 97a80466..fa8fc448 100644 --- a/ocml/src/fmaxD.cl +++ b/ocml/src/fmaxD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double 
MATH_MANGLE(fmax)(double x, double y) { return BUILTIN_MAX_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y)); diff --git a/ocml/src/fmaxF.cl b/ocml/src/fmaxF.cl index d96a4c34..7fa39a8a 100644 --- a/ocml/src/fmaxF.cl +++ b/ocml/src/fmaxF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(fmax)(float x, float y) { float ret; diff --git a/ocml/src/fmaxH.cl b/ocml/src/fmaxH.cl index f6817006..552be89f 100644 --- a/ocml/src/fmaxH.cl +++ b/ocml/src/fmaxH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(fmax)(half2 x, half2 y) { return BUILTIN_MAX_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y)); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(fmax)(half x, half y) { return BUILTIN_MAX_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y)); diff --git a/ocml/src/fminD.cl b/ocml/src/fminD.cl index 0ff01127..04fba1fb 100644 --- a/ocml/src/fminD.cl +++ b/ocml/src/fminD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(fmin)(double x, double y) { return BUILTIN_MIN_F64(BUILTIN_CANONICALIZE_F64(x), BUILTIN_CANONICALIZE_F64(y)); diff --git a/ocml/src/fminF.cl b/ocml/src/fminF.cl index ffd6f40a..e979e18e 100644 --- a/ocml/src/fminF.cl +++ b/ocml/src/fminF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(fmin)(float x, float y) { float ret; diff --git a/ocml/src/fminH.cl b/ocml/src/fminH.cl index 6da1fb55..76398429 100644 --- a/ocml/src/fminH.cl +++ b/ocml/src/fminH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(fmin)(half2 x, half2 y) { return BUILTIN_MIN_2F16(BUILTIN_CANONICALIZE_2F16(x), BUILTIN_CANONICALIZE_2F16(y)); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(fmin)(half x, half y) { return BUILTIN_MIN_F16(BUILTIN_CANONICALIZE_F16(x), BUILTIN_CANONICALIZE_F16(y)); diff --git 
a/ocml/src/fpclassifyD.cl b/ocml/src/fpclassifyD.cl index cfefa9d5..8db6b992 100644 --- a/ocml/src/fpclassifyD.cl +++ b/ocml/src/fpclassifyD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(fpclassify)(double x) { int ret = BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN; diff --git a/ocml/src/fpclassifyF.cl b/ocml/src/fpclassifyF.cl index 824c140e..50a84783 100644 --- a/ocml/src/fpclassifyF.cl +++ b/ocml/src/fpclassifyF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(fpclassify)(float x) { int ret = BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN; diff --git a/ocml/src/fpclassifyH.cl b/ocml/src/fpclassifyH.cl index 20d34897..a9c2d928 100644 --- a/ocml/src/fpclassifyH.cl +++ b/ocml/src/fpclassifyH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(fpclassify)(half x) { int ret = BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF) ? FP_INFINITE : FP_NAN; diff --git a/ocml/src/fractD.cl b/ocml/src/fractD.cl index 720e3e23..e4b75aec 100644 --- a/ocml/src/fractD.cl +++ b/ocml/src/fractD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR double +double MATH_MANGLE(fract)(double x, __private double *ip) { double i = BUILTIN_FLOOR_F64(x); diff --git a/ocml/src/fractF.cl b/ocml/src/fractF.cl index 9b03b797..b65b517c 100644 --- a/ocml/src/fractF.cl +++ b/ocml/src/fractF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR float +float MATH_MANGLE(fract)(float x, __private float *ip) { float i = BUILTIN_FLOOR_F32(x); diff --git a/ocml/src/fractH.cl b/ocml/src/fractH.cl index ba127717..2cda3a5c 100644 --- a/ocml/src/fractH.cl +++ b/ocml/src/fractH.cl @@ -7,14 +7,14 @@ #include "mathH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(fract)(half2 x, __private half2 *ip) { *ip = BUILTIN_FLOOR_2F16(x); return (half2)(BUILTIN_FRACTION_F16(x.lo), BUILTIN_FRACTION_F16(x.hi)); } -INLINEATTR half +half MATH_MANGLE(fract)(half x, __private half 
*ip) { *ip = BUILTIN_FLOOR_F16(x); diff --git a/ocml/src/frexpD.cl b/ocml/src/frexpD.cl index b3deeb64..4f9d252d 100644 --- a/ocml/src/frexpD.cl +++ b/ocml/src/frexpD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR double +double MATH_MANGLE(frexp)(double x, __private int *ep) { int e = BUILTIN_FREXP_EXP_F64(x); diff --git a/ocml/src/frexpF.cl b/ocml/src/frexpF.cl index e29554ba..c5b0b84b 100644 --- a/ocml/src/frexpF.cl +++ b/ocml/src/frexpF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR float +float MATH_MANGLE(frexp)(float x, __private int *ep) { int e = BUILTIN_FREXP_EXP_F32(x); diff --git a/ocml/src/frexpH.cl b/ocml/src/frexpH.cl index a5e43691..a4bc6e3c 100644 --- a/ocml/src/frexpH.cl +++ b/ocml/src/frexpH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(frexp)(half2 x, __private int2 *ep) { int elo, ehi; @@ -18,7 +18,7 @@ MATH_MANGLE2(frexp)(half2 x, __private int2 *ep) return r; } -INLINEATTR half +half MATH_MANGLE(frexp)(half x, __private int *ep) { int e = (int)BUILTIN_FREXP_EXP_F16(x); diff --git a/ocml/src/hypotD.cl b/ocml/src/hypotD.cl index 405720f4..fd99614a 100644 --- a/ocml/src/hypotD.cl +++ b/ocml/src/hypotD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(hypot)(double x, double y) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/hypotF.cl b/ocml/src/hypotF.cl index adca99ea..2b697a32 100644 --- a/ocml/src/hypotF.cl +++ b/ocml/src/hypotF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(hypot)(float x, float y) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/hypotH.cl b/ocml/src/hypotH.cl index dc0dad36..66b7811a 100644 --- a/ocml/src/hypotH.cl +++ b/ocml/src/hypotH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(hypot) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(hypot)(half x, half y) { float fx = (float)x; diff --git a/ocml/src/i0H.cl b/ocml/src/i0H.cl index f42f1482..913942f5 100644 --- a/ocml/src/i0H.cl +++ 
b/ocml/src/i0H.cl @@ -9,7 +9,7 @@ UGEN(i0) -INLINEATTR half +half MATH_MANGLE(i0)(half x) { return (half)MATH_UPMANGLE(i0)((float)x); diff --git a/ocml/src/i1H.cl b/ocml/src/i1H.cl index 09b74c6d..d778626b 100644 --- a/ocml/src/i1H.cl +++ b/ocml/src/i1H.cl @@ -9,7 +9,7 @@ UGEN(i1) -INLINEATTR half +half MATH_MANGLE(i1)(half x) { return (half)MATH_UPMANGLE(i1)((float)x); diff --git a/ocml/src/ilogbD.cl b/ocml/src/ilogbD.cl index 95ce66fc..0f0b9ace 100644 --- a/ocml/src/ilogbD.cl +++ b/ocml/src/ilogbD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(ilogb)(double x) { int r = BUILTIN_FREXP_EXP_F64(x) - 1; diff --git a/ocml/src/ilogbF.cl b/ocml/src/ilogbF.cl index e84537b8..1a7e1d1d 100644 --- a/ocml/src/ilogbF.cl +++ b/ocml/src/ilogbF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(ilogb)(float x) { int r = BUILTIN_FREXP_EXP_F32(x) - 1; diff --git a/ocml/src/ilogbH.cl b/ocml/src/ilogbH.cl index a5aeef18..d7a274e4 100644 --- a/ocml/src/ilogbH.cl +++ b/ocml/src/ilogbH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR int2 +CONSTATTR int2 MATH_MANGLE2(ilogb)(half2 x) { return (int2)(MATH_MANGLE(ilogb)(x.lo), MATH_MANGLE(ilogb)(x.hi)); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(ilogb)(half x) { int r = (int)BUILTIN_FREXP_EXP_F16(x) - 1; diff --git a/ocml/src/isfiniteD.cl b/ocml/src/isfiniteD.cl index 489a390f..bdca20d5 100644 --- a/ocml/src/isfiniteD.cl +++ b/ocml/src/isfiniteD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isfinite)(double x) { return BUILTIN_CLASS_F64(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR); diff --git a/ocml/src/isfiniteF.cl b/ocml/src/isfiniteF.cl index 11227450..421ab1a1 100644 --- a/ocml/src/isfiniteF.cl +++ b/ocml/src/isfiniteF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isfinite)(float x) { return BUILTIN_CLASS_F32(x, 
CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR); diff --git a/ocml/src/isfiniteH.cl b/ocml/src/isfiniteH.cl index c2b62152..dce82701 100644 --- a/ocml/src/isfiniteH.cl +++ b/ocml/src/isfiniteH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR short2 +CONSTATTR short2 MATH_MANGLE2(isfinite)(half2 x) { return (short2) @@ -15,7 +15,7 @@ MATH_MANGLE2(isfinite)(half2 x) BUILTIN_CLASS_F16(x.hi, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR) ? (short)-1 : (short)0); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isfinite)(half x) { return BUILTIN_CLASS_F16(x, CLASS_NNOR|CLASS_NSUB|CLASS_NZER|CLASS_PZER|CLASS_PSUB|CLASS_PNOR); diff --git a/ocml/src/isinfD.cl b/ocml/src/isinfD.cl index 00822a9b..bf33343e 100644 --- a/ocml/src/isinfD.cl +++ b/ocml/src/isinfD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isinf)(double x) { return BUILTIN_CLASS_F64(x, CLASS_PINF|CLASS_NINF); diff --git a/ocml/src/isinfF.cl b/ocml/src/isinfF.cl index 4a0bda85..0a408cd1 100644 --- a/ocml/src/isinfF.cl +++ b/ocml/src/isinfF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isinf)(float x) { return BUILTIN_CLASS_F32(x, CLASS_PINF|CLASS_NINF); diff --git a/ocml/src/isinfH.cl b/ocml/src/isinfH.cl index db18b9b7..d2978f02 100644 --- a/ocml/src/isinfH.cl +++ b/ocml/src/isinfH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR short2 +CONSTATTR short2 MATH_MANGLE2(isinf)(half2 x) { return (short2) @@ -15,7 +15,7 @@ MATH_MANGLE2(isinf)(half2 x) BUILTIN_CLASS_F16(x.hi, CLASS_PINF|CLASS_NINF) ? 
(short)-1 : (short)0); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isinf)(half x) { return BUILTIN_CLASS_F16(x, CLASS_PINF|CLASS_NINF); diff --git a/ocml/src/isnanD.cl b/ocml/src/isnanD.cl index d1f1b03d..12400473 100644 --- a/ocml/src/isnanD.cl +++ b/ocml/src/isnanD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnan)(double x) { return BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN); diff --git a/ocml/src/isnanF.cl b/ocml/src/isnanF.cl index 5e305755..47fc9910 100644 --- a/ocml/src/isnanF.cl +++ b/ocml/src/isnanF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnan)(float x) { return BUILTIN_CLASS_F32(x, CLASS_SNAN|CLASS_QNAN); diff --git a/ocml/src/isnanH.cl b/ocml/src/isnanH.cl index 8eb1b8e2..d831c3e8 100644 --- a/ocml/src/isnanH.cl +++ b/ocml/src/isnanH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR short2 +CONSTATTR short2 MATH_MANGLE2(isnan)(half2 x) { return (short2) @@ -15,7 +15,7 @@ MATH_MANGLE2(isnan)(half2 x) BUILTIN_CLASS_F16(x.hi, CLASS_SNAN|CLASS_QNAN) ? 
(short)-1 : (short)0); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnan)(half x) { return BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN); diff --git a/ocml/src/isnormalD.cl b/ocml/src/isnormalD.cl index 74907904..55799a17 100644 --- a/ocml/src/isnormalD.cl +++ b/ocml/src/isnormalD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnormal)(double x) { return BUILTIN_CLASS_F64(x, CLASS_PNOR|CLASS_NNOR); diff --git a/ocml/src/isnormalF.cl b/ocml/src/isnormalF.cl index 2e717e4b..9c640286 100644 --- a/ocml/src/isnormalF.cl +++ b/ocml/src/isnormalF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnormal)(float x) { return BUILTIN_CLASS_F32(x, CLASS_PNOR|CLASS_NNOR); diff --git a/ocml/src/isnormalH.cl b/ocml/src/isnormalH.cl index 1c0325a3..c33d9092 100644 --- a/ocml/src/isnormalH.cl +++ b/ocml/src/isnormalH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR short2 +CONSTATTR short2 MATH_MANGLE2(isnormal)(half2 x) { return (short2) @@ -15,7 +15,7 @@ MATH_MANGLE2(isnormal)(half2 x) BUILTIN_CLASS_F16(x.hi, CLASS_PNOR|CLASS_NNOR) ? 
(short)-1 : (short)0); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(isnormal)(half x) { return BUILTIN_CLASS_F16(x, CLASS_PNOR|CLASS_NNOR); diff --git a/ocml/src/j0H.cl b/ocml/src/j0H.cl index f61b3fca..83feff6f 100644 --- a/ocml/src/j0H.cl +++ b/ocml/src/j0H.cl @@ -9,7 +9,7 @@ UGEN(j0) -INLINEATTR half +half MATH_MANGLE(j0)(half x) { return (half)MATH_UPMANGLE(j0)((float)x); diff --git a/ocml/src/j1H.cl b/ocml/src/j1H.cl index 7cbaddf4..557038f2 100644 --- a/ocml/src/j1H.cl +++ b/ocml/src/j1H.cl @@ -9,7 +9,7 @@ UGEN(j1) -INLINEATTR half +half MATH_MANGLE(j1)(half x) { return (half)MATH_UPMANGLE(j1)((float)x); diff --git a/ocml/src/ldexpD.cl b/ocml/src/ldexpD.cl index 1cf0e093..7ba48285 100644 --- a/ocml/src/ldexpD.cl +++ b/ocml/src/ldexpD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(ldexp)(double x, int n) { return BUILTIN_FLDEXP_F64(x, n); diff --git a/ocml/src/ldexpF.cl b/ocml/src/ldexpF.cl index 435848aa..29a1da28 100644 --- a/ocml/src/ldexpF.cl +++ b/ocml/src/ldexpF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(ldexp)(float x, int n) { return BUILTIN_FLDEXP_F32(x, n); diff --git a/ocml/src/ldexpH.cl b/ocml/src/ldexpH.cl index 7f06e7ef..d4d57043 100644 --- a/ocml/src/ldexpH.cl +++ b/ocml/src/ldexpH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(ldexp)(half2 x, int2 n) { return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi)); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(ldexp)(half x, int n) { return BUILTIN_FLDEXP_F16(x, BUILTIN_CLAMP_S32(n, SHRT_MIN, SHRT_MAX)); diff --git a/ocml/src/len3D.cl b/ocml/src/len3D.cl index d0c6e811..fee8e9db 100644 --- a/ocml/src/len3D.cl +++ b/ocml/src/len3D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(len3)(double x, double y, double z) { double a = BUILTIN_ABS_F64(x); diff --git 
a/ocml/src/len3F.cl b/ocml/src/len3F.cl index f2ab9125..bb14ee5a 100644 --- a/ocml/src/len3F.cl +++ b/ocml/src/len3F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(len3)(float x, float y, float z) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/len3H.cl b/ocml/src/len3H.cl index 32248780..bb6ef92c 100644 --- a/ocml/src/len3H.cl +++ b/ocml/src/len3H.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(len3)(half x, half y, half z) { float fx = (float)x; diff --git a/ocml/src/len4D.cl b/ocml/src/len4D.cl index 4fe8b898..b05f0cad 100644 --- a/ocml/src/len4D.cl +++ b/ocml/src/len4D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(len4)(double x, double y, double z, double w) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/len4F.cl b/ocml/src/len4F.cl index c80e4c0c..24231618 100644 --- a/ocml/src/len4F.cl +++ b/ocml/src/len4F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(len4)(float x, float y, float z, float w) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/len4H.cl b/ocml/src/len4H.cl index 6fee1090..9b320c78 100644 --- a/ocml/src/len4H.cl +++ b/ocml/src/len4H.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(len4)(half x, half y, half z, half w) { float fx = (float)x; diff --git a/ocml/src/lgammaD.cl b/ocml/src/lgammaD.cl index 4a9849e9..69e50258 100644 --- a/ocml/src/lgammaD.cl +++ b/ocml/src/lgammaD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR double +double MATH_MANGLE(lgamma)(double x) { int s; diff --git a/ocml/src/lgammaF.cl b/ocml/src/lgammaF.cl index 2f53d18c..4a113c1d 100644 --- a/ocml/src/lgammaF.cl +++ b/ocml/src/lgammaF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR float +float MATH_MANGLE(lgamma)(float x) { int s; diff --git a/ocml/src/lgammaH.cl b/ocml/src/lgammaH.cl index 6472f9f6..81a0fcec 100644 
--- a/ocml/src/lgammaH.cl +++ b/ocml/src/lgammaH.cl @@ -9,7 +9,7 @@ UGEN(lgamma) -INLINEATTR half +half MATH_MANGLE(lgamma)(half x) { int s; diff --git a/ocml/src/lgamma_rH.cl b/ocml/src/lgamma_rH.cl index 377721d9..b1f6d485 100644 --- a/ocml/src/lgamma_rH.cl +++ b/ocml/src/lgamma_rH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp) { int slo, shi; @@ -18,7 +18,7 @@ MATH_MANGLE2(lgamma_r)(half2 x, __private int2 *signp) return r; } -INLINEATTR half +half MATH_MANGLE(lgamma_r)(half x, __private int *signp) { return (half)MATH_UPMANGLE(lgamma_r)((float)x, signp); diff --git a/ocml/src/lnepD.cl b/ocml/src/lnepD.cl index 6bece3e7..dfe4484d 100644 --- a/ocml/src/lnepD.cl +++ b/ocml/src/lnepD.cl @@ -10,7 +10,7 @@ #define DOUBLE_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR double +CONSTATTR double MATH_PRIVATE(lnep)(double2 a) { int b = BUILTIN_FREXP_MANT_F64(a.hi) < (2.0/3.0); diff --git a/ocml/src/lnepF.cl b/ocml/src/lnepF.cl index 65675582..0c4502f2 100644 --- a/ocml/src/lnepF.cl +++ b/ocml/src/lnepF.cl @@ -10,7 +10,7 @@ #define FLOAT_SPECIALIZATION #include "ep.h" -INLINEATTR CONSTATTR float +CONSTATTR float MATH_PRIVATE(lnep)(float2 a) { int b = BUILTIN_FREXP_MANT_F32(a.hi) < (2.0f/3.0f); diff --git a/ocml/src/log10H.cl b/ocml/src/log10H.cl index 1fbf60ae..2a03ff02 100644 --- a/ocml/src/log10H.cl +++ b/ocml/src/log10H.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(log10) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(log10)(half x) { return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.344136p-2f); diff --git a/ocml/src/log1pD.cl b/ocml/src/log1pD.cl index c5f5252f..240b4626 100644 --- a/ocml/src/log1pD.cl +++ b/ocml/src/log1pD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double MATH_PRIVATE(lnep)(double2 x); #define DOUBLE_SPECIALIZATION #include "ep.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(log1p)(double x) { double z = MATH_PRIVATE(lnep)(add(1.0, x)); diff --git a/ocml/src/log1pF.cl 
b/ocml/src/log1pF.cl index b4584519..ce8a5a2b 100644 --- a/ocml/src/log1pF.cl +++ b/ocml/src/log1pF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float MATH_PRIVATE(lnep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(log1p)(float x) { float z = MATH_PRIVATE(lnep)(add(1.0, x)); diff --git a/ocml/src/log1pH.cl b/ocml/src/log1pH.cl index da274acf..51b5ff7c 100644 --- a/ocml/src/log1pH.cl +++ b/ocml/src/log1pH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(log1p) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(log1p)(half x) { half ret; diff --git a/ocml/src/log2H.cl b/ocml/src/log2H.cl index 3d38a9f6..4a46e968 100644 --- a/ocml/src/log2H.cl +++ b/ocml/src/log2H.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(log2) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(log2)(half x) { return BUILTIN_LOG2_F16(x); diff --git a/ocml/src/logF_base.h b/ocml/src/logF_base.h index cddad305..763623ab 100644 --- a/ocml/src/logF_base.h +++ b/ocml/src/logF_base.h @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR CONSTATTR float +CONSTATTR float #if defined COMPILING_LOG2 MATH_MANGLE(log2)(float x) #elif defined COMPILING_LOG10 diff --git a/ocml/src/logH.cl b/ocml/src/logH.cl index d62fecc4..08439ff5 100644 --- a/ocml/src/logH.cl +++ b/ocml/src/logH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(log) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(log)(half x) { return (half)(BUILTIN_LOG2_F32((float)x) * 0x1.62e430p-1f); diff --git a/ocml/src/logbD.cl b/ocml/src/logbD.cl index cbc52224..2b859853 100644 --- a/ocml/src/logbD.cl +++ b/ocml/src/logbD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(logb)(double x) { double ret = (double)(BUILTIN_FREXP_EXP_F64(x) - 1); diff --git a/ocml/src/logbF.cl b/ocml/src/logbF.cl index f7154d4f..0e6cb740 100644 --- a/ocml/src/logbF.cl +++ b/ocml/src/logbF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(logb)(float x) { float ret = (float)(BUILTIN_FREXP_EXP_F32(x) - 1); diff 
--git a/ocml/src/logbH.cl b/ocml/src/logbH.cl index 656d07b0..49af766e 100644 --- a/ocml/src/logbH.cl +++ b/ocml/src/logbH.cl @@ -9,7 +9,7 @@ CONSTATTR UGEN(logb) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(logb)(half x) { half ret = (half)(BUILTIN_FREXP_EXP_F16(x) - (short)1); diff --git a/ocml/src/madD.cl b/ocml/src/madD.cl index e5573141..293e3fce 100644 --- a/ocml/src/madD.cl +++ b/ocml/src/madD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(mad)(double a, double b, double c) { return MATH_MAD(a, b, c); diff --git a/ocml/src/madF.cl b/ocml/src/madF.cl index b1f67ec4..06546b44 100644 --- a/ocml/src/madF.cl +++ b/ocml/src/madF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(mad)(float a, float b, float c) { return MATH_MAD(a, b, c); diff --git a/ocml/src/madH.cl b/ocml/src/madH.cl index 707f99ac..4f3d393f 100644 --- a/ocml/src/madH.cl +++ b/ocml/src/madH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(mad)(half2 a, half2 b, half2 c) { return MATH_MAD2(a, b, c); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(mad)(half a, half b, half c) { return MATH_MAD(a, b, c); diff --git a/ocml/src/maxD.cl b/ocml/src/maxD.cl index 49b3dccb..7c6664b0 100644 --- a/ocml/src/maxD.cl +++ b/ocml/src/maxD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(max)(double x, double y) { return BUILTIN_CMAX_F64(x, y); diff --git a/ocml/src/maxF.cl b/ocml/src/maxF.cl index 6e3e17ba..4cd0bfa9 100644 --- a/ocml/src/maxF.cl +++ b/ocml/src/maxF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(max)(float x, float y) { return BUILTIN_CMAX_F32(x, y); diff --git a/ocml/src/maxH.cl b/ocml/src/maxH.cl index 31cad270..01479c8a 100644 --- a/ocml/src/maxH.cl +++ b/ocml/src/maxH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 
+CONSTATTR half2 MATH_MANGLE2(max)(half2 x, half2 y) { return BUILTIN_CMAX_2F16(x, y); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(max)(half x, half y) { return BUILTIN_CMAX_F16(x, y); diff --git a/ocml/src/maxmagD.cl b/ocml/src/maxmagD.cl index 9f606da8..3db12aad 100644 --- a/ocml/src/maxmagD.cl +++ b/ocml/src/maxmagD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(maxmag)(double x, double y) { #if 0 diff --git a/ocml/src/maxmagF.cl b/ocml/src/maxmagF.cl index 4997bd06..941fbe4b 100644 --- a/ocml/src/maxmagF.cl +++ b/ocml/src/maxmagF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(maxmag)(float x, float y) { #if 0 diff --git a/ocml/src/maxmagH.cl b/ocml/src/maxmagH.cl index 74ab78aa..9453df4e 100644 --- a/ocml/src/maxmagH.cl +++ b/ocml/src/maxmagH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(maxmag) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(maxmag)(half x, half y) { x = BUILTIN_CANONICALIZE_F16(x); diff --git a/ocml/src/minD.cl b/ocml/src/minD.cl index c2d0b120..151178c2 100644 --- a/ocml/src/minD.cl +++ b/ocml/src/minD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(min)(double x, double y) { return BUILTIN_CMIN_F64(x, y); diff --git a/ocml/src/minF.cl b/ocml/src/minF.cl index 9c5e741b..eb38af70 100644 --- a/ocml/src/minF.cl +++ b/ocml/src/minF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(min)(float x, float y) { return BUILTIN_CMIN_F32(x, y); diff --git a/ocml/src/minH.cl b/ocml/src/minH.cl index 2ed7fa68..2f2eb4d7 100644 --- a/ocml/src/minH.cl +++ b/ocml/src/minH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(min)(half2 x, half2 y) { return BUILTIN_CMIN_2F16(x, y); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(min)(half x, half y) { return BUILTIN_CMIN_F16(x, y); diff --git 
a/ocml/src/minmagD.cl b/ocml/src/minmagD.cl index 80e7e3f4..cb3dbf3d 100644 --- a/ocml/src/minmagD.cl +++ b/ocml/src/minmagD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(minmag)(double x, double y) { #if 0 diff --git a/ocml/src/minmagF.cl b/ocml/src/minmagF.cl index 41fabef4..8994aac0 100644 --- a/ocml/src/minmagF.cl +++ b/ocml/src/minmagF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(minmag)(float x, float y) { #if 0 diff --git a/ocml/src/minmagH.cl b/ocml/src/minmagH.cl index 8b3fd016..e2659945 100644 --- a/ocml/src/minmagH.cl +++ b/ocml/src/minmagH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(minmag) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(minmag)(half x, half y) { x = BUILTIN_CANONICALIZE_F16(x); diff --git a/ocml/src/modfD.cl b/ocml/src/modfD.cl index 317abdc6..6ad02e35 100644 --- a/ocml/src/modfD.cl +++ b/ocml/src/modfD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR double +double MATH_MANGLE(modf)(double x, __private double *iptr) { double tx = BUILTIN_TRUNC_F64(x); diff --git a/ocml/src/modfF.cl b/ocml/src/modfF.cl index 27b33289..7d9b2964 100644 --- a/ocml/src/modfF.cl +++ b/ocml/src/modfF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR float +float MATH_MANGLE(modf)(float x, __private float *iptr) { float tx = BUILTIN_TRUNC_F32(x); diff --git a/ocml/src/modfH.cl b/ocml/src/modfH.cl index 7c40cba9..8c28ef86 100644 --- a/ocml/src/modfH.cl +++ b/ocml/src/modfH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(modf)(half2 x, __private half2 *iptr) { half2 tx = BUILTIN_TRUNC_2F16(x); @@ -18,7 +18,7 @@ MATH_MANGLE2(modf)(half2 x, __private half2 *iptr) return BUILTIN_COPYSIGN_2F16(ret, x); } -INLINEATTR half +half MATH_MANGLE(modf)(half x, __private half *iptr) { half tx = BUILTIN_TRUNC_F16(x); diff --git a/ocml/src/mulD.cl b/ocml/src/mulD.cl index c567b07e..05c8aae6 100644 --- a/ocml/src/mulD.cl +++ b/ocml/src/mulD.cl @@ -7,21 
+7,15 @@ #include "mathD.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double x, double y) \ +MATH_MANGLE(LN)(double x, double y) \ { \ - return BUILTIN_FULL_BINARY(fmul, false, ROUND, x, y); \ + return BUILTIN_##UN##_F64(x, y); \ } -GEN(mul_rte, ROUND_TO_NEAREST_EVEN) -GEN(mul_rtp, ROUND_TO_POSINF) -GEN(mul_rtn, ROUND_TO_NEGINF) -GEN(mul_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(mul_rte,MUL_RTE) +GEN(mul_rtn,MUL_RTN) +GEN(mul_rtp,MUL_RTP) +GEN(mul_rtz,MUL_RTZ) diff --git a/ocml/src/mulF.cl b/ocml/src/mulF.cl index 0a26fa26..4a4e4da0 100644 --- a/ocml/src/mulF.cl +++ b/ocml/src/mulF.cl @@ -7,27 +7,15 @@ #include "mathF.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float x, float y) \ +MATH_MANGLE(LN)(float x, float y) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_BINARY(fmulf, true, ROUND, x, y); \ - } else { \ - ret = BUILTIN_FULL_BINARY(fmulf, false, ROUND, x, y); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(x, y); \ } -GEN(mul_rte, ROUND_TO_NEAREST_EVEN) -GEN(mul_rtp, ROUND_TO_POSINF) -GEN(mul_rtn, ROUND_TO_NEGINF) -GEN(mul_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(mul_rte,MUL_RTE) +GEN(mul_rtn,MUL_RTN) +GEN(mul_rtp,MUL_RTP) +GEN(mul_rtz,MUL_RTZ) diff --git a/ocml/src/mulH.cl b/ocml/src/mulH.cl index 7fcf2141..9d738867 100644 --- a/ocml/src/mulH.cl +++ b/ocml/src/mulH.cl @@ -7,21 +7,15 @@ #include "mathH.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half x, half y) \ +MATH_MANGLE(LN)(half x, half y) \ { \ - return BUILTIN_FULL_BINARY(fmulh, false, ROUND, x, y); \ + return BUILTIN_##UN##_F16(x, y); \ } -GEN(mul_rte, ROUND_TO_NEAREST_EVEN) -GEN(mul_rtp, 
ROUND_TO_POSINF) -GEN(mul_rtn, ROUND_TO_NEGINF) -GEN(mul_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(mul_rte,MUL_RTE) +GEN(mul_rtn,MUL_RTN) +GEN(mul_rtp,MUL_RTP) +GEN(mul_rtz,MUL_RTZ) diff --git a/ocml/src/nanD.cl b/ocml/src/nanD.cl index 439c9654..762365bc 100644 --- a/ocml/src/nanD.cl +++ b/ocml/src/nanD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(nan)(ulong nancode) { return AS_DOUBLE((nancode & MANTBITS_DP64) | QNANBITPATT_DP64); diff --git a/ocml/src/nanF.cl b/ocml/src/nanF.cl index 7fcf09fd..aeb5e530 100644 --- a/ocml/src/nanF.cl +++ b/ocml/src/nanF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(nan)(uint nancode) { return AS_FLOAT(QNANBITPATT_SP32 | (nancode & 0xfffff)); diff --git a/ocml/src/nanH.cl b/ocml/src/nanH.cl index 086c5f6d..b53e48e8 100644 --- a/ocml/src/nanH.cl +++ b/ocml/src/nanH.cl @@ -7,14 +7,14 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(nan)(ushort2 nancode) { ushort2 h = (ushort2)QNANBITPATT_HP16 | (nancode & (ushort2)0x01ff); return AS_HALF2(h); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(nan)(ushort nancode) { ushort h = (ushort)QNANBITPATT_HP16 | (nancode & (ushort)0x01ff); diff --git a/ocml/src/ncdfH.cl b/ocml/src/ncdfH.cl index 1ac2bf9f..cb7bd711 100644 --- a/ocml/src/ncdfH.cl +++ b/ocml/src/ncdfH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(ncdf) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(ncdf)(half x) { return (half)MATH_UPMANGLE(ncdf)((float)x); diff --git a/ocml/src/ncdfinvD.cl b/ocml/src/ncdfinvD.cl index 300f6048..f2e6cfd5 100644 --- a/ocml/src/ncdfinvD.cl +++ b/ocml/src/ncdfinvD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR PUREATTR double +PUREATTR double MATH_MANGLE(ncdfinv)(double x) { return -0x1.6a09e667f3bcdp+0 * MATH_MANGLE(erfcinv)(x + x); diff --git a/ocml/src/ncdfinvF.cl b/ocml/src/ncdfinvF.cl index d04dddd0..9c31025d 100644 --- 
a/ocml/src/ncdfinvF.cl +++ b/ocml/src/ncdfinvF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -INLINEATTR PUREATTR float +PUREATTR float MATH_MANGLE(ncdfinv)(float x) { return -0x1.6a09e6p+0f * MATH_MANGLE(erfcinv)(x + x); diff --git a/ocml/src/ncdfinvH.cl b/ocml/src/ncdfinvH.cl index 3905a68d..8f4fceca 100644 --- a/ocml/src/ncdfinvH.cl +++ b/ocml/src/ncdfinvH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(ncdfinv) -INLINEATTR PUREATTR half +PUREATTR half MATH_MANGLE(ncdfinv)(half x) { return (half)MATH_UPMANGLE(ncdfinv)((float)x); diff --git a/ocml/src/nearbyintD.cl b/ocml/src/nearbyintD.cl index df2d005b..a222532f 100644 --- a/ocml/src/nearbyintD.cl +++ b/ocml/src/nearbyintD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(nearbyint)(double x) { return BUILTIN_RINT_F64(x); diff --git a/ocml/src/nearbyintF.cl b/ocml/src/nearbyintF.cl index 5ae97fff..44be2481 100644 --- a/ocml/src/nearbyintF.cl +++ b/ocml/src/nearbyintF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(nearbyint)(float x) { return BUILTIN_RINT_F32(x); diff --git a/ocml/src/nearbyintH.cl b/ocml/src/nearbyintH.cl index cf2e962c..92c0fa3b 100644 --- a/ocml/src/nearbyintH.cl +++ b/ocml/src/nearbyintH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(nearbyint)(half2 x) { return BUILTIN_RINT_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(nearbyint)(half x) { return BUILTIN_RINT_F16(x); diff --git a/ocml/src/nextafterD.cl b/ocml/src/nextafterD.cl index ee4031c6..aa1add9f 100644 --- a/ocml/src/nextafterD.cl +++ b/ocml/src/nextafterD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(nextafter)(double x, double y) { long ix = AS_LONG(x); diff --git a/ocml/src/nextafterF.cl b/ocml/src/nextafterF.cl index 4ef25bcd..0c4180c5 100644 --- a/ocml/src/nextafterF.cl +++ b/ocml/src/nextafterF.cl @@ -7,7 +7,7 @@ #include "mathF.h" 
-CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(nextafter)(float x, float y) { int ix = AS_INT(x); diff --git a/ocml/src/nextafterH.cl b/ocml/src/nextafterH.cl index d81028ac..517ce81a 100644 --- a/ocml/src/nextafterH.cl +++ b/ocml/src/nextafterH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(nextafter) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(nextafter)(half x, half y) { short ix = AS_SHORT(x); diff --git a/ocml/src/opts.h b/ocml/src/opts.h index c6bb1146..3a07cbc2 100644 --- a/ocml/src/opts.h +++ b/ocml/src/opts.h @@ -7,7 +7,7 @@ #include "oclc.h" -#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801) +#define HAVE_FAST_FMA32() (__oclc_ISA_version() == 701 || __oclc_ISA_version() == 801 || __oclc_ISA_version() >= 900) #define FINITE_ONLY_OPT() __oclc_finite_only_opt() #define UNSAFE_MATH_OPT() __oclc_unsafe_math_opt() #define DAZ_OPT() __oclc_daz_opt() diff --git a/ocml/src/pownH.cl b/ocml/src/pownH.cl index c8c74d31..3604cae6 100644 --- a/ocml/src/pownH.cl +++ b/ocml/src/pownH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -PUREATTR INLINEATTR half2 +PUREATTR half2 MATH_MANGLE2(pown)(half2 x, int2 ny) { return (half2)(MATH_MANGLE(pown)(x.lo, ny.lo), MATH_MANGLE(pown)(x.hi, ny.hi)); diff --git a/ocml/src/rcbrtF.cl b/ocml/src/rcbrtF.cl index 1fd6c9c0..0e393e68 100644 --- a/ocml/src/rcbrtF.cl +++ b/ocml/src/rcbrtF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(rcbrt)(float x) { if (DAZ_OPT()) { diff --git a/ocml/src/remainderF_base.h b/ocml/src/remainderF_base.h index 4422f826..a79ec5d1 100644 --- a/ocml/src/remainderF_base.h +++ b/ocml/src/remainderF_base.h @@ -18,7 +18,7 @@ CLO = MATH_MAD(__ta, __tb, MATH_MAD(__ta, __hb, MATH_MAD(__ha, __tb, MATH_MAD(__ha, __hb, -CHI)))); \ } while (0) -CONSTATTR static inline float +CONSTATTR INLINEATTR static float fnma(float a, float b, float c) { float d; diff --git a/ocml/src/remquoH.cl b/ocml/src/remquoH.cl index 3893dded..18106093 100644 --- 
a/ocml/src/remquoH.cl +++ b/ocml/src/remquoH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(remquo)(half2 x, half2 y, __private int2 *q7p) { int qlo, qhi; diff --git a/ocml/src/rhypotD.cl b/ocml/src/rhypotD.cl index 0524902b..4339b4f5 100644 --- a/ocml/src/rhypotD.cl +++ b/ocml/src/rhypotD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(rhypot)(double x, double y) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/rhypotF.cl b/ocml/src/rhypotF.cl index 56cc0d2f..cdf08f86 100644 --- a/ocml/src/rhypotF.cl +++ b/ocml/src/rhypotF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(rhypot)(float x, float y) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/rhypotH.cl b/ocml/src/rhypotH.cl index d1c571a0..97acf627 100644 --- a/ocml/src/rhypotH.cl +++ b/ocml/src/rhypotH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(rhypot) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(rhypot)(half x, half y) { float fx = (float)x; diff --git a/ocml/src/rintD.cl b/ocml/src/rintD.cl index a43b5ec8..7c3bb107 100644 --- a/ocml/src/rintD.cl +++ b/ocml/src/rintD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(rint)(double x) { return BUILTIN_RINT_F64(x); diff --git a/ocml/src/rintF.cl b/ocml/src/rintF.cl index a95c223b..17254933 100644 --- a/ocml/src/rintF.cl +++ b/ocml/src/rintF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(rint)(float x) { return BUILTIN_RINT_F32(x); diff --git a/ocml/src/rintH.cl b/ocml/src/rintH.cl index fa789d5b..f2ffd3c1 100644 --- a/ocml/src/rintH.cl +++ b/ocml/src/rintH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(rint)(half2 x) { return BUILTIN_RINT_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(rint)(half x) { return BUILTIN_RINT_F16(x); diff --git a/ocml/src/rlen3D.cl 
b/ocml/src/rlen3D.cl index f9442e48..a1081a2c 100644 --- a/ocml/src/rlen3D.cl +++ b/ocml/src/rlen3D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(rlen3)(double x, double y, double z) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/rlen3F.cl b/ocml/src/rlen3F.cl index bf0cab90..03f2c40a 100644 --- a/ocml/src/rlen3F.cl +++ b/ocml/src/rlen3F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(rlen3)(float x, float y, float z) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/rlen3H.cl b/ocml/src/rlen3H.cl index c12755f7..b147b44d 100644 --- a/ocml/src/rlen3H.cl +++ b/ocml/src/rlen3H.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(rlen3)(half x, half y, half z) { float fx = (float)x; diff --git a/ocml/src/rlen4D.cl b/ocml/src/rlen4D.cl index 9c4fe9bf..4d16f943 100644 --- a/ocml/src/rlen4D.cl +++ b/ocml/src/rlen4D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(rlen4)(double x, double y, double z, double w) { double a = BUILTIN_ABS_F64(x); diff --git a/ocml/src/rlen4F.cl b/ocml/src/rlen4F.cl index 733f62d1..e6d7603f 100644 --- a/ocml/src/rlen4F.cl +++ b/ocml/src/rlen4F.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(rlen4)(float x, float y, float z, float w) { float a = BUILTIN_ABS_F32(x); diff --git a/ocml/src/rlen4H.cl b/ocml/src/rlen4H.cl index 9fb13359..5abb05f5 100644 --- a/ocml/src/rlen4H.cl +++ b/ocml/src/rlen4H.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(rlen4)(half x, half y, half z, half w) { float fx = (float)x; diff --git a/ocml/src/rootnH.cl b/ocml/src/rootnH.cl index d17abfc8..5bd94272 100644 --- a/ocml/src/rootnH.cl +++ b/ocml/src/rootnH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -PUREATTR INLINEATTR half2 +PUREATTR half2 MATH_MANGLE2(rootn)(half2 x, int2 ny) { return 
(half2)(MATH_MANGLE(rootn)(x.lo, ny.lo), MATH_MANGLE(rootn)(x.hi, ny.hi)); diff --git a/ocml/src/roundD.cl b/ocml/src/roundD.cl index e8281f8d..0bc2aedb 100644 --- a/ocml/src/roundD.cl +++ b/ocml/src/roundD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(round)(double x) { double t = BUILTIN_TRUNC_F64(x); diff --git a/ocml/src/roundF.cl b/ocml/src/roundF.cl index bbaf3e6d..2b98a223 100644 --- a/ocml/src/roundF.cl +++ b/ocml/src/roundF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(round)(float x) { float t = BUILTIN_TRUNC_F32(x); diff --git a/ocml/src/roundH.cl b/ocml/src/roundH.cl index 045f5d95..d735a7fb 100644 --- a/ocml/src/roundH.cl +++ b/ocml/src/roundH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(round)(half2 x) { half2 t = BUILTIN_TRUNC_2F16(x); @@ -18,7 +18,7 @@ MATH_MANGLE2(round)(half2 x) return t + o; } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(round)(half x) { half t = BUILTIN_TRUNC_F16(x); diff --git a/ocml/src/rsqrtD.cl b/ocml/src/rsqrtD.cl index d67127d4..5fd5d156 100644 --- a/ocml/src/rsqrtD.cl +++ b/ocml/src/rsqrtD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(rsqrt)(double x) { double y0 = BUILTIN_RSQRT_F64(x); diff --git a/ocml/src/rsqrtF.cl b/ocml/src/rsqrtF.cl index dc7df5fb..8349387f 100644 --- a/ocml/src/rsqrtF.cl +++ b/ocml/src/rsqrtF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -PUREATTR INLINEATTR float +PUREATTR float MATH_MANGLE(rsqrt)(float x) { if (DAZ_OPT()) { diff --git a/ocml/src/rsqrtH.cl b/ocml/src/rsqrtH.cl index ec5f9bed..ab42880e 100644 --- a/ocml/src/rsqrtH.cl +++ b/ocml/src/rsqrtH.cl @@ -9,7 +9,7 @@ CONSTATTR UGEN(rsqrt) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(rsqrt)(half x) { return BUILTIN_RSQRT_F16(x); diff --git a/ocml/src/scalbD.cl b/ocml/src/scalbD.cl index 5bfce8a7..cfe4caf3 100644 --- 
a/ocml/src/scalbD.cl +++ b/ocml/src/scalbD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(scalb)(double x, double y) { double t = BUILTIN_MIN_F64(BUILTIN_MAX_F64(y, -0x1.0p+20), 0x1.0p+20); diff --git a/ocml/src/scalbF.cl b/ocml/src/scalbF.cl index f957fb7b..05d95969 100644 --- a/ocml/src/scalbF.cl +++ b/ocml/src/scalbF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(scalb)(float x, float y) { float t = BUILTIN_CLAMP_F32(y, -0x1.0p+20f, 0x1.0p+20f); diff --git a/ocml/src/scalbH.cl b/ocml/src/scalbH.cl index 2d55c644..53b8cc8e 100644 --- a/ocml/src/scalbH.cl +++ b/ocml/src/scalbH.cl @@ -9,7 +9,7 @@ CONSTATTR BGEN(scalb) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(scalb)(half x, half y) { half t = BUILTIN_MIN_F16(BUILTIN_MAX_F16(y, -0x1.0p+6h), 0x1.0p+6h); diff --git a/ocml/src/scalbnD.cl b/ocml/src/scalbnD.cl index 350c47f9..07ecd541 100644 --- a/ocml/src/scalbnD.cl +++ b/ocml/src/scalbnD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(scalbn)(double x, int n) { return MATH_MANGLE(ldexp)(x, n); diff --git a/ocml/src/scalbnF.cl b/ocml/src/scalbnF.cl index 49f4e700..b0adcc1a 100644 --- a/ocml/src/scalbnF.cl +++ b/ocml/src/scalbnF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(scalbn)(float x, int n) { return MATH_MANGLE(ldexp)(x, n); diff --git a/ocml/src/scalbnH.cl b/ocml/src/scalbnH.cl index 5656013c..f9be702e 100644 --- a/ocml/src/scalbnH.cl +++ b/ocml/src/scalbnH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(scalbn)(half2 x, int2 n) { return (half2)(MATH_MANGLE(ldexp)(x.lo, n.lo), MATH_MANGLE(ldexp)(x.hi, n.hi)); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(scalbn)(half x, int n) { return MATH_MANGLE(ldexp)(x, n); diff --git a/ocml/src/signbitD.cl b/ocml/src/signbitD.cl index 3c93ca5b..98681e5d 
100644 --- a/ocml/src/signbitD.cl +++ b/ocml/src/signbitD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(signbit)(double x) { return AS_INT2(x).hi < 0; diff --git a/ocml/src/signbitF.cl b/ocml/src/signbitF.cl index 3ceec89c..e944a72b 100644 --- a/ocml/src/signbitF.cl +++ b/ocml/src/signbitF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(signbit)(float x) { return AS_INT(x) < 0; diff --git a/ocml/src/signbitH.cl b/ocml/src/signbitH.cl index e5fb9130..b5d99170 100644 --- a/ocml/src/signbitH.cl +++ b/ocml/src/signbitH.cl @@ -7,7 +7,7 @@ #include "mathH.h" -CONSTATTR INLINEATTR short2 +CONSTATTR short2 MATH_MANGLE2(signbit)(half2 x) { return (short2) @@ -15,7 +15,7 @@ MATH_MANGLE2(signbit)(half2 x) AS_SHORT(x.hi) < 0 ? (short)-1 : (short)0); } -CONSTATTR INLINEATTR int +CONSTATTR int MATH_MANGLE(signbit)(half x) { return AS_SHORT(x) < 0; diff --git a/ocml/src/sinD.cl b/ocml/src/sinD.cl index 7ec233bc..8f4464c2 100644 --- a/ocml/src/sinD.cl +++ b/ocml/src/sinD.cl @@ -8,17 +8,14 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR double +CONSTATTR double MATH_MANGLE(sin)(double x) { - double r, rr; - int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); - double cc; - double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc); - - int2 s = AS_INT2((regn & 1) == 0 ? ss : cc); - s.hi ^= (regn > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); + int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); + s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
AS_INT2(QNANBITPATT_DP64) : s; diff --git a/ocml/src/sinF.cl b/ocml/src/sinF.cl index fe6a75d8..c9059771 100644 --- a/ocml/src/sinF.cl +++ b/ocml/src/sinF.cl @@ -8,28 +8,22 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR float +float MATH_MANGLE(sin)(float x) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; -#if defined EXTRA_PRECISION - float r0, r1; - int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax)); + struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax)); - float cc; - float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc); +#if defined EXTRA_PRECISION + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); #else - float r; - int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax)); - - float cc; - float ss = MATH_PRIVATE(sincosred)(r, &cc); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif - float s = (regn & 1) != 0 ? cc : ss; - s = AS_FLOAT(AS_INT(s) ^ (regn > 1 ? 0x80000000 : 0) ^ (ix ^ ax)); + float s = (r.i & 1) != 0 ? sc.c : sc.s; + s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax)); if (!FINITE_ONLY_OPT()) { s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s; diff --git a/ocml/src/sinH.cl b/ocml/src/sinH.cl index 1c92458d..7c018cc3 100644 --- a/ocml/src/sinH.cl +++ b/ocml/src/sinH.cl @@ -10,17 +10,14 @@ UGEN(sin) -INLINEATTR half +half MATH_MANGLE(sin)(half x) { - half r; - short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); - half cc; - half ss = MATH_PRIVATE(sincosred)(r, &cc); - - short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc); - s ^= (i > (short)1 ? (short)0x8000 : 0) ^ (AS_SHORT(x) & (short)0x8000); + short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c); + s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
(short)QNANBITPATT_HP16 : s; diff --git a/ocml/src/sinbD.cl b/ocml/src/sinbD.cl index b300f031..c98a8fa2 100644 --- a/ocml/src/sinbD.cl +++ b/ocml/src/sinbD.cl @@ -24,31 +24,31 @@ L = __e; \ } while (0) -INLINEATTR double +double MATH_PRIVATE(sinb)(double x, int n, double p) { - double ph, pl, rh, rl, sh, sl; - int i = MATH_PRIVATE(trigred)(&rh, &rl, x); - bool b = rh < p; - i = (i - b - n) & 3; + struct redret r = MATH_PRIVATE(trigred)(x); + bool b = r.hi < p; + r.i = (r.i - b - n) & 3; // This is a properly signed extra precise pi/4 - ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0))); - pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0))); + double ph = AS_DOUBLE((uint2)(0x54442d18, 0xbfe921fb ^ (b ? 0x80000000 : 0))); + double pl = AS_DOUBLE((uint2)(0x33145c07, 0xbc81a626 ^ (b ? 0x80000000 : 0))); + + double sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); - FSUM2(ph, rh, sh, sl); - sl += pl + rl; + FSUM2(ph, r.hi, sh, sl); + sl += pl + r.lo; FSUM2(sh, sl, sh, sl); - double cc; - double ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc); + struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); - int2 s = AS_INT2((i & 1) == 0 ? ss : cc); - s.hi ^= i > 1 ? 0x80000000 : 0; + int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); + s.hi ^= r.i > 1 ? 0x80000000 : 0; return AS_DOUBLE(s); } diff --git a/ocml/src/sinbF.cl b/ocml/src/sinbF.cl index 9e26d0b6..cdc139be 100644 --- a/ocml/src/sinbF.cl +++ b/ocml/src/sinbF.cl @@ -24,41 +24,36 @@ L = __e; \ } while (0) -INLINEATTR float +float MATH_PRIVATE(sinb)(float x, int n, float p) { + struct redret r = MATH_PRIVATE(trigred)(x); + bool b = r.hi < p; + r.i = (r.i - b - n) & 3; + #if defined EXTRA_PRECISION - float ph, pl, rh, rl, sh, sl; - int i = MATH_PRIVATE(trigred)(&rh, &rl, x); - bool b = rh < p; - i = (i - b - n) & 3; + float ph = AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); + float pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0)); - ph = AS_FLOAT(0xbf490fdb ^ (b ? 
0x80000000 : 0)); - pl = AS_FLOAT(0x32bbbd2e ^ (b ? 0x80000000 : 0)); + float sh, sl; FDIF2(ph, p, ph, sl); pl += sl; FSUM2(ph, pl, ph, pl); - FSUM2(ph, rh, sh, sl); - sl += pl + rl; + FSUM2(ph, r.hi, sh, sl); + sl += pl + r.lo; FSUM2(sh, sl, sh, sl); - float cc; - float ss = MATH_PRIVATE(sincosred2)(sh, sl, &cc); + struct scret sc = MATH_PRIVATE(sincosred2)(sh, sl); #else - float r; - int i = MATH_PRIVATE(trigred)(&r, x); - bool b = r < p; - i = (i - b - n) & 3; - r = r - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); + r.hi = r.hi - p + AS_FLOAT(0xbf490fdb ^ (b ? 0x80000000 : 0)); - float cc; - float ss = MATH_PRIVATE(sincosred)(r, &cc); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif - float s = (i & 1) != 0 ? cc : ss; - s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0)); + float s = (r.i & 1) != 0 ? sc.c : sc.s; + s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0)); return s; } diff --git a/ocml/src/sincosD.cl b/ocml/src/sincosD.cl index de851c34..891d083d 100644 --- a/ocml/src/sincosD.cl +++ b/ocml/src/sincosD.cl @@ -8,22 +8,19 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR double +double MATH_MANGLE(sincos)(double x, __private double * cp) { - double r, rr; - int regn = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); - double cc; - double ss = MATH_PRIVATE(sincosred2)(r, rr, &cc); + int flip = r.i > 1 ? (int)0x80000000 : 0; + bool odd = (r.i & 1) != 0; - int flip = regn > 1 ? (int)0x80000000 : 0; - bool odd = (regn & 1) != 0; - - int2 s = AS_INT2(odd ? cc : ss); + int2 s = AS_INT2(odd ? sc.c : sc.s); s.hi ^= flip ^ (AS_INT2(x).hi &(int)0x80000000); - ss = -ss; - int2 c = AS_INT2(odd ? ss : cc); + sc.s = -sc.s; + int2 c = AS_INT2(odd ? 
sc.s : sc.c); c.hi ^= flip; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincosF.cl b/ocml/src/sincosF.cl index 1baa857f..123b4595 100644 --- a/ocml/src/sincosF.cl +++ b/ocml/src/sincosF.cl @@ -8,32 +8,26 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR float +float MATH_MANGLE(sincos)(float x, __private float *cp) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; -#if defined EXTRA_PRECISION - float r0, r1; - int regn = MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax)); + struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax)); - float cc; - float ss = MATH_PRIVATE(sincosred2)(r0, r1, &cc); +#if defined EXTRA_PRECISION + struct scret sc = MATH_PRIVATE(sincosred2)(r.hi, r.lo); #else - float r; - int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax)); - - float cc; - float ss = MATH_PRIVATE(sincosred)(r, &cc); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); #endif - int flip = regn > 1 ? 0x80000000 : 0; - bool odd = (regn & 1) != 0; - float s = odd ? cc : ss; + int flip = r.i > 1 ? 0x80000000 : 0; + bool odd = (r.i & 1) != 0; + float s = odd ? sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix)); - ss = -ss; - float c = odd ? ss : cc; + sc.s = -sc.s; + float c = odd ? sc.s : sc.c; c = AS_FLOAT(AS_INT(c) ^ flip); if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincosH.cl b/ocml/src/sincosH.cl index 43a35c6b..bdf62827 100644 --- a/ocml/src/sincosH.cl +++ b/ocml/src/sincosH.cl @@ -8,7 +8,7 @@ #include "mathH.h" #include "trigredH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(sincos)(half2 x, __private half2 *cp) { half2 s; @@ -19,21 +19,18 @@ MATH_MANGLE2(sincos)(half2 x, __private half2 *cp) return s; } -INLINEATTR half +CONSTATTR half MATH_MANGLE(sincos)(half x, __private half *cp) { - half r; - short regn = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincosred)(r.hi); - half cc; - half ss = MATH_PRIVATE(sincosred)(r, &cc); - - short flip = regn > (short)1 ? 
(short)0x8000 : (short)0; - bool odd = (regn & 1) != 0; - short s = AS_SHORT(odd ? cc : ss); + short flip = r.i > (short)1 ? (short)0x8000 : (short)0; + bool odd = (r.i & (short)1) != (short)0; + short s = AS_SHORT(odd ? sc.c : sc.s); s ^= flip ^ (AS_SHORT(x) & (short)0x8000); - ss = -ss; - short c = AS_SHORT(odd ? ss : cc); + sc.s = -sc.s; + short c = AS_SHORT(odd ? sc.s : sc.c); c ^= flip; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincospiD.cl b/ocml/src/sincospiD.cl index 1b92e61a..4ede0cc7 100644 --- a/ocml/src/sincospiD.cl +++ b/ocml/src/sincospiD.cl @@ -8,22 +8,19 @@ #include "mathD.h" #include "trigpiredD.h" -INLINEATTR double +double MATH_MANGLE(sincospi)(double x, __private double * cp) { - double t; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - double cc; - double ss = MATH_PRIVATE(sincospired)(t, &cc); + int flip = r.i > 1 ? (int)0x80000000 : 0; + bool odd = (r.i & 1) != 0; - int flip = i > 1 ? (int)0x80000000 : 0; - bool odd = (i & 1) != 0; - - int2 s = AS_INT2(odd ? cc : ss); + int2 s = AS_INT2(odd ? sc.c : sc.s); s.hi ^= flip ^ (AS_INT2(x).hi & 0x80000000); - ss = -ss; - int2 c = AS_INT2(odd ? ss : cc); + sc.s = -sc.s; + int2 c = AS_INT2(odd ? sc.s : sc.c); c.hi ^= flip; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincospiF.cl b/ocml/src/sincospiF.cl index af3528ce..9585bb42 100644 --- a/ocml/src/sincospiF.cl +++ b/ocml/src/sincospiF.cl @@ -8,24 +8,21 @@ #include "mathF.h" #include "trigpiredF.h" -INLINEATTR float +float MATH_MANGLE(sincospi)(float x, __private float *cp) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; - float t; - int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &t); + struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - float cc; - float ss = MATH_PRIVATE(sincospired)(t, &cc); - - int flip = i > 1 ? 
0x80000000 : 0; - bool odd = (i & 1) != 0; - float s = odd ? cc : ss; + int flip = r.i > 1 ? 0x80000000 : 0; + bool odd = (r.i & 1) != 0; + float s = odd ? sc.c : sc.s; s = AS_FLOAT(AS_INT(s) ^ flip ^ (ax ^ ix)); - ss = -ss; - float c = odd ? ss : cc; + sc.s = -sc.s; + float c = odd ? sc.s : sc.c; c = AS_FLOAT(AS_INT(c) ^ flip); if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincospiH.cl b/ocml/src/sincospiH.cl index cba66af7..78249533 100644 --- a/ocml/src/sincospiH.cl +++ b/ocml/src/sincospiH.cl @@ -8,7 +8,7 @@ #include "mathH.h" #include "trigpiredH.h" -INLINEATTR half2 +half2 MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp) { half2 s; @@ -20,22 +20,18 @@ MATH_MANGLE2(sincospi)(half2 x, __private half2 *cp) return s; } -INLINEATTR half +half MATH_MANGLE(sincospi)(half x, __private half *cp) { - half t; - short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - half cc; - half ss = MATH_PRIVATE(sincospired)(t, &cc); - - short flip = i > (short)1 ? (short)0x8000 : (short)0; - bool odd = (i & (short)1) != (short)0; - - short s = AS_SHORT(odd ? cc : ss); + short flip = r.i > (short)1 ? (short)0x8000 : (short)0; + bool odd = (r.i & (short)1) != (short)0; + short s = AS_SHORT(odd ? sc.c : sc.s); s ^= flip ^ (AS_SHORT(x) & (short)0x8000); - ss = -ss; - short c = AS_SHORT(odd ? ss : cc); + sc.s = -sc.s; + short c = AS_SHORT(odd ? 
sc.s : sc.c); c ^= flip; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/sincospiredD.cl b/ocml/src/sincospiredD.cl index 5200346a..aae84504 100644 --- a/ocml/src/sincospiredD.cl +++ b/ocml/src/sincospiredD.cl @@ -6,11 +6,11 @@ *===------------------------------------------------------------------------*/ #include "mathD.h" +#include "trigpiredD.h" -INLINEATTR double -MATH_PRIVATE(sincospired)(double x, __private double *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincospired)(double x) { - double t = x * x; double sx = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, @@ -26,7 +26,9 @@ MATH_PRIVATE(sincospired)(double x, __private double *cp) -0x1.55d3c7e3c325bp+0), 0x1.03c1f081b5a67p+2), -0x1.3bd3cc9be45dep+2); cx = MATH_MAD(t, cx, 1.0); - *cp = cx; - return sx; + struct scret ret; + ret.c = cx; + ret.s = sx; + return ret; } diff --git a/ocml/src/sincospiredF.cl b/ocml/src/sincospiredF.cl index 786036a1..ac164a17 100644 --- a/ocml/src/sincospiredF.cl +++ b/ocml/src/sincospiredF.cl @@ -6,9 +6,10 @@ *===------------------------------------------------------------------------*/ #include "mathF.h" +#include "trigredF.h" -INLINEATTR float -MATH_PRIVATE(sincospired)(float x, __private float *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincospired)(float x) { float t = x * x; @@ -23,7 +24,9 @@ MATH_PRIVATE(sincospired)(float x, __private float *cp) -0x1.3bd3ccp+2f); cx = MATH_MAD(t, cx, 1.0f); - *cp = cx; - return sx; + struct scret ret; + ret.c = cx; + ret.s = sx; + return ret; } diff --git a/ocml/src/sincospiredH.cl b/ocml/src/sincospiredH.cl index 28a0fa7a..33a13ab0 100644 --- a/ocml/src/sincospiredH.cl +++ b/ocml/src/sincospiredH.cl @@ -6,11 +6,11 @@ *===------------------------------------------------------------------------*/ #include "mathH.h" +#include "trigpiredH.h" -INLINEATTR half -MATH_PRIVATE(sincospired)(half x, __private half *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincospired)(half x) { - half t = x * x; half sx = MATH_MAD(t, 0x1.b84p+0h, -0x1.46cp+2h); @@ 
-20,7 +20,9 @@ MATH_PRIVATE(sincospired)(half x, __private half *cp) half cx = MATH_MAD(t, 0x1.fbp+1h, -0x1.3bcp+2h); cx = MATH_MAD(t, cx, 1.0h); - *cp = cx; - return sx; + struct scret ret; + ret.c = cx; + ret.s = sx; + return ret; } diff --git a/ocml/src/sincosred2D.cl b/ocml/src/sincosred2D.cl index 800c1021..3d8c487d 100644 --- a/ocml/src/sincosred2D.cl +++ b/ocml/src/sincosred2D.cl @@ -6,9 +6,10 @@ *===------------------------------------------------------------------------*/ #include "mathD.h" +#include "trigredD.h" -INLINEATTR double -MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincosred2)(double x, double y) { const double S0 = -0x1.5555555555555p-3; const double S1 = 0x1.1111111110bb3p-7; @@ -35,7 +36,9 @@ MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp) double sxy = MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1); sxy = x - MATH_MAD(-x3, S0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5*y), -y)); - *cp = cxy; - return sxy; + struct scret ret; + ret.c = cxy; + ret.s = sxy; + return ret; } diff --git a/ocml/src/sincosred2F.cl b/ocml/src/sincosred2F.cl index 36767e53..16cd8fde 100644 --- a/ocml/src/sincosred2F.cl +++ b/ocml/src/sincosred2F.cl @@ -6,9 +6,10 @@ *===------------------------------------------------------------------------*/ #include "mathF.h" +#include "trigredF.h" -INLINEATTR float -MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincosred2)(float x, float y) { const float c0 = 0x1.555556p-5f; const float c1 = -0x1.6c16b2p-10f; @@ -32,7 +33,9 @@ MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp) float sxy = MATH_MAD(x2, MATH_MAD(x2, s3, s2), s1); sxy = x - MATH_MAD(-x3, s0, MATH_MAD(x2, MATH_MAD(-x3, sxy, 0.5f*y), -y)); - *cp = cxy; - return sxy; + struct scret ret; + ret.c = cxy; + ret.s = sxy; + return ret; } diff --git a/ocml/src/sincosredD.cl b/ocml/src/sincosredD.cl index 
ed64d24b..4418d623 100644 --- a/ocml/src/sincosredD.cl +++ b/ocml/src/sincosredD.cl @@ -6,9 +6,10 @@ *===------------------------------------------------------------------------*/ #include "mathD.h" +#include "trigredD.h" -INLINEATTR double -MATH_PRIVATE(sincosred)(double x, __private double *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincosred)(double x) { const double S0 = -0x1.5555555555555p-3; const double S1 = 0x1.1111111110bb3p-7; @@ -33,7 +34,9 @@ MATH_PRIVATE(sincosred)(double x, __private double *cp) double cx = t + MATH_MAD(x2*x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, C5, C4), C3), C2), C1), C0), v); double sx = MATH_MAD(x2*x, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, MATH_MAD(x2, S5, S4), S3), S2), S1), S0), x); - *cp = cx; - return sx; + struct scret ret; + ret.c = cx; + ret.s = sx; + return ret; } diff --git a/ocml/src/sincosredF.cl b/ocml/src/sincosredF.cl index e4d2cfd5..54167c47 100644 --- a/ocml/src/sincosredF.cl +++ b/ocml/src/sincosredF.cl @@ -8,8 +8,8 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR float -MATH_PRIVATE(sincosred)(float x, __private float *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincosred)(float x) { float t = x * x; @@ -17,7 +17,9 @@ MATH_PRIVATE(sincosred)(float x, __private float *cp) float c = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, 0x1.aea668p-16f, -0x1.6c9e76p-10f), 0x1.5557eep-5f), -0x1.000008p-1f), 1.0f); - *cp = c; - return s; + struct scret ret; + ret.c = c; + ret.s = s; + return ret; } diff --git a/ocml/src/sincosredH.cl b/ocml/src/sincosredH.cl index a3ffec57..0dd4b17d 100644 --- a/ocml/src/sincosredH.cl +++ b/ocml/src/sincosredH.cl @@ -8,14 +8,16 @@ #include "mathH.h" #include "trigredH.h" -INLINEATTR half -MATH_PRIVATE(sincosred)(half x, __private half *cp) +CONSTATTR struct scret +MATH_PRIVATE(sincosred)(half x) { half t = x * x; half s = MATH_MAD(x, t*MATH_MAD(t, 0x1.0bp-7h, -0x1.554p-3h), x); half c = MATH_MAD(t, MATH_MAD(t, 0x1.4b4p-5h, -0x1.ffcp-2h), 1.0h); - 
*cp = c; - return s; + struct scret ret; + ret.c = c; + ret.s = s; + return ret; } diff --git a/ocml/src/sinhD.cl b/ocml/src/sinhD.cl index 7d377385..0bab018b 100644 --- a/ocml/src/sinhD.cl +++ b/ocml/src/sinhD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(sinh)(double x) { double y = BUILTIN_ABS_F64(x); diff --git a/ocml/src/sinhF.cl b/ocml/src/sinhF.cl index 5718e06e..9ea55fc9 100644 --- a/ocml/src/sinhF.cl +++ b/ocml/src/sinhF.cl @@ -12,7 +12,7 @@ extern CONSTATTR float2 MATH_PRIVATE(epexpep)(float2 x); -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(sinh)(float x) { float y = BUILTIN_ABS_F32(x); diff --git a/ocml/src/sinhH.cl b/ocml/src/sinhH.cl index 92954661..c3ab5ed7 100644 --- a/ocml/src/sinhH.cl +++ b/ocml/src/sinhH.cl @@ -9,7 +9,7 @@ PUREATTR UGEN(sinh) -PUREATTR INLINEATTR half +PUREATTR half MATH_MANGLE(sinh)(half hx) { float x = (float)hx * 0x1.715476p+0f; diff --git a/ocml/src/sinpiD.cl b/ocml/src/sinpiD.cl index 5393c792..ab208901 100644 --- a/ocml/src/sinpiD.cl +++ b/ocml/src/sinpiD.cl @@ -8,17 +8,14 @@ #include "mathD.h" #include "trigpiredD.h" -INLINEATTR double +double MATH_MANGLE(sinpi)(double x) { - double t; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - double cc; - double ss = MATH_PRIVATE(sincospired)(t, &cc); - - int2 s = AS_INT2((i & 1) == 0 ? ss : cc); - s.hi ^= (i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); + int2 s = AS_INT2((r.i & 1) == 0 ? sc.s : sc.c); + s.hi ^= (r.i > 1 ? 0x80000000 : 0) ^ (AS_INT2(x).hi & 0x80000000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F64(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
AS_INT2(QNANBITPATT_DP64) : s; diff --git a/ocml/src/sinpiF.cl b/ocml/src/sinpiF.cl index 50fa9a44..2a50553a 100644 --- a/ocml/src/sinpiF.cl +++ b/ocml/src/sinpiF.cl @@ -8,20 +8,16 @@ #include "mathF.h" #include "trigpiredF.h" -INLINEATTR float +float MATH_MANGLE(sinpi)(float x) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; + struct redret r = MATH_PRIVATE(trigpired)(AS_FLOAT(ax)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - float r; - int i = MATH_PRIVATE(trigpired)(AS_FLOAT(ax), &r); - - float cc; - float ss = MATH_PRIVATE(sincospired)(r, &cc); - - float s = (i & 1) == 0 ? ss : cc; - s = AS_FLOAT(AS_INT(s) ^ (i > 1 ? 0x80000000 : 0) ^ (ix ^ ax)); + float s = (r.i & 1) == 0 ? sc.s : sc.c; + s = AS_FLOAT(AS_INT(s) ^ (r.i > 1 ? 0x80000000 : 0) ^ (ix ^ ax)); if (!FINITE_ONLY_OPT()) { s = ax >= PINFBITPATT_SP32 ? AS_FLOAT(QNANBITPATT_SP32) : s; diff --git a/ocml/src/sinpiH.cl b/ocml/src/sinpiH.cl index a429ec5b..c738c222 100644 --- a/ocml/src/sinpiH.cl +++ b/ocml/src/sinpiH.cl @@ -10,17 +10,14 @@ UGEN(sinpi) -INLINEATTR half +half MATH_MANGLE(sinpi)(half x) { - half t; - short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &t); + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); + struct scret sc = MATH_PRIVATE(sincospired)(r.hi); - half cc; - half ss = MATH_PRIVATE(sincospired)(t, &cc); - - short s = AS_SHORT((i & (short)1) == (short)0 ? ss : cc); - s ^= (i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); + short s = AS_SHORT((r.i & (short)1) == (short)0 ? sc.s : sc.c); + s ^= (r.i > (short)1 ? (short)0x8000 : (short)0) ^ (AS_SHORT(x) & (short)0x8000); if (!FINITE_ONLY_OPT()) { s = BUILTIN_CLASS_F16(x, CLASS_SNAN|CLASS_QNAN|CLASS_NINF|CLASS_PINF) ? 
(short)QNANBITPATT_HP16 : s; diff --git a/ocml/src/sqrtD.cl b/ocml/src/sqrtD.cl index a68f7bd0..6f484fab 100644 --- a/ocml/src/sqrtD.cl +++ b/ocml/src/sqrtD.cl @@ -7,27 +7,21 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(sqrt)(double x) { return MATH_SQRT(x); } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double x) \ +MATH_MANGLE(LN)(double x) \ { \ - return BUILTIN_FULL_UNARY(fsqrt, false, ROUND, x); \ + return BUILTIN_##UN##_F64(x); \ } -GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN) -GEN(sqrt_rtp, ROUND_TO_POSINF) -GEN(sqrt_rtn, ROUND_TO_NEGINF) -GEN(sqrt_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sqrt_rte,SQRT_RTE) +GEN(sqrt_rtn,SQRT_RTN) +GEN(sqrt_rtp,SQRT_RTP) +GEN(sqrt_rtz,SQRT_RTZ) diff --git a/ocml/src/sqrtF.cl b/ocml/src/sqrtF.cl index dbf495c5..051e73b6 100644 --- a/ocml/src/sqrtF.cl +++ b/ocml/src/sqrtF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(sqrt)(float x) { if (CORRECTLY_ROUNDED_SQRT32()) { @@ -17,27 +17,15 @@ MATH_MANGLE(sqrt)(float x) } } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float x) \ +MATH_MANGLE(LN)(float x) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_UNARY(fsqrtf, true, ROUND, x); \ - } else { \ - ret = BUILTIN_FULL_UNARY(fsqrtf, false, ROUND, x); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(x); \ } -GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN) -GEN(sqrt_rtp, ROUND_TO_POSINF) -GEN(sqrt_rtn, ROUND_TO_NEGINF) -GEN(sqrt_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sqrt_rte,SQRT_RTE) +GEN(sqrt_rtn,SQRT_RTN) +GEN(sqrt_rtp,SQRT_RTP) +GEN(sqrt_rtz,SQRT_RTZ) diff --git a/ocml/src/sqrtH.cl b/ocml/src/sqrtH.cl index 3c663887..b4488e4e 100644 --- a/ocml/src/sqrtH.cl +++ 
b/ocml/src/sqrtH.cl @@ -9,27 +9,21 @@ CONSTATTR UGEN(sqrt) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(sqrt)(half x) { return BUILTIN_SQRT_F16(x); } -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half x) \ +MATH_MANGLE(LN)(half x) \ { \ - return BUILTIN_FULL_UNARY(fsqrth, false, ROUND, x); \ + return BUILTIN_##UN##_F16(x); \ } -GEN(sqrt_rte, ROUND_TO_NEAREST_EVEN) -GEN(sqrt_rtp, ROUND_TO_POSINF) -GEN(sqrt_rtn, ROUND_TO_NEGINF) -GEN(sqrt_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sqrt_rte,SQRT_RTE) +GEN(sqrt_rtn,SQRT_RTN) +GEN(sqrt_rtp,SQRT_RTP) +GEN(sqrt_rtz,SQRT_RTZ) diff --git a/ocml/src/subD.cl b/ocml/src/subD.cl index beda1a10..f6c9a92b 100644 --- a/ocml/src/subD.cl +++ b/ocml/src/subD.cl @@ -7,21 +7,15 @@ #include "mathD.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR double \ -MATH_MANGLE(NAME)(double x, double y) \ +MATH_MANGLE(LN)(double x, double y) \ { \ - return BUILTIN_FULL_BINARY(fsub, false, ROUND, x, y); \ + return BUILTIN_##UN##_F64(x, y); \ } -GEN(sub_rte, ROUND_TO_NEAREST_EVEN) -GEN(sub_rtp, ROUND_TO_POSINF) -GEN(sub_rtn, ROUND_TO_NEGINF) -GEN(sub_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sub_rte,SUB_RTE) +GEN(sub_rtn,SUB_RTN) +GEN(sub_rtp,SUB_RTP) +GEN(sub_rtz,SUB_RTZ) diff --git a/ocml/src/subF.cl b/ocml/src/subF.cl index 30664d6c..80d7d3c7 100644 --- a/ocml/src/subF.cl +++ b/ocml/src/subF.cl @@ -7,27 +7,15 @@ #include "mathF.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR float \ -MATH_MANGLE(NAME)(float x, float y) \ +MATH_MANGLE(LN)(float x, float y) \ { \ - float ret; \ - if (DAZ_OPT()) { \ - ret = BUILTIN_FULL_BINARY(fsubf, true, ROUND, x, y); \ - } else { \ - ret = BUILTIN_FULL_BINARY(fsubf, false, ROUND, 
x, y); \ - } \ - return ret; \ + return BUILTIN_##UN##_F32(x, y); \ } -GEN(sub_rte, ROUND_TO_NEAREST_EVEN) -GEN(sub_rtp, ROUND_TO_POSINF) -GEN(sub_rtn, ROUND_TO_NEGINF) -GEN(sub_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sub_rte,SUB_RTE) +GEN(sub_rtn,SUB_RTN) +GEN(sub_rtp,SUB_RTP) +GEN(sub_rtz,SUB_RTZ) diff --git a/ocml/src/subH.cl b/ocml/src/subH.cl index 6ca8e24b..369792e3 100644 --- a/ocml/src/subH.cl +++ b/ocml/src/subH.cl @@ -7,21 +7,15 @@ #include "mathH.h" -#if defined ENABLE_ROUNDED -#if defined HSAIL_BUILD - -#define GEN(NAME,ROUND)\ +#define GEN(LN,UN) \ CONSTATTR INLINEATTR half \ -MATH_MANGLE(NAME)(half x, half y) \ +MATH_MANGLE(LN)(half x, half y) \ { \ - return BUILTIN_FULL_BINARY(fsubh, false, ROUND, x, y); \ + return BUILTIN_##UN##_F16(x, y); \ } -GEN(sub_rte, ROUND_TO_NEAREST_EVEN) -GEN(sub_rtp, ROUND_TO_POSINF) -GEN(sub_rtn, ROUND_TO_NEGINF) -GEN(sub_rtz, ROUND_TO_ZERO) - -#endif // HSAIL_BUILD -#endif // ENABLE_ROUNDED +GEN(sub_rte,SUB_RTE) +GEN(sub_rtn,SUB_RTN) +GEN(sub_rtp,SUB_RTP) +GEN(sub_rtz,SUB_RTZ) diff --git a/ocml/src/tanD.cl b/ocml/src/tanD.cl index 442aa20d..0a3193d3 100644 --- a/ocml/src/tanD.cl +++ b/ocml/src/tanD.cl @@ -8,13 +8,12 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR double +CONSTATTR double MATH_MANGLE(tan)(double x) { - double r, rr; - int i = MATH_PRIVATE(trigred)(&r, &rr, BUILTIN_ABS_F64(x)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F64(x)); - int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r, rr, i & 1)); + int2 t = AS_INT2(MATH_PRIVATE(tanred2)(r.hi, r.lo, r.i & 1)); t.hi ^= AS_INT2(x).hi & (int)0x80000000; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/tanF.cl b/ocml/src/tanF.cl index 81698c4d..efe22a75 100644 --- a/ocml/src/tanF.cl +++ b/ocml/src/tanF.cl @@ -8,22 +8,18 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR float +float MATH_MANGLE(tan)(float x) { int ix = AS_INT(x); int ax = ix & 0x7fffffff; -#if defined EXTRA_PRECISION - float r0, r1; - int regn = 
MATH_PRIVATE(trigred)(&r0, &r1, AS_FLOAT(ax)); + struct redret r = MATH_PRIVATE(trigred)(AS_FLOAT(ax)); - float t = MATH_PRIVATE(tanred)(r0 + r1, regn & 1); +#if defined EXTRA_PRECISION + float t = MATH_PRIVATE(tanred)(r.hi + r.lo, r.i & 1); #else - float r; - int regn = MATH_PRIVATE(trigred)(&r, AS_FLOAT(ax)); - - float t = MATH_PRIVATE(tanred)(r, regn & 1); + float t = MATH_PRIVATE(tanred)(r.hi, r.i & 1); #endif t = AS_FLOAT(AS_INT(t) ^ (ix ^ ax)); diff --git a/ocml/src/tanH.cl b/ocml/src/tanH.cl index 201b2c79..36d91d3c 100644 --- a/ocml/src/tanH.cl +++ b/ocml/src/tanH.cl @@ -10,13 +10,11 @@ UGEN(tan) -INLINEATTR half +half MATH_MANGLE(tan)(half x) { - half r; - short i = MATH_PRIVATE(trigred)(&r, BUILTIN_ABS_F16(x)); - - short t = AS_SHORT(MATH_PRIVATE(tanred)(r, i & 1)); + struct redret r = MATH_PRIVATE(trigred)(BUILTIN_ABS_F16(x)); + short t = AS_SHORT(MATH_PRIVATE(tanred)(r.hi, r.i & (short)1)); t ^= AS_SHORT(x) & (short)0x8000; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/tanhD.cl b/ocml/src/tanhD.cl index 834e397e..e0c896d9 100644 --- a/ocml/src/tanhD.cl +++ b/ocml/src/tanhD.cl @@ -12,7 +12,7 @@ extern CONSTATTR double2 MATH_PRIVATE(epexpep)(double2 x); -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(tanh)(double x) { double y = BUILTIN_ABS_F64(x); diff --git a/ocml/src/tanpiD.cl b/ocml/src/tanpiD.cl index a55fff6f..90c746ef 100644 --- a/ocml/src/tanpiD.cl +++ b/ocml/src/tanpiD.cl @@ -8,14 +8,12 @@ #include "mathD.h" #include "trigpiredD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(tanpi)(double x) { - double r; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x), &r); - - int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r, i & 1)); - t.hi ^= (((i == 1) | (i == 2)) & (r == 0.0)) ? 0x80000000 : 0; + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F64(x)); + int2 t = AS_INT2(MATH_PRIVATE(tanpired)(r.hi, r.i & 1)); + t.hi ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0)) ? 
0x80000000 : 0; t.hi ^= AS_INT2(x).hi & (int)0x80000000; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/tanpiF.cl b/ocml/src/tanpiF.cl index fc188bc3..a13b9143 100644 --- a/ocml/src/tanpiF.cl +++ b/ocml/src/tanpiF.cl @@ -8,14 +8,12 @@ #include "mathF.h" #include "trigpiredF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(tanpi)(float x) { - float r; - int i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x), &r); - - int t = AS_INT(MATH_PRIVATE(tanpired)(r, i & 1)); - t ^= (((i == 1) | (i == 2)) & (r == 0.0f)) ? (int)0x80000000 : 0; + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F32(x)); + int t = AS_INT(MATH_PRIVATE(tanpired)(r.hi, r.i & 1)); + t ^= (((r.i == 1) | (r.i == 2)) & (r.hi == 0.0f)) ? (int)0x80000000 : 0; t ^= AS_INT(x) & (int)0x80000000; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/tanpiH.cl b/ocml/src/tanpiH.cl index a36e97c0..b0571ba5 100644 --- a/ocml/src/tanpiH.cl +++ b/ocml/src/tanpiH.cl @@ -10,14 +10,12 @@ CONSTATTR UGEN(tanpi) -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(tanpi)(half x) { - half r; - short i = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x), &r); - - short t = AS_SHORT(MATH_PRIVATE(tanpired)(r, i & (short)1)); - t ^= (((i == (short)1) | (i == (short)2)) & (r == 0.0h)) ? (short)0x8000 : (short)0; + struct redret r = MATH_PRIVATE(trigpired)(BUILTIN_ABS_F16(x)); + short t = AS_SHORT(MATH_PRIVATE(tanpired)(r.hi, r.i & (short)1)); + t ^= (((r.i == (short)1) | (r.i == (short)2)) & (r.hi == 0.0h)) ? 
(short)0x8000 : (short)0; t ^= AS_SHORT(x) & (short)0x8000; if (!FINITE_ONLY_OPT()) { diff --git a/ocml/src/tanpiredD.cl b/ocml/src/tanpiredD.cl index 5d877733..ecedafd2 100644 --- a/ocml/src/tanpiredD.cl +++ b/ocml/src/tanpiredD.cl @@ -8,7 +8,7 @@ #include "mathD.h" #include "trigpiredD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i) { double s = x * x; diff --git a/ocml/src/tanpiredF.cl b/ocml/src/tanpiredF.cl index 25b2467b..96e63ad2 100644 --- a/ocml/src/tanpiredF.cl +++ b/ocml/src/tanpiredF.cl @@ -8,7 +8,7 @@ #include "mathF.h" #include "trigpiredF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i) { float s = x * x; diff --git a/ocml/src/tanpiredH.cl b/ocml/src/tanpiredH.cl index 221797f7..645f58a5 100644 --- a/ocml/src/tanpiredH.cl +++ b/ocml/src/tanpiredH.cl @@ -8,7 +8,7 @@ #include "mathH.h" #include "trigpiredH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i) { half s = x * x; diff --git a/ocml/src/tanred2D.cl b/ocml/src/tanred2D.cl index ae5d49c6..18dd4bf8 100644 --- a/ocml/src/tanred2D.cl +++ b/ocml/src/tanred2D.cl @@ -7,7 +7,7 @@ #include "mathD.h" -INLINEATTR CONSTATTR double +CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel) { const double piby4_lead = 0x1.921fb54442d18p-1; diff --git a/ocml/src/tanredF.cl b/ocml/src/tanredF.cl index 0bb6744c..b1a196cc 100644 --- a/ocml/src/tanredF.cl +++ b/ocml/src/tanredF.cl @@ -8,7 +8,7 @@ #include "mathF.h" #include "trigredF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_PRIVATE(tanred)(float x, int i) { float s = x * x; diff --git a/ocml/src/tanredH.cl b/ocml/src/tanredH.cl index bade03c2..b11844f2 100644 --- a/ocml/src/tanredH.cl +++ b/ocml/src/tanredH.cl @@ -8,7 +8,7 @@ #include "mathH.h" #include "trigredH.h" -CONSTATTR INLINEATTR half +CONSTATTR half MATH_PRIVATE(tanred)(half x, short i) { half s = x * x; diff --git a/ocml/src/tgammaH.cl b/ocml/src/tgammaH.cl index 
07a72ef2..8ae01c2c 100644 --- a/ocml/src/tgammaH.cl +++ b/ocml/src/tgammaH.cl @@ -9,7 +9,7 @@ UGEN(tgamma) -INLINEATTR half +half MATH_MANGLE(tgamma)(half x) { return (half)MATH_UPMANGLE(tgamma)((float)x); diff --git a/ocml/src/trigpiredD.cl b/ocml/src/trigpiredD.cl index fddfef06..7bea3077 100644 --- a/ocml/src/trigpiredD.cl +++ b/ocml/src/trigpiredD.cl @@ -8,13 +8,16 @@ #include "mathD.h" #include "trigpiredD.h" -INLINEATTR int -MATH_PRIVATE(trigpired)(double x, __private double *r) +CONSTATTR struct redret +MATH_PRIVATE(trigpired)(double x) { double t = 2.0 * BUILTIN_FRACTION_F64(0.5 * x); x = x > 1.0 ? t : x; t = BUILTIN_RINT_F64(2.0 * x); - *r = MATH_MAD(t, -0.5, x); - return (int)t & 0x3; + + struct redret ret; + ret.hi = MATH_MAD(t, -0.5, x); + ret.i = (int)t & 0x3; + return ret; } diff --git a/ocml/src/trigpiredD.h b/ocml/src/trigpiredD.h index 1a464150..3d82c947 100644 --- a/ocml/src/trigpiredD.h +++ b/ocml/src/trigpiredD.h @@ -5,7 +5,17 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ -extern int MATH_PRIVATE(trigpired)(double x, __private double *r); -extern double MATH_PRIVATE(sincospired)(double x, __private double *cp); +struct redret { + double hi; + int i; +}; + +struct scret { + double s; + double c; +}; + +extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(double x); +extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(double x); extern CONSTATTR double MATH_PRIVATE(tanpired)(double x, int i); diff --git a/ocml/src/trigpiredF.cl b/ocml/src/trigpiredF.cl index ab2fa371..bcdc5727 100644 --- a/ocml/src/trigpiredF.cl +++ b/ocml/src/trigpiredF.cl @@ -8,13 +8,16 @@ #include "mathF.h" #include "trigpiredF.h" -INLINEATTR int -MATH_PRIVATE(trigpired)(float x, __private float *r) +CONSTATTR struct redret +MATH_PRIVATE(trigpired)(float x) { float t = 2.0f * BUILTIN_FRACTION_F32(0.5f * x); x = x > 1.0f ? 
t : x; t = BUILTIN_RINT_F32(2.0f * x); - *r = MATH_MAD(t, -0.5f, x); - return (int)t & 0x3; + + struct redret ret; + ret.hi = MATH_MAD(t, -0.5f, x); + ret.i = (int)t & 0x3; + return ret; } diff --git a/ocml/src/trigpiredF.h b/ocml/src/trigpiredF.h index 162544ec..f6727b5b 100644 --- a/ocml/src/trigpiredF.h +++ b/ocml/src/trigpiredF.h @@ -5,7 +5,17 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ -extern int MATH_PRIVATE(trigpired)(float x, __private float *r); -extern float MATH_PRIVATE(sincospired)(float x, __private float *cp); +struct redret { + float hi; + int i; +}; + +struct scret { + float s; + float c; +}; + +extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(float x); +extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(float x); extern CONSTATTR float MATH_PRIVATE(tanpired)(float x, int i); diff --git a/ocml/src/trigpiredH.cl b/ocml/src/trigpiredH.cl index b68d43e5..7615528f 100644 --- a/ocml/src/trigpiredH.cl +++ b/ocml/src/trigpiredH.cl @@ -8,13 +8,16 @@ #include "mathH.h" #include "trigpiredH.h" -INLINEATTR short -MATH_PRIVATE(trigpired)(half x, __private half *r) +CONSTATTR struct redret +MATH_PRIVATE(trigpired)(half x) { half t = 2.0h * BUILTIN_FRACTION_F16(0.5h * x); x = x > 1.0h ? t : x; t = BUILTIN_RINT_F16(2.0h * x); - *r = MATH_MAD(t, -0.5h, x); - return (short)t & (short)0x3; + + struct redret ret; + ret.hi = MATH_MAD(t, -0.5h, x); + ret.i = (short)t & (short)0x3; + return ret; } diff --git a/ocml/src/trigpiredH.h b/ocml/src/trigpiredH.h index 1294ebea..b2d240f5 100644 --- a/ocml/src/trigpiredH.h +++ b/ocml/src/trigpiredH.h @@ -5,7 +5,17 @@ * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ -extern short MATH_PRIVATE(trigpired)(half x, __private half *r); -extern half MATH_PRIVATE(sincospired)(half x, __private half *cp); +struct redret { + half hi; + short i; +}; + +struct scret { + half s; + half c; +}; + +extern CONSTATTR struct redret MATH_PRIVATE(trigpired)(half x); +extern CONSTATTR struct scret MATH_PRIVATE(sincospired)(half x); extern CONSTATTR half MATH_PRIVATE(tanpired)(half x, short i); diff --git a/ocml/src/trigredD.cl b/ocml/src/trigredD.cl index 60fc8b3f..c9700fd8 100644 --- a/ocml/src/trigredD.cl +++ b/ocml/src/trigredD.cl @@ -8,12 +8,12 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR int -MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x) +CONSTATTR struct redret +MATH_PRIVATE(trigred)(double x) { if (x < 0x1.0p+21) - return MATH_PRIVATE(trigredsmall)(r, rr, x); + return MATH_PRIVATE(trigredsmall)(x); else - return MATH_PRIVATE(trigredlarge)(r, rr, x); + return MATH_PRIVATE(trigredlarge)(x); } diff --git a/ocml/src/trigredD.h b/ocml/src/trigredD.h index 6dd96f67..26a9599d 100644 --- a/ocml/src/trigredD.h +++ b/ocml/src/trigredD.h @@ -5,12 +5,23 @@ * License. See LICENSE.TXT for details. 
*===------------------------------------------------------------------------*/ -extern int MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x); -extern int MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x); -extern int MATH_PRIVATE(trigred)(__private double *r, __private double *rr, double x); +struct redret { + double hi; + double lo; + int i; +}; -extern double MATH_PRIVATE(sincosred)(double x, __private double *cp); -extern double MATH_PRIVATE(sincosred2)(double x, double y, __private double *cp); +struct scret { + double s; + double c; +}; + +extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(double x); +extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(double x); +extern CONSTATTR struct redret MATH_PRIVATE(trigred)(double x); + +extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(double x); +extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(double x, double y); extern CONSTATTR double MATH_PRIVATE(tanred2)(double x, double xx, int sel); diff --git a/ocml/src/trigredF.cl b/ocml/src/trigredF.cl index c73a0fb7..20cbd39b 100644 --- a/ocml/src/trigredF.cl +++ b/ocml/src/trigredF.cl @@ -8,24 +8,12 @@ #include "mathF.h" #include "trigredF.h" -INLINEATTR int -#if defined EXTRA_PRECISION -MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x) -#else -MATH_PRIVATE(trigred)(__private float *r, float x) -#endif +CONSTATTR struct redret +MATH_PRIVATE(trigred)(float x) { if (x < SMALL_BOUND) -#if defined EXTRA_PRECISION - return MATH_PRIVATE(trigredsmall)(r, rr, x); -#else - return MATH_PRIVATE(trigredsmall)(r, x); -#endif + return MATH_PRIVATE(trigredsmall)(x); else -#if defined EXTRA_PRECISION - return MATH_PRIVATE(trigredlarge)(r, rr, x); -#else - return MATH_PRIVATE(trigredlarge)(r, x); -#endif + return MATH_PRIVATE(trigredlarge)(x); } diff --git a/ocml/src/trigredF.h b/ocml/src/trigredF.h index d7a81a1b..e0e50c93 100644 --- a/ocml/src/trigredF.h +++ b/ocml/src/trigredF.h 
@@ -8,18 +8,33 @@ #define SMALL_BOUND 0x1.0p+17f #if defined EXTRA_PRECISION -extern int MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x); -extern int MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x); -extern int MATH_PRIVATE(trigred)(__private float *r, __private float *rr, float x); +struct redret { + float hi; + float lo; + int i; +}; #else -extern int MATH_PRIVATE(trigredsmall)(__private float *r, float x); -extern int MATH_PRIVATE(trigredlarge)(__private float *r, float x); -extern int MATH_PRIVATE(trigred)(__private float *r, float x); +struct redret { + float hi; + int i; +}; #endif -extern float MATH_PRIVATE(sincosred2)(float x, float y, __private float *cp); +struct scret { + float s; + float c; +}; -extern float MATH_PRIVATE(sincosred)(float x, __private float *cp); +extern CONSTATTR struct redret MATH_PRIVATE(trigredsmall)(float x); +extern CONSTATTR struct redret MATH_PRIVATE(trigredlarge)(float x); +extern CONSTATTR struct redret MATH_PRIVATE(trigred)(float x); + + +#if defined EXTRA_PRECISION +extern CONSTATTR struct scret MATH_PRIVATE(sincosred2)(float x, float y); +#else +extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(float x); +#endif extern CONSTATTR float MATH_PRIVATE(tanred)(float x, int regn); diff --git a/ocml/src/trigredH.cl b/ocml/src/trigredH.cl index 5fcf39b1..ac75d51a 100644 --- a/ocml/src/trigredH.cl +++ b/ocml/src/trigredH.cl @@ -6,9 +6,10 @@ *===------------------------------------------------------------------------*/ #include "mathH.h" +#include "trigredH.h" -__attribute__((always_inline)) short -MATH_PRIVATE(trigred)(__private half *r, half hx) +CONSTATTR struct redret +MATH_PRIVATE(trigred)(half hx) { const float twobypi = 0x1.45f306p-1f; const float pb2_a = 0x1.92p+0f; @@ -18,8 +19,9 @@ MATH_PRIVATE(trigred)(__private half *r, half hx) float x = (float)hx; float fn = BUILTIN_RINT_F32(x * twobypi); - *r = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, 
BUILTIN_MAD_F32(fn, -pb2_a, x))); - - return (int)fn & 0x3; + struct redret ret; + ret.hi = (half)BUILTIN_MAD_F32(fn, -pb2_c, BUILTIN_MAD_F32(fn, -pb2_b, BUILTIN_MAD_F32(fn, -pb2_a, x))); + ret.i = (int)fn & 0x3; + return ret; } diff --git a/ocml/src/trigredH.h b/ocml/src/trigredH.h index 97764561..2f02b42a 100644 --- a/ocml/src/trigredH.h +++ b/ocml/src/trigredH.h @@ -5,7 +5,17 @@ * License. See LICENSE.TXT for details. *===------------------------------------------------------------------------*/ -extern short MATH_PRIVATE(trigred)(__private half *r, half x); -extern half MATH_PRIVATE(sincosred)(half x, __private half *cp); -extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short regn); +struct redret { + half hi; + short i; +}; + +struct scret { + half s; + half c; +}; + +extern CONSTATTR struct redret MATH_PRIVATE(trigred)(half x); +extern CONSTATTR struct scret MATH_PRIVATE(sincosred)(half x); +extern CONSTATTR half MATH_PRIVATE(tanred)(half x, short i); diff --git a/ocml/src/trigredlargeD.cl b/ocml/src/trigredlargeD.cl index caea8352..02804e0b 100644 --- a/ocml/src/trigredlargeD.cl +++ b/ocml/src/trigredlargeD.cl @@ -63,8 +63,8 @@ C3 += C2; \ } while (0) -int -MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x) +CONSTATTR struct redret +MATH_PRIVATE(trigredlarge)(double x) { // Scale x by relevant part of 2/pi double p2 = BUILTIN_TRIG_PREOP_F64(x, 0); @@ -106,9 +106,11 @@ MATH_PRIVATE(trigredlarge)(__private double *r, __private double *rr, double x) double rt = BUILTIN_FMA_F64(f1, pio2h, BUILTIN_FMA_F64(f2, pio2t, BUILTIN_FMA_F64(f2, pio2h, -rh))); FSUM2(rh, rt, rh, rt); - *r = rh; - *rr = rt; - return i & 0x3; + struct redret ret; + ret.hi = rh; + ret.lo = rt; + ret.i = i & 0x3; + return ret; } diff --git a/ocml/src/trigredlargeF.cl b/ocml/src/trigredlargeF.cl index dcb2057c..94ea8ae5 100644 --- a/ocml/src/trigredlargeF.cl +++ b/ocml/src/trigredlargeF.cl @@ -17,12 +17,8 @@ HI = BUILTIN_MULHI_U32(A, B); \ HI += LO < C -int -#if 
defined EXTRA_PRECISION -MATH_PRIVATE(trigredlarge)(__private float *r, __private float *rr, float x) -#else -MATH_PRIVATE(trigredlarge)(__private float *r, float x) -#endif +CONSTATTR struct redret +MATH_PRIVATE(trigredlarge)(float x) { int xe = (int)(AS_UINT(x) >> 23) - 127; uint xm = 0x00800000U | (AS_UINT(x) & 0x7fffffU); @@ -152,16 +148,18 @@ MATH_PRIVATE(trigredlarge)(__private float *r, float x) MATH_MAD(q0, pio2h, q1*pio2t); } + struct redret ret; #if defined EXTRA_PRECISION float t = rh + rt; rt = rt - (t - rh); - *r = t; - *rr = rt; + ret.hi = t; + ret.lo = rt; #else - *r = rh + rt; + ret.hi = rh + rt; #endif - return ((i >> 1) + (i & 1)) & 0x3; + ret.i = ((i >> 1) + (i & 1)) & 0x3; + return ret; } diff --git a/ocml/src/trigredsmallD.cl b/ocml/src/trigredsmallD.cl index 59f74c87..0cac73ef 100644 --- a/ocml/src/trigredsmallD.cl +++ b/ocml/src/trigredsmallD.cl @@ -8,8 +8,8 @@ #include "mathD.h" #include "trigredD.h" -INLINEATTR int -MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x) +CONSTATTR struct redret +MATH_PRIVATE(trigredsmall)(double x) { const double twobypi = 0x1.45f306dc9c883p-1; const double piby2_h = 0x1.921fb54442d18p+0; @@ -27,9 +27,10 @@ MATH_PRIVATE(trigredsmall)(__private double *r, __private double *rr, double x) double rh = yh + yt; double rt = yt - (rh - yh); - *r = rh; - *rr = rt; - - return (int)dn & 0x3; + struct redret ret; + ret.hi = rh; + ret.lo = rt; + ret.i = (int)dn & 0x3; + return ret; } diff --git a/ocml/src/trigredsmallF.cl b/ocml/src/trigredsmallF.cl index eaf2bc08..c93a2761 100644 --- a/ocml/src/trigredsmallF.cl +++ b/ocml/src/trigredsmallF.cl @@ -22,12 +22,8 @@ D = __t + (((C - __t) - __ph) - __pt); \ } while(0) -static inline int -#if defined EXTRA_PRECISION -mad_reduce(__private float *hi, __private float *lo, float x) -#else -mad_reduce(__private float *hi, float x) -#endif +static inline struct redret +mad_reduce(float x) { #if defined EXTRA_PRECISION #error Not implemented @@ -54,17 
+50,16 @@ mad_reduce(__private float *hi, float x) float r; FNMA(fn, fnh, fnl, piby2_h, piby2_hh, piby2_hl, x, r); FNMA(fn, fnh, fnl, piby2_m, piby2_mh, piby2_ml, r, r); - *hi = MATH_MAD(-piby2_l, fn, r); - return (int)fn & 0x3; + + struct redret ret; + ret.hi = MATH_MAD(-piby2_l, fn, r); + ret.i = (int)fn & 0x3; + return ret; #endif } -static inline int -#if defined EXTRA_PRECISION -fma_reduce(__private float *hi, __private float *lo, float x) -#else -fma_reduce(__private float *hi, float x) -#endif +static inline struct redret +fma_reduce(float x) { const float twobypi = 0x1.45f306p-1f; const float piby2_h = 0x1.921fb4p+0f; @@ -72,6 +67,9 @@ fma_reduce(__private float *hi, float x) const float piby2_l = 0x1.846988p-48f; float fn = BUILTIN_RINT_F32(x * twobypi); + + struct redret ret; + #if defined EXTRA_PRECISION float xt = BUILTIN_FMA_F32(fn, -piby2_h, x); float yh = BUILTIN_FMA_F32(fn, -piby2_m, xt); @@ -82,34 +80,24 @@ fma_reduce(__private float *hi, float x) float yt = BUILTIN_FMA_F32(fn, -piby2_l, ((th - yh) + tt) - pt); float rh = yh + yt; float rt = yt - (rh - yh); - *hi = rh; - *lo = rt; + ret.hi = rh; + ret.lo = rt; #else float r = BUILTIN_FMA_F32(fn, -piby2_l, BUILTIN_FMA_F32(fn, -piby2_m, BUILTIN_FMA_F32(fn, -piby2_h, x))); - *hi = r; + ret.hi = r; #endif - return (int)fn & 0x3; + + ret.i =(int)fn & 0x3; + return ret; } -INLINEATTR int -#if defined EXTRA_PRECISION -MATH_PRIVATE(trigredsmall)(__private float *r, __private float *rr, float x) -#else -MATH_PRIVATE(trigredsmall)(__private float *r, float x) -#endif +CONSTATTR struct redret +MATH_PRIVATE(trigredsmall)(float x) { if (HAVE_FAST_FMA32()) { -#if defined EXTRA_PRECISION - return fma_reduce(r, rr, x); -#else - return fma_reduce(r, x); -#endif + return fma_reduce(x); } else { -#if defined EXTRA_PRECISION - return mad_reduce(r, rr, x); -#else - return mad_reduce(r, x); -#endif + return mad_reduce(x); } } diff --git a/ocml/src/truncD.cl b/ocml/src/truncD.cl index 91810829..b1ae0417 100644 --- 
a/ocml/src/truncD.cl +++ b/ocml/src/truncD.cl @@ -7,7 +7,7 @@ #include "mathD.h" -CONSTATTR INLINEATTR double +CONSTATTR double MATH_MANGLE(trunc)(double x) { return BUILTIN_TRUNC_F64(x); diff --git a/ocml/src/truncF.cl b/ocml/src/truncF.cl index 88ee87da..3d279363 100644 --- a/ocml/src/truncF.cl +++ b/ocml/src/truncF.cl @@ -7,7 +7,7 @@ #include "mathF.h" -CONSTATTR INLINEATTR float +CONSTATTR float MATH_MANGLE(trunc)(float x) { return BUILTIN_TRUNC_F32(x); diff --git a/ocml/src/truncH.cl b/ocml/src/truncH.cl index 77292247..6787af80 100644 --- a/ocml/src/truncH.cl +++ b/ocml/src/truncH.cl @@ -7,13 +7,13 @@ #include "mathH.h" -CONSTATTR INLINEATTR half2 +CONSTATTR half2 MATH_MANGLE2(trunc)(half2 x) { return BUILTIN_TRUNC_2F16(x); } -CONSTATTR INLINEATTR half +CONSTATTR half MATH_MANGLE(trunc)(half x) { return BUILTIN_TRUNC_F16(x); diff --git a/ocml/src/y0H.cl b/ocml/src/y0H.cl index b2a81454..c187f45a 100644 --- a/ocml/src/y0H.cl +++ b/ocml/src/y0H.cl @@ -9,7 +9,7 @@ UGEN(y0) -INLINEATTR half +half MATH_MANGLE(y0)(half x) { return (half)MATH_UPMANGLE(y0)((float)x); diff --git a/ocml/src/y1H.cl b/ocml/src/y1H.cl index 0c4197f0..a09ad9ef 100644 --- a/ocml/src/y1H.cl +++ b/ocml/src/y1H.cl @@ -9,7 +9,7 @@ UGEN(y1) -INLINEATTR half +half MATH_MANGLE(y1)(half x) { return (half)MATH_UPMANGLE(y1)((float)x); diff --git a/opencl/CMakeLists.txt b/opencl/CMakeLists.txt index 8ac5f76f..8da642aa 100644 --- a/opencl/CMakeLists.txt +++ b/opencl/CMakeLists.txt @@ -14,6 +14,7 @@ file(GLOB cl_sources ${CMAKE_CURRENT_SOURCE_DIR}/src/math/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/media/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/misc/*.cl + ${CMAKE_CURRENT_SOURCE_DIR}/src/pipes/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/relational/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/subgroup/*.cl ${CMAKE_CURRENT_SOURCE_DIR}/src/vldst/*.cl @@ -36,7 +37,7 @@ if (GENERIC_IS_ZERO) endforeach(f) # Perform transformation - execute_process(COMMAND "${CMAKE_SOURCE_DIR}/utils/change-addr-space.sh" "${CMAKE_SOURCE_DIR}/utils" + 
execute_process(COMMAND "${CMAKE_SOURCE_DIR}/../utils/change-addr-space.sh" "${AMDGPU_TARGET_TRIPLE}" "${CMAKE_SOURCE_DIR}/../utils" WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) file(GLOB ll_srcs diff --git a/opencl/src/integer/clz.cl b/opencl/src/integer/clz.cl index f24a648a..c3f4b6af 100644 --- a/opencl/src/integer/clz.cl +++ b/opencl/src/integer/clz.cl @@ -20,33 +20,25 @@ UEXP(ulong,clz) UEXPATTR char clz(char x) { - uint y = (uint)(uchar)x; - uint z = __ockl_clz_u32(y); - return (char)(z - 24u); + return (char)__ockl_clz_u8((uchar)x); } UEXPATTR uchar clz(uchar x) { - uint y = (uint)x; - uint z = __ockl_clz_u32(y); - return (uchar)(z - 24u); + return __ockl_clz_u8(x); } UEXPATTR short clz(short x) { - uint y = (uint)(ushort)x; - uint z = __ockl_clz_u32(y); - return (short)(z - 16u); + return (short)__ockl_clz_u16((ushort)x); } UEXPATTR ushort clz(ushort x) { - uint y = (uint)x; - uint z = __ockl_clz_u32(y); - return (ushort)(z - 16u); + return __ockl_clz_u16(x); } UEXPATTR int @@ -61,16 +53,15 @@ clz(uint x) return __ockl_clz_u32(x); } -__attribute__((always_inline, const)) static ulong -clz_u64(ulong x) +UEXPATTR long +clz(long x) { - uint xlo = (uint)x; - uint xhi = (uint)(x >> 32); - uint zlo = __ockl_clz_u32(xlo) + 32u; - uint zhi = __ockl_clz_u32(xhi); - return (ulong)(xhi == 0 ? 
zlo : zhi); + return (long)__ockl_clz_u64((ulong)x); } -extern __attribute__((overloadable, always_inline, const, alias("clz_u64"))) ulong clz(ulong); -extern __attribute__((overloadable, always_inline, const, alias("clz_u64"))) long clz(long); +UEXPATTR ulong +clz(ulong x) +{ + return __ockl_clz_u64(x); +} diff --git a/opencl/src/integer/ctz.cl b/opencl/src/integer/ctz.cl index d75fc386..b583bf52 100644 --- a/opencl/src/integer/ctz.cl +++ b/opencl/src/integer/ctz.cl @@ -20,29 +20,25 @@ UEXP(ulong,ctz) UEXPATTR char ctz(char x) { - uint y = (uint)(uchar)x; - return (char)min(__ockl_ctz_u32(y), 8u); + return (char)__ockl_ctz_u8((uchar)x); } UEXPATTR uchar ctz(uchar x) { - uint y = (uint)x; - return (uchar)min(__ockl_ctz_u32(y), 8u); + return __ockl_ctz_u8(x); } UEXPATTR short ctz(short x) { - uint y = (uint)(ushort)x; - return (short)min(__ockl_ctz_u32(y), 16u); + return (short)__ockl_ctz_u16((ushort)x); } UEXPATTR ushort ctz(ushort x) { - uint y = (uint)x; - return (ushort)min(__ockl_ctz_u32(y), 16u); + return __ockl_ctz_u16(x); } UEXPATTR int @@ -57,16 +53,15 @@ ctz(uint x) return __ockl_ctz_u32(x); } -__attribute__((always_inline, const)) static ulong -ctz_u64(ulong x) +UEXPATTR long +ctz(long x) { - uint xlo = (uint)x; - uint xhi = (uint)(x >> 32); - uint zlo = __ockl_ctz_u32(xlo); - uint zhi = __ockl_ctz_u32(xhi) + 32u; - return (ulong)(xlo == 0 ? 
zhi : zlo); + return (long)__ockl_ctz_u64((ulong)x); } -extern __attribute__((overloadable, always_inline, const, alias("ctz_u64"))) ulong ctz(ulong); -extern __attribute__((overloadable, always_inline, const, alias("ctz_u64"))) long ctz(long); +UEXPATTR ulong +ctz(ulong x) +{ + return __ockl_ctz_u64(x); +} diff --git a/opencl/src/integer/popcount.cl b/opencl/src/integer/popcount.cl index f40f32b1..53c525ad 100644 --- a/opencl/src/integer/popcount.cl +++ b/opencl/src/integer/popcount.cl @@ -57,14 +57,12 @@ popcount(uint x) UEXPATTR long popcount(long x) { - uint2 y = as_uint2(x); - return (long)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi)); + return (long)__ockl_popcount_u64((ulong)x); } UEXPATTR ulong popcount(ulong x) { - uint2 y = as_uint2(x); - return (ulong)(__ockl_popcount_u32(y.lo) + __ockl_popcount_u32(y.hi)); + return __ockl_popcount_u64(x); } diff --git a/opencl/src/misc/asqf.cl b/opencl/src/misc/asqf.cl index 84ff26ed..d6a05968 100644 --- a/opencl/src/misc/asqf.cl +++ b/opencl/src/misc/asqf.cl @@ -1,3 +1,9 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ #include "ockl.h" diff --git a/opencl/src/misc/atom.cl b/opencl/src/misc/atom.cl new file mode 100644 index 00000000..d9f9ab38 --- /dev/null +++ b/opencl/src/misc/atom.cl @@ -0,0 +1,390 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + +#define ATTR __attribute__((overloadable, always_inline)) + +#define AC_int(X) X +#define AC_uint(X) X +#define AC_long(X) X +#define AC_ulong(X) X +#define AC_intptr_t(X) X +#define AC_uintptr_t(X) X +#define AC_size_t(X) X +#define AC_ptrdiff_t(X) X +#define AC_float(X) as_int(X) +#define AC_double(X) as_long(X) + +#define RC_int(X) X +#define RC_uint(X) X +#define RC_long(X) X +#define RC_ulong(X) X +#define RC_intptr_t(X) X +#define RC_uintptr_t(X) X +#define RC_size_t(X) X +#define RC_ptrdiff_t(X) X +#define RC_float(X) as_float(X) +#define RC_double(X) as_double(X) + +#define PC_int +#define PC_uint +#define PC_long +#define PC_ulong +#define PC_intptr_t +#define PC_uintptr_t +#define PC_size_t +#define PC_ptrdiff_t +#define PC_float (volatile atomic_int *) +#define PC_double (volatile atomic_long *) + +#define EC_int +#define EC_uint +#define EC_long +#define EC_ulong +#define EC_intptr_t +#define EC_uintptr_t +#define EC_size_t +#define EC_ptrdiff_t +#define EC_float (int *) +#define EC_double (long *) + +#define OCL12_MEMORY_ORDER memory_order_relaxed +#define OCL12_MEMORY_SCOPE memory_scope_device + +#define F_inc __opencl_atomic_fetch_add +#define F_dec __opencl_atomic_fetch_sub + +// extension and 1.2 functions +#define GEN1(T,A,O) \ +ATTR T \ +atom_##O(volatile A T *p, T v) \ +{ \ + return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +#define GEN2(T,A,O) \ +ATTR T \ +atomic_##O(volatile A T *p, T v) \ +{ \ + return __opencl_atomic_fetch_##O((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +#define OPSA(F,T,A) \ + F(T,A,add) \ + F(T,A,sub) \ + F(T,A,max) \ + F(T,A,min) \ + F(T,A,and) \ + F(T,A,or) \ + F(T,A,xor) + +#define OPS(F,T) \ + OPSA(F,T,) + +#define 
ALL() \ + OPS(GEN1,int) \ + OPS(GEN2,int) \ + OPS(GEN1,uint) \ + OPS(GEN2,uint) \ + OPS(GEN1,long) \ + OPS(GEN1,ulong) + +ALL() + +// Handle inc and dec +#undef GEN1 +#undef GEN2 +#undef OPSA + +#define OPSA(F,T,A) \ + F(T,A,inc) \ + F(T,A,dec) + + +#define GEN1(T,A,O) \ +ATTR T \ +atom_##O(volatile A T *p) \ +{ \ + return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +#define GEN2(T,A,O) \ +ATTR T \ +atomic_##O(volatile A T *p) \ +{ \ + return F_##O((volatile atomic_##T *)p, (T)1, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +ALL() + +// Handle xchg +#undef GEN1 +#undef GEN2 +#undef OPSA +#undef OPS + +#define GEN1(T,A) \ +ATTR T \ +atom_xchg(volatile A T *p, T v) \ +{ \ + return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +#define GEN2(T,A) \ +ATTR T \ +atomic_xchg(volatile A T *p, T v) \ +{ \ + return __opencl_atomic_exchange((volatile atomic_##T *)p, v, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ +} + +#define OPS(F,T) \ + F(T,) + +ALL() + +ATTR float +atomic_xchg(volatile float *p, float v) +{ + return as_float(__opencl_atomic_exchange((volatile atomic_int *)p, as_int(v), OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE)); +} + +// Handle cmpxchg +#undef GEN1 +#undef GEN2 + +#define GEN1(T,A) \ +ATTR T \ +atom_cmpxchg(volatile A T *p, T e, T d) \ +{ \ + __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return e; \ +} + +#define GEN2(T,A) \ +ATTR T \ +atomic_cmpxchg(volatile A T *p, T e, T d) \ +{ \ + __opencl_atomic_compare_exchange_strong((volatile atomic_##T *)p, &e, d, OCL12_MEMORY_ORDER, OCL12_MEMORY_ORDER, OCL12_MEMORY_SCOPE); \ + return e; \ +} + +ALL() +#undef GEN1 +#undef GEN2 +#undef ALL + +// 2.0 functions + +#define GENI(T) \ +ATTR void \ +atomic_init(volatile atomic_##T *p, T v) \ +{ \ + __opencl_atomic_init(p, v); \ +} + +#define GENS(T) \ +ATTR void \ 
+atomic_store(volatile atomic_##T *p, T v) \ +{ \ + __opencl_atomic_store(p, v, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR void \ +atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o) \ +{ \ + __opencl_atomic_store(p, v, o, memory_scope_device); \ +} \ + \ +ATTR void \ +atomic_store_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +{ \ + __opencl_atomic_store(p, v, o, s); \ +} + +#define GENL(T) \ +ATTR T \ +atomic_load(volatile atomic_##T *p) \ +{ \ + return __opencl_atomic_load(p, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR T \ +atomic_load_explicit(volatile atomic_##T *p, memory_order o) \ +{ \ + return __opencl_atomic_load(p, o, memory_scope_device); \ +} \ + \ +ATTR T \ +atomic_load_explicit(volatile atomic_##T *p, memory_order o, memory_scope s) \ +{ \ + return __opencl_atomic_load(p, o, s); \ +} + +#define GENX(T) \ +ATTR T \ +atomic_exchange(volatile atomic_##T *p, T v) \ +{ \ + return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ +} \ + \ +ATTR T \ +atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o) \ +{ \ + return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, memory_scope_device)); \ +} \ + \ +ATTR T \ +atomic_exchange_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +{ \ + return RC_##T(__opencl_atomic_exchange(PC_##T p, AC_##T(v), o, s)); \ +} + +#define GENCX(T,K) \ +ATTR bool \ +atomic_compare_exchange_##K(volatile atomic_##T *p, T *e, T d) \ +{ \ + return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), memory_order_seq_cst, memory_order_seq_cst, memory_scope_device); \ +} \ + \ +ATTR bool \ +atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of) \ +{ \ + return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, memory_scope_device); \ +} \ + \ +ATTR bool \ 
+atomic_compare_exchange_##K##_explicit(volatile atomic_##T *p, T *e, T d, memory_order os, memory_order of, memory_scope s) \ +{ \ + return __opencl_atomic_compare_exchange_##K(PC_##T p, EC_##T e, AC_##T(d), os, of, s); \ +} + +#define GENFO(T,O) \ +ATTR T \ +atomic_fetch_##O(volatile atomic_##T *p, T v) \ +{ \ + return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), memory_order_seq_cst, memory_scope_device)); \ +} \ + \ +ATTR T \ +atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o) \ +{ \ + return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, memory_scope_device)); \ +} \ + \ +ATTR T \ +atomic_fetch_##O##_explicit(volatile atomic_##T *p, T v, memory_order o, memory_scope s) \ +{ \ + return RC_##T(__opencl_atomic_fetch_##O(PC_##T p, AC_##T(v), o, s)); \ +} + +#define CX(T) \ + GENCX(T,strong) \ + GENCX(T,weak) + +#define FO(T) \ + GENFO(T,add) \ + GENFO(T,sub) \ + GENFO(T,or) \ + GENFO(T,xor) \ + GENFO(T,and) \ + GENFO(T,min) \ + GENFO(T,max) \ + +#define ALLI(F) \ + F(int) \ + F(uint) \ + F(long) \ + F(ulong) + +#define ALL(F) \ + ALLI(F) \ + F(float) \ + F(double) + +ALL(GENI) +ALL(GENL) +ALL(GENS) +ALL(GENX) +ALL(CX) +ALLI(FO) + +// These are needed for uintptr_t +ATTR ulong +atomic_fetch_add(volatile atomic_ulong *p, long v) +{ + return __opencl_atomic_fetch_add(p, (ulong)v, memory_order_seq_cst, memory_scope_device); +} + +ATTR ulong +atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o) +{ + return __opencl_atomic_fetch_add(p, (ulong)v, o, memory_scope_device); +} + +ATTR ulong +atomic_fetch_add_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s) +{ + return __opencl_atomic_fetch_add(p, (ulong)v, o, s); +} + +ATTR ulong +atomic_fetch_sub(volatile atomic_ulong *p, long v) +{ + return __opencl_atomic_fetch_sub(p, (ulong)v, memory_order_seq_cst, memory_scope_device); +} + +ATTR ulong +atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o) +{ + return 
__opencl_atomic_fetch_sub(p, (ulong)v, o, memory_scope_device); +} + +ATTR ulong +atomic_fetch_sub_explicit(volatile atomic_ulong *p, long v, memory_order o, memory_scope s) +{ + return __opencl_atomic_fetch_sub(p, (ulong)v, o, s); +} + +// flag functions +ATTR bool +atomic_flag_test_and_set(volatile atomic_flag *p) +{ + return __opencl_atomic_exchange((volatile atomic_int *)p, 1, memory_order_seq_cst, memory_scope_device); +} + +ATTR bool +atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o) +{ + return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, memory_scope_device); +} + +ATTR bool +atomic_flag_test_and_set_explicit(volatile atomic_flag *p, memory_order o, memory_scope s) +{ + return __opencl_atomic_exchange((volatile atomic_int *)p, 1, o, s); +} + +ATTR void +atomic_flag_clear(volatile atomic_flag *p) +{ + __opencl_atomic_store((volatile atomic_int *)p, 0, memory_order_seq_cst, memory_scope_device); +} + +ATTR void +atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o) +{ + __opencl_atomic_store((volatile atomic_int *)p, 0, o, memory_scope_device); +} + +ATTR void +atomic_flag_clear_explicit(volatile atomic_flag *p, memory_order o, memory_scope s) +{ + __opencl_atomic_store((volatile atomic_int *)p, 0, o, s); +} + diff --git a/opencl/src/misc/printf.cl b/opencl/src/misc/printf.cl index 815f96ea..f80ebf1c 100644 --- a/opencl/src/misc/printf.cl +++ b/opencl/src/misc/printf.cl @@ -1,3 +1,9 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ #include "irif.h" diff --git a/opencl/src/pipes/commitp.cl b/opencl/src/pipes/commitp.cl new file mode 100644 index 00000000..51528cb8 --- /dev/null +++ b/opencl/src/pipes/commitp.cl @@ -0,0 +1,93 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "pipes.h" + +#define ATTR __attribute__((always_inline)) + +#define COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR void \ +__commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(COMMIT_READ_PIPE_SIZE) + +ATTR void +__commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + +#define COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR void \ +__commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(COMMIT_WRITE_PIPE_SIZE) + +ATTR void +__commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + +// Work group functions + +#define WORK_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR void \ +__work_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(WORK_GROUP_COMMIT_READ_PIPE_SIZE) + +ATTR void +__work_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + +#define WORK_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR void \ +__work_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(WORK_GROUP_COMMIT_WRITE_PIPE_SIZE) + +ATTR void +__work_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + +// sub group functions + +#define SUB_GROUP_COMMIT_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR 
void \ +__sub_group_commit_read_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(SUB_GROUP_COMMIT_READ_PIPE_SIZE) + +ATTR void +__sub_group_commit_read_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + +#define SUB_GROUP_COMMIT_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR void \ +__sub_group_commit_write_pipe_##SIZE(__global struct pipeimp* p, size_t rid) \ +{ \ +} + +// DO_PIPE_SIZE(SUB_GROUP_COMMIT_WRITE_PIPE_SIZE) + +ATTR void +__sub_group_commit_write_pipe(__global struct pipeimp* p, size_t rid, uint size, uint align) +{ +} + diff --git a/opencl/src/pipes/getp.cl b/opencl/src/pipes/getp.cl new file mode 100644 index 00000000..d5531996 --- /dev/null +++ b/opencl/src/pipes/getp.cl @@ -0,0 +1,45 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "pipes.h" + +#define ATTR __attribute__((always_inline, pure)) + +#define GET_PIPE_NUM_PACKETS_SIZE(SIZE, STYPE) \ +ATTR uint \ +__get_pipe_num_packets_##SIZE(__global struct pipeimp* p) \ +{ \ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + return (uint)(wi - ri); \ +} + +// DO_PIPE_SIZE(GET_PIPE_NUM_PACKETS_SIZE) + +ATTR uint +__get_pipe_num_packets(__global struct pipeimp* p, uint size, uint align) +{ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); + return (uint)(wi - ri); +} + +#define GET_PIPE_MAX_PACKETS_SIZE(SIZE, STYPE) \ +ATTR uint \ +__get_pipe_max_packets_##SIZE(__global struct pipeimp* p) \ +{ \ + return 
(uint)p->end_idx; \ +} + +// DO_PIPE_SIZE(GET_PIPE_MAX_PACKETS_SIZE) + +ATTR uint +__get_pipe_max_packets(__global struct pipeimp* p, uint size, uint align) +{ + return (uint)p->end_idx; +} + diff --git a/opencl/src/pipes/memcpyia.cl b/opencl/src/pipes/memcpyia.cl new file mode 100644 index 00000000..f536d044 --- /dev/null +++ b/opencl/src/pipes/memcpyia.cl @@ -0,0 +1,55 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +__attribute__((always_inline, weak)) void +__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align) +{ + if (align == 2) { + short *d2 = (short *)d; + short *s2 = (short *)s; + short *e2 = s2 + size/2; + + while (s2 < e2) + *d2++ = *s2++; + } else if (align == 4) { + int *d4 = (int *)d; + int *s4 = (int *)s; + int *e4 = s4 + size/4; + + while (s4 < e4) + *d4++ = *s4++; + } else if (align == 8) { + long *d8 = (long *)d; + long *s8 = (long *)s; + long *e8 = s8 + size/8; + + while (s8 < e8) + *d8++ = *s8++; + } else if (align == 16) { + long2 *d16 = (long2 *)d; + long2 *s16 = (long2 *)s; + long2 *e16 = s16 + size/16; + + while (s16 < e16) + *d16++ = *s16++; + } else if (align == 32 || align == 64 || align == 128) { + long4 *d32 = (long4 *)d; + long4 *s32 = (long4 *)s; + long4 *e32 = s32 + size/32; + + while (s32 < e32) + *d32++ = *s32++; + } else { + char *d1 = (char *)d; + char *s1 = (char *)s; + char *e1 = s1 + size; + + while (s1 < e1) + *d1++ = *s1++; + } +} + diff --git a/opencl/src/pipes/pipes.h b/opencl/src/pipes/pipes.h new file mode 100644 index 00000000..16ab22fd --- /dev/null +++ b/opencl/src/pipes/pipes.h @@ -0,0 +1,109 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed 
under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "irif.h" + +#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable +#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable + +extern size_t __amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n); + +#define DO_PIPE_SIZE(F) \ +F(1,uchar) \ +F(2,ushort) \ +F(4,uint) \ +F(8,ulong) \ +F(16,ulong2) \ +F(32,ulong4) \ +F(64,ulong8) \ +F(128,ulong16) + +struct pipeimp { + atomic_size_t read_idx; + atomic_size_t write_idx; + size_t end_idx; + uchar pad[128 - 3*sizeof(size_t)]; + uchar packets[1]; +}; + +extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t); + +static __attribute__((always_inline)) size_t +reserve(volatile __global atomic_size_t *pi, size_t lim, size_t n) +{ + size_t i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device); + + for (;;) { + if (i + n > lim) + return ~(size_t)0; + + if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device)) + break; + } + + return i; +} + +static inline size_t +wave_reserve_1(volatile __global atomic_size_t *pi, size_t lim) +{ + size_t n = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) + + __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi())); + uint l = __llvm_amdgcn_mbcnt_hi(__llvm_amdgcn_read_exec_hi(), + __llvm_amdgcn_mbcnt_lo(__llvm_amdgcn_read_exec_lo(), 0u)); + size_t i = 0; + + if (l == 0) { + i = __opencl_atomic_load(pi, memory_order_relaxed, memory_scope_device); + + for (;;) { + if (i + n > lim) { + i = ~(size_t)0; + break; + } + + if (__opencl_atomic_compare_exchange_strong(pi, &i, i + n, memory_order_relaxed, memory_order_relaxed, memory_scope_device)) + break; + } + } + + __llvm_amdgcn_wave_barrier(); + + // Broadcast the result; the ctz tells us which lane has active lane id 0 + uint k = 
(uint)__llvm_cttz_i64(__llvm_amdgcn_read_exec()); + i = ((size_t)__llvm_amdgcn_readlane((uint)(i >> 32), k) << 32) | + (size_t)__llvm_amdgcn_readlane((uint)i, k); + + __llvm_amdgcn_wave_barrier(); + + if (i != ~(size_t)0) + i += l; + else { + // The entire group didn't fit, have to handle one by one + i = reserve(pi, lim, (size_t)1); + } + + return i; +} + +static inline size_t +wrap(size_t i, size_t n) +{ + // Assume end_i < 2^32 + size_t ret; + if (as_uint2(i).y == 0U) { + uint j = (uint)i; + uint m = (uint)n; + if (j < m) + ret = i; + else + ret = (ulong)(j % m); + } else + ret = i % n; + return ret; +} + diff --git a/opencl/src/pipes/readp.cl b/opencl/src/pipes/readp.cl new file mode 100644 index 00000000..1808ad3a --- /dev/null +++ b/opencl/src/pipes/readp.cl @@ -0,0 +1,75 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + +#include "pipes.h" + +#define ATTR __attribute__((always_inline)) + +#define READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR int \ +__read_pipe_2_##SIZE(__global struct pipeimp* p, STYPE* ptr) \ +{ \ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t ri = wave_reserve_1(&p->read_idx, wi); \ + if (ri == ~(size_t)0) \ + return -1; \ + \ + size_t pi = wrap(ri, p->end_idx); \ + *ptr = ((__global STYPE *)p->packets)[pi]; \ + \ + if (ri == wi-1) { \ + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \ + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + }\ +\ + return 0; \ +} + +DO_PIPE_SIZE(READ_PIPE_SIZE) + +ATTR int +__read_pipe_2(__global struct pipeimp* p, void* ptr, uint size, uint align) +{ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t ri = wave_reserve_1(&p->read_idx, wi); + if (ri == ~(size_t)0) + return -1; + + size_t pi = wrap(ri, p->end_idx); + __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align); + + if (ri == wi-1) { + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + return 0; +} + +#define READ_PIPE_RESERVED_SIZE(SIZE, STYPE) \ +ATTR int \ +__read_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr) \ +{ \ + rid += i; \ + size_t pi = wrap(rid, p->end_idx); \ + *ptr = ((__global STYPE *)p->packets)[pi]; \ + \ + return 0; \ +} + +DO_PIPE_SIZE(READ_PIPE_RESERVED_SIZE) + +ATTR int +__read_pipe_4(__global struct pipeimp* p, size_t rid, uint i, void *ptr, uint size, uint align) +{ + rid += i; + size_t pi = wrap(rid, p->end_idx); + __memcpy_internal_aligned(ptr, p->packets + pi*size, size, align); + + return 0; +} + diff --git 
a/opencl/src/pipes/reservep.cl b/opencl/src/pipes/reservep.cl new file mode 100644 index 00000000..18e073be --- /dev/null +++ b/opencl/src/pipes/reservep.cl @@ -0,0 +1,219 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#pragma OPENCL EXTENSION cl_khr_subgroups : enable +#include "pipes.h" +#include "../workgroup/wg.h" + +#define ATTR __attribute__((always_inline)) + +#define RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ +__reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \ + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + \ + return rid; \ +} + +// DO_PIPE_SIZE(RESERVE_READ_PIPE_SIZE) + +ATTR size_t +__reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t rid = __amd_wresvn(&p->read_idx, wi, num_packets); + + if (rid + num_packets == wi) { + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + return rid; +} + +#define RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ +__reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + return 
__amd_wresvn(&p->write_idx, ri + ei, num_packets); \ +} + +// DO_PIPE_SIZE(RESERVE_WRITE_PIPE_SIZE) + +ATTR size_t +__reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + return __amd_wresvn(&p->write_idx, ri + ei, num_packets); +} + +// Work group functions + +#define WORK_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ +__work_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + __local size_t *t = (__local size_t *)__get_scratch_lds(); \ + \ + if ((int)get_local_linear_id() == 0) { \ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + size_t rid = reserve(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \ + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + \ + *t = rid; \ + } \ + \ + work_group_barrier(CLK_LOCAL_MEM_FENCE); \ + \ + return *t; \ +} + +// DO_PIPE_SIZE(WORK_GROUP_RESERVE_READ_PIPE_SIZE) + +ATTR size_t +__work_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + __local size_t *t = (__local size_t *)__get_scratch_lds(); + + if ((int)get_local_linear_id() == 0) { + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); + size_t rid = reserve(&p->read_idx, wi, num_packets); + + if (rid + num_packets == wi) { + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + + *t = rid; + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + return *t; +} + +#define WORK_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ 
+__work_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + __local size_t *t = (__local size_t *)__get_scratch_lds(); \ + \ + if ((int)get_local_linear_id() == 0) { \ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + *t = reserve(&p->write_idx, ri + ei, num_packets); \ + } \ + \ + work_group_barrier(CLK_LOCAL_MEM_FENCE); \ + \ + return *t; \ +} + +// DO_PIPE_SIZE(WORK_GROUP_RESERVE_WRITE_PIPE_SIZE) + +ATTR size_t +__work_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + __local size_t *t = (__local size_t *)__get_scratch_lds(); + + if ((int)get_local_linear_id() == 0) { + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + *t = reserve(&p->write_idx, ri + ei, num_packets); + } + + work_group_barrier(CLK_LOCAL_MEM_FENCE); + + return *t; +} + +// sub group functions + +#define SUB_GROUP_RESERVE_READ_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ +__sub_group_reserve_read_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t rid = ~(size_t)0; \ + \ + if (get_sub_group_local_id() == 0) { \ + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); \ + rid = reserve(&p->read_idx, wi, num_packets); \ + \ + if (rid + num_packets == wi) { \ + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); \ + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \ + } \ + } \ + \ + return sub_group_broadcast(rid, 0); \ +} + +// DO_PIPE_SIZE(SUB_GROUP_RESERVE_READ_PIPE_SIZE) + +ATTR size_t +__sub_group_reserve_read_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + size_t rid = ~(size_t)0; + + if (get_sub_group_local_id() == 0) { + size_t wi = __opencl_atomic_load(&p->write_idx, memory_order_relaxed, memory_scope_device); + rid 
= reserve(&p->read_idx, wi, num_packets); + + if (rid + num_packets == wi) { + __opencl_atomic_store(&p->write_idx, 0, memory_order_relaxed, memory_scope_device); + __opencl_atomic_store(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); + } + } + + return sub_group_broadcast(rid, 0); +} + +#define SUB_GROUP_RESERVE_WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR size_t \ +__sub_group_reserve_write_pipe_##SIZE(__global struct pipeimp *p, uint num_packets) \ +{ \ + size_t rid = ~(size_t)0; \ + \ + if (get_sub_group_local_id() == 0) { \ + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + rid = reserve(&p->write_idx, ri + ei, num_packets); \ + } \ + \ + return sub_group_broadcast(rid, 0); \ +} + +// DO_PIPE_SIZE(SUB_GROUP_RESERVE_WRITE_PIPE_SIZE) + +ATTR size_t +__sub_group_reserve_write_pipe(__global struct pipeimp *p, uint num_packets, uint size, uint align) +{ + size_t rid = ~(size_t)0; + + if (get_sub_group_local_id() == 0) { + size_t ri = __opencl_atomic_load(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + rid = reserve(&p->write_idx, ri + ei, num_packets); + } + + return sub_group_broadcast(rid, 0); +} + diff --git a/opencl/src/pipes/validp.cl b/opencl/src/pipes/validp.cl new file mode 100644 index 00000000..5397dfce --- /dev/null +++ b/opencl/src/pipes/validp.cl @@ -0,0 +1,14 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/ + + +__attribute__((overloadable, always_inline)) bool +is_valid_reserve_id(reserve_id_t rid) +{ + return as_ulong(rid) != ~(size_t)0; +} + diff --git a/opencl/src/pipes/wresvnp.cl b/opencl/src/pipes/wresvnp.cl new file mode 100644 index 00000000..2b4f2fa4 --- /dev/null +++ b/opencl/src/pipes/wresvnp.cl @@ -0,0 +1,148 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "pipes.h" + +size_t +__amd_wresvn(volatile __global atomic_size_t *pidx, size_t lim, size_t n) +{ + uint alc = (size_t)(__llvm_ctpop_i32(__llvm_amdgcn_read_exec_lo()) + + __llvm_ctpop_i32(__llvm_amdgcn_read_exec_hi())); + uint l = __llvm_amdgcn_mbcnt_hi(-1, __llvm_amdgcn_mbcnt_lo(-1, 0u)); + size_t rid; + + if (__llvm_amdgcn_read_exec() == (1UL << alc) - 1UL) { + // Handle fully active subgroup + uint sum = sub_group_scan_inclusive_add((uint)n); + size_t idx = 0; + if (l == alc-1) { + idx = reserve(pidx, lim, (size_t)sum); + } + idx = sub_group_broadcast(idx, alc-1); + rid = idx + (size_t)(sum - (uint)n); + rid = idx != ~(size_t)0 ? rid : idx; + } else { + // Inclusive add scan with not all lanes active + const ulong nomsb = 0x7fffffffffffffffUL; + + // Step 1 + ulong smask = __llvm_amdgcn_read_exec() & ((0x1UL << l) - 0x1UL); + int slid = 63 - (int)clz(smask); + uint t = __llvm_amdgcn_ds_bpermute(slid << 2, n); + uint sum = n + (slid < 0 ? 0 : t); + smask ^= (0x1UL << slid) & nomsb; + + // Step 2 + slid = 63 - (int)clz(smask); + t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); + sum += slid < 0 ? 
0 : t; + + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + + // Step 3 + slid = 63 - (int)clz(smask); + t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); + sum += slid < 0 ? 0 : t; + + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + + // Step 4 + slid = 63 - (int)clz(smask); + t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); + sum += slid < 0 ? 0 : t; + + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + + // Step 5 + slid = 63 - (int)clz(smask); + t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); + sum += slid < 0 ? 
0 : t; + + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + slid = 63 - (int)clz(smask); + smask ^= (0x1UL << slid) & nomsb; + + // Step 6 + slid = 63 - (int)clz(smask); + t = __llvm_amdgcn_ds_bpermute(slid << 2, sum); + sum += slid < 0 ? 0 : t; + __llvm_amdgcn_wave_barrier(); + + size_t idx = 0; + if (l == 63 - (int)clz(__llvm_amdgcn_read_exec())) { + idx = reserve(pidx, lim, (size_t)sum); + } + __llvm_amdgcn_wave_barrier(); + + // Broadcast + uint k = 63u - (uint)clz(__llvm_amdgcn_read_exec()); + idx = ((size_t)__llvm_amdgcn_readlane((uint)(idx >> 32), k) << 32) | + (size_t)__llvm_amdgcn_readlane((uint)idx, k); + __llvm_amdgcn_wave_barrier(); + + rid = idx + (size_t)(sum - (uint)n); + rid = idx != ~(size_t)0 ? 
rid : idx; + } + + if (rid == ~(size_t)0) { + // Try again one at a time + rid = reserve(pidx, lim, n); + } + + return rid; +} + diff --git a/opencl/src/pipes/writep.cl b/opencl/src/pipes/writep.cl new file mode 100644 index 00000000..e07026cd --- /dev/null +++ b/opencl/src/pipes/writep.cl @@ -0,0 +1,65 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + *===------------------------------------------------------------------------*/ + +#include "pipes.h" + +#define ATTR __attribute__((always_inline)) + +#define WRITE_PIPE_SIZE(SIZE, STYPE) \ +ATTR int \ +__write_pipe_2_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \ +{ \ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \ + size_t ei = p->end_idx; \ + size_t wi = wave_reserve_1(&p->write_idx, ri+ei); \ + if (wi == ~(size_t)0) \ + return -1; \ + \ + size_t pi = wrap(wi, ei); \ + ((__global STYPE *)p->packets)[pi] = *ptr; \ + return 0; \ +} + +DO_PIPE_SIZE(WRITE_PIPE_SIZE) + +ATTR int +__write_pipe_2(__global struct pipeimp* p, const void* ptr, uint size, uint align) +{ + size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); + size_t ei = p->end_idx; + size_t wi = wave_reserve_1(&p->write_idx, ri+ei); + if (wi == ~(size_t)0) + return -1; + + size_t pi = wrap(wi, ei); + __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align); + + return 0; +} + +#define WRITE_PIPE_RESERVED_SIZE(SIZE, STYPE) \ +ATTR int \ +__write_pipe_4_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr) \ +{ \ + rid += i; \ + size_t pi = wrap(rid, p->end_idx); \ + ((__global STYPE *)p->packets)[pi] = *ptr; \ + return 0; \ +} + +DO_PIPE_SIZE(WRITE_PIPE_RESERVED_SIZE) + +ATTR int +__write_pipe_4(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, 
uint size, uint align) +{ + rid += i; + size_t pi = wrap(rid, p->end_idx); + __memcpy_internal_aligned(p->packets + pi*size, ptr, size, align); + + return 0; +} + diff --git a/utils/add_amdgiz.sed b/utils/add_amdgiz.sed index 995f6af9..ee495d3c 100755 --- a/utils/add_amdgiz.sed +++ b/utils/add_amdgiz.sed @@ -6,7 +6,8 @@ ####################### # amdgcn--amdhsa-amd -> amdgcn--amdhsa-amdgiz -/target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/ +# This is now done directly by change-addr-space.sh +# /target triple/s/\"amdgcn--amdhsa\"/\"amdgcn--amdhsa-amdgiz\"/ ##################### # change data layout diff --git a/utils/change-addr-space.sh b/utils/change-addr-space.sh index 02d82af5..cfdea769 100755 --- a/utils/change-addr-space.sh +++ b/utils/change-addr-space.sh @@ -6,8 +6,14 @@ # utils/change-addr-space.sh src x : apply utils/remove_amdgiz.sed # adopt generic address space is address space 4 -if [ $# -lt 2 ]; then - find . -name "*.ll" | xargs sed -i -f "$1/add_amdgiz.sed" +tmpfile=/tmp/cas$$.sed +if [ $# -lt 3 ]; then + echo "/target triple/s/\\\"amdgcn--amdhsa\\\"/\\\"${1}\\\"/" >$tmpfile + cat $2/add_amdgiz.sed >>$tmpfile else - find . -name "*.ll" | xargs sed -i -f "$1/remove_amdgiz.sed" + echo "/target triple/s/\\\"${1}\\\"/\\\"amdgcn--amdhsa\\\"/" >$tmpfile + cat $2/remove_amdgiz.sed >>$tmpfile fi + +find . 
-name "*.ll" | xargs sed -i -f "$tmpfile" +rm $tmpfile diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp index b1145363..ce3596fe 100644 --- a/utils/prepare-builtins/prepare-builtins.cpp +++ b/utils/prepare-builtins/prepare-builtins.cpp @@ -114,8 +114,8 @@ int main(int argc, char **argv) { } std::error_code EC; - std::unique_ptr Out - (new tool_output_file(OutputFilename, EC, sys::fs::F_None)); + std::unique_ptr Out + (new ToolOutputFile(OutputFilename, EC, sys::fs::F_None)); if (EC) { errs() << EC.message() << '\n'; exit(1); diff --git a/utils/remove_amdgiz.sed b/utils/remove_amdgiz.sed index d10630c6..7c76dd78 100755 --- a/utils/remove_amdgiz.sed +++ b/utils/remove_amdgiz.sed @@ -6,7 +6,8 @@ ####################### # amdgcn--amdhsa-amdgiz -> amdgcn--amdhsa -/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/ +# This is now done directly by change-addr-space.sh +#/target triple/s/\"amdgcn--amdhsa-amdgiz\"/\"amdgcn--amdhsa\"/ ##################### # change data layout