diff --git a/ocml/inc/ocml.h b/ocml/inc/ocml.h
index 8a94c5c0..84debb12 100644
--- a/ocml/inc/ocml.h
+++ b/ocml/inc/ocml.h
@@ -670,6 +670,24 @@ DECL_CONST_OCML_UNARY_F16(native_log2)
 extern __attribute__((const)) float OCML_MANGLE_F32(cabs)(float2);
 extern __attribute__((const)) double OCML_MANGLE_F64(cabs)(double2);
 
+extern __attribute__((const)) float2 OCML_MANGLE_F32(cacos)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(cacos)(double2);
+
+extern __attribute__((const)) float2 OCML_MANGLE_F32(cacosh)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(cacosh)(double2);
+
+extern __attribute__((const)) float2 OCML_MANGLE_F32(casin)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(casin)(double2);
+
+extern __attribute__((const)) float2 OCML_MANGLE_F32(casinh)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(casinh)(double2);
+
+extern __attribute__((const)) float2 OCML_MANGLE_F32(catan)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(catan)(double2);
+
+extern __attribute__((const)) float2 OCML_MANGLE_F32(catanh)(float2);
+extern __attribute__((const)) double2 OCML_MANGLE_F64(catanh)(double2);
+
 extern __attribute__((const)) float2 OCML_MANGLE_F32(cexp)(float2);
 extern __attribute__((const)) double2 OCML_MANGLE_F64(cexp)(double2);
 
diff --git a/ocml/src/acoshD.cl b/ocml/src/acoshD.cl
index e2424edb..3cacbf9f 100644
--- a/ocml/src/acoshD.cl
+++ b/ocml/src/acoshD.cl
@@ -21,8 +21,6 @@ MATH_MANGLE(acosh)(double x)
     double2 a = add(sx, root2(sub(sqr(sx), s*s)));
     double z = MATH_PRIVATE(lnep)(a, b ? 512 : 0);
 
-    z = x == 1.0 ? 0.0 : z;
-
     if (!FINITE_ONLY_OPT()) {
         z = BUILTIN_CLASS_F64(x, CLASS_PINF) ? x : z;
         z = x < 1.0 ? AS_DOUBLE(QNANBITPATT_DP64) : z;
diff --git a/ocml/src/acoshF.cl b/ocml/src/acoshF.cl
index 5e5cc1f1..699330ca 100644
--- a/ocml/src/acoshF.cl
+++ b/ocml/src/acoshF.cl
@@ -21,8 +21,6 @@ MATH_MANGLE(acosh)(float x)
     float2 a = add(sx, root2(sub(sqr(sx), s*s)));
     float z = MATH_PRIVATE(lnep)(a, b ? 64 : 0);
 
-    z = x == 1.0f ? 0.0f : z;
-
     if (!FINITE_ONLY_OPT()) {
         z = BUILTIN_CLASS_F32(x, CLASS_PINF) ? x : z;
         z = x < 1.0f ? AS_FLOAT(QNANBITPATT_SP32) : z;
diff --git a/ocml/src/cacosD.cl b/ocml/src/cacosD.cl
new file mode 100644
index 00000000..ac468011
--- /dev/null
+++ b/ocml/src/cacosD.cl
@@ -0,0 +1,17 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+/* cacos(z), double: built from a = cacosh(z) by swapping its components; returns (-a.y, a.x) when the sign bit of z.y is set and (a.y, -a.x) otherwise. */
+CONSTATTR double2
+MATH_MANGLE(cacos)(double2 z)
+{
+    double2 a = MATH_MANGLE(cacosh)(z);
+    bool b = AS_INT2(z.y).hi < 0; /* same test ep.h uses for SIGNBIT on doubles, so -0.0 counts as negative */
+    return (double2)(b ? -a.y : a.y, b ? a.x : -a.x);
+}
+
diff --git a/ocml/src/cacosF.cl b/ocml/src/cacosF.cl
new file mode 100644
index 00000000..e20b7d90
--- /dev/null
+++ b/ocml/src/cacosF.cl
@@ -0,0 +1,17 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+/* cacos(z), float: built from a = cacosh(z) by swapping its components; returns (-a.y, a.x) when the sign bit of z.y is set and (a.y, -a.x) otherwise. */
+CONSTATTR float2
+MATH_MANGLE(cacos)(float2 z)
+{
+    float2 a = MATH_MANGLE(cacosh)(z);
+    bool b = AS_INT(z.y) < 0; /* integer view of z.y is negative exactly when its sign bit is set */
+    return (float2)(b ? -a.y : a.y, b ? a.x : -a.x);
+}
+
diff --git a/ocml/src/cacoshD.cl b/ocml/src/cacoshD.cl
new file mode 100644
index 00000000..cbb10cd6
--- /dev/null
+++ b/ocml/src/cacoshD.cl
@@ -0,0 +1,64 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+
+#define DOUBLE_SPECIALIZATION
+#include "ep.h"
+
+extern CONSTATTR double4 MATH_PRIVATE(epcsqrtep)(double4 z);
+extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea);
+/* cacosh(z), double: returns (rr, ri). rr is 0.5*lnep(l2, e), or l2.hi directly on the small-|z.y| path; ri is atan2(t.y, t.x) for the pair t chosen by the same path. */
+CONSTATTR double2
+MATH_MANGLE(cacosh)(double2 z)
+{
+    double x = BUILTIN_ABS_F64(z.x);
+    double y = BUILTIN_ABS_F64(z.y);
+    /* l2: two-part extended value whose .hi part is read below; t: atan2 operands; e: exponent argument passed to lnep; b: false when rr is taken straight from l2.hi */
+    double2 l2, t;
+    int e = 0;
+    bool b = true;
+    /* Three paths: general formula (both magnitudes < 2^54), x < 1 with y below both small thresholds, and the rescaled path for large magnitudes. */
+    if (x < 0x1.0p+54 && y < 0x1.0p+54) {
+        if (x >= 1.0 || y >= 0x1.0p-53 || y > (1.0 - x)*0x1.0p-26) {
+            double4 z2p1 = (double4)(add(mul(add(y,x), sub(y,x)), 1.0), mul(y,x)*2.0); /* (y^2 - x^2 + 1, 2xy) as two extended pairs */
+            double4 rz2m1 = MATH_PRIVATE(epcsqrtep)(z2p1);
+            rz2m1 = (double4)(csgn(rz2m1.hi, (double2)z.x), csgn(rz2m1.lo, (double2)z.y)); /* swaps the two pairs and gives them the signs of z.x / z.y */
+            double4 s = (double4)(add(rz2m1.lo, z.x), add(rz2m1.hi, z.y));
+            l2 = add(sqr(s.lo), sqr(s.hi));
+            t = (double2)(s.s1, z.y == 0.0 ? z.y : s.s3); /* s.s1 / s.s3 are the .hi parts of the two pairs; a zero z.y is passed through with its sign */
+        } else {
+            b = false;
+            double r = MATH_FAST_SQRT(BUILTIN_FMA_F64(-x, x, 1.0)); /* sqrt(1 - x*x) */
+            l2 = con(MATH_DIV(y, r), 0.0);
+            t = (double2)(z.x, BUILTIN_COPYSIGN_F64(r, z.y));
+        }
+    } else {
+        e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x,y));
+        x = BUILTIN_FLDEXP_F64(x, -e);
+        y = BUILTIN_FLDEXP_F64(y, -e);
+        l2 = add(sqr(x), sqr(y));
+        e = 2*e + 2; /* NOTE(review): presumably 2*e undoes the squared scaling and +2 adds a factor of 4 inside the log - confirm against lnep's contract */
+        t = z;
+    }
+
+    double rr;
+    if (b) {
+        rr = 0.5 * MATH_PRIVATE(lnep)(l2, e);
+    } else {
+        rr = l2.hi;
+    }
+
+    double ri = MATH_MANGLE(atan2)(t.y, t.x);
+    /* When FINITE_ONLY_OPT() is false, an infinite component forces rr to the PINFBITPATT_DP64 value. */
+    if (!FINITE_ONLY_OPT()) {
+        rr = (BUILTIN_ISINF_F64(z.x) | BUILTIN_ISINF_F64(z.y)) ? AS_DOUBLE(PINFBITPATT_DP64) : rr;
+    }
+
+    return (double2)(rr, ri);
+}
+
diff --git a/ocml/src/cacoshF.cl b/ocml/src/cacoshF.cl
new file mode 100644
index 00000000..14c151b9
--- /dev/null
+++ b/ocml/src/cacoshF.cl
@@ -0,0 +1,64 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+
+#define FLOAT_SPECIALIZATION
+#include "ep.h"
+
+extern CONSTATTR float4 MATH_PRIVATE(epcsqrtep)(float4 z);
+extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);
+/* cacosh(z), float: returns (rr, ri). rr is 0.5f*lnep(l2, e), or l2.hi directly on the small-|z.y| path; ri is atan2(t.y, t.x) for the pair t chosen by the same path. */
+CONSTATTR float2
+MATH_MANGLE(cacosh)(float2 z)
+{
+    float x = BUILTIN_ABS_F32(z.x);
+    float y = BUILTIN_ABS_F32(z.y);
+    /* l2: two-part extended value whose .hi part is read below; t: atan2 operands; e: exponent argument passed to lnep; b: false when rr is taken straight from l2.hi */
+    float2 l2, t;
+    int e = 0;
+    bool b = true;
+    /* Three paths: general formula (both magnitudes < 2^25), x < 1 with y below both small thresholds, and the rescaled path for large magnitudes. */
+    if (x < 0x1.0p+25f && y < 0x1.0p+25f) {
+        if (x >= 1.0f || y >= 0x1.0p-24f || y > (1.0f - x)*0x1.0p-12f) {
+            float4 z2p1 = (float4)(add(mul(add(y,x), sub(y,x)), 1.0f), mul(y,x)*2.0f); /* (y^2 - x^2 + 1, 2xy) as two extended pairs */
+            float4 rz2m1 = MATH_PRIVATE(epcsqrtep)(z2p1);
+            rz2m1 = (float4)(csgn(rz2m1.hi, (float2)z.x), csgn(rz2m1.lo, (float2)z.y)); /* swaps the two pairs and gives them the signs of z.x / z.y */
+            float4 s = (float4)(add(rz2m1.lo, z.x), add(rz2m1.hi, z.y));
+            l2 = add(sqr(s.lo), sqr(s.hi));
+            t = (float2)(s.s1, z.y == 0.0f ? z.y : s.s3); /* s.s1 / s.s3 are the .hi parts of the two pairs; a zero z.y is passed through with its sign */
+        } else {
+            b = false;
+            float r = MATH_SQRT(BUILTIN_FMA_F32(-x, x, 1.0f)); /* sqrt(1 - x*x) */
+            l2 = con(MATH_DIV(y, r), 0.0f);
+            t = (float2)(z.x, BUILTIN_COPYSIGN_F32(r, z.y));
+        }
+    } else {
+        e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); /* max taken on the bit patterns of the two non-negative values */
+        x = BUILTIN_FLDEXP_F32(x, -e);
+        y = BUILTIN_FLDEXP_F32(y, -e);
+        l2 = add(sqr(x), sqr(y));
+        e = 2*e + 2; /* NOTE(review): presumably 2*e undoes the squared scaling and +2 adds a factor of 4 inside the log - confirm against lnep's contract */
+        t = z;
+    }
+
+    float rr;
+    if (b) {
+        rr = 0.5f * MATH_PRIVATE(lnep)(l2, e);
+    } else {
+        rr = l2.hi;
+    }
+
+    float ri = MATH_MANGLE(atan2)(t.y, t.x);
+    /* When FINITE_ONLY_OPT() is false, an infinite component forces rr to the PINFBITPATT_SP32 value. */
+    if (!FINITE_ONLY_OPT()) {
+        rr = (BUILTIN_ISINF_F32(z.x) | BUILTIN_ISINF_F32(z.y)) ? AS_FLOAT(PINFBITPATT_SP32) : rr;
+    }
+
+    return (float2)(rr, ri);
+}
+
diff --git a/ocml/src/casinD.cl b/ocml/src/casinD.cl
new file mode 100644
index 00000000..d0bafe12
--- /dev/null
+++ b/ocml/src/casinD.cl
@@ -0,0 +1,16 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+/* casin(z), double: evaluates a = casinh((-z.y, z.x)) and returns (a.y, -a.x). */
+CONSTATTR double2
+MATH_MANGLE(casin)(double2 z)
+{
+    double2 a = MATH_MANGLE(casinh)((double2)(-z.y, z.x));
+    return (double2)(a.y, -a.x);
+}
+
diff --git a/ocml/src/casinF.cl b/ocml/src/casinF.cl
new file mode 100644
index 00000000..1189c599
--- /dev/null
+++ b/ocml/src/casinF.cl
@@ -0,0 +1,16 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+/* casin(z), float: evaluates a = casinh((-z.y, z.x)) and returns (a.y, -a.x). */
+CONSTATTR float2
+MATH_MANGLE(casin)(float2 z)
+{
+    float2 a = MATH_MANGLE(casinh)((float2)(-z.y, z.x));
+    return (float2)(a.y, -a.x);
+}
+
diff --git a/ocml/src/casinhD.cl b/ocml/src/casinhD.cl
new file mode 100644
index 00000000..6d6b096d
--- /dev/null
+++ b/ocml/src/casinhD.cl
@@ -0,0 +1,65 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/ + +#include "mathD.h" + +#define DOUBLE_SPECIALIZATION +#include "ep.h" + +extern CONSTATTR double4 MATH_PRIVATE(epcsqrtep)(double4 z); +extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea); + +CONSTATTR double2 +MATH_MANGLE(casinh)(double2 z) +{ + double x = BUILTIN_ABS_F64(z.x); + double y = BUILTIN_ABS_F64(z.y); + + double2 l2, t; + int e = 0; + bool b = true; + + if (x < 0x1.0p+54 && y < 0x1.0p+54) { + if (y >= 1.0 || x >= 0x1.0p-53 || x > (1.0 - y)*0x1.0p-26f) { + double4 z2p1 = (double4)(add(mul(add(x,y), sub(x,y)), 1.0), mul(y,x)*2.0); + double4 rz2p1 = MATH_PRIVATE(epcsqrtep)(z2p1); + double4 s = (double4)(add(rz2p1.lo, x), add(rz2p1.hi, y)); + l2 = add(sqr(s.lo), sqr(s.hi)); + t = (double2)(s.s1, s.s3); + } else { + b = false; + double r = MATH_SQRT(BUILTIN_FMA_F64(-y, y, 1.0)); + l2 = con(MATH_DIV(x, r), 0.0); + t = (double2)(r, y); + } + } else { + t = (double2)(x, y); + e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x, y)); + x = BUILTIN_FLDEXP_F64(x, -e); + y = BUILTIN_FLDEXP_F64(y, -e); + l2 = add(sqr(x), sqr(y)); + e = 2*e + 2; + } + + double rr; + if (b) { + rr = 0.5 * MATH_PRIVATE(lnep)(l2, e); + } else { + rr = l2.hi; + } + + rr = BUILTIN_COPYSIGN_F64(rr, z.x); + double ri = BUILTIN_COPYSIGN_F64(MATH_MANGLE(atan2)(t.y, t.x), z.y); + + if (!FINITE_ONLY_OPT()) { + double i = BUILTIN_COPYSIGN_F64(AS_DOUBLE(PINFBITPATT_DP64), z.x); + rr = (BUILTIN_ISINF_F64(z.x) | BUILTIN_ISINF_F64(z.y)) ? i : rr; + } + + return (double2)(rr, ri); +} + diff --git a/ocml/src/casinhF.cl b/ocml/src/casinhF.cl new file mode 100644 index 00000000..64624329 --- /dev/null +++ b/ocml/src/casinhF.cl @@ -0,0 +1,65 @@ +/*===-------------------------------------------------------------------------- + * ROCm Device Libraries + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. 
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+
+#define FLOAT_SPECIALIZATION
+#include "ep.h"
+
+extern CONSTATTR float4 MATH_PRIVATE(epcsqrtep)(float4 z);
+extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);
+/* casinh(z), float: computed on x = |z.x|, y = |z.y|; returns (rr, ri) with the sign of z.x copied onto rr and the sign of z.y copied onto ri. */
+CONSTATTR float2
+MATH_MANGLE(casinh)(float2 z)
+{
+    float x = BUILTIN_ABS_F32(z.x);
+    float y = BUILTIN_ABS_F32(z.y);
+    /* l2: two-part extended value whose .hi part is read below; t: atan2 operands; e: exponent argument passed to lnep; b: false when rr is taken straight from l2.hi */
+    float2 l2, t;
+    int e = 0;
+    bool b = true;
+    /* Three paths: general formula (both magnitudes < 2^25), y < 1 with x below both small thresholds, and the rescaled path for large magnitudes. */
+    if (x < 0x1.0p+25f && y < 0x1.0p+25f) {
+        if (y >= 1.0f || x >= 0x1.0p-24f || x > (1.0f - y)*0x1.0p-12f) {
+            float4 z2p1 = (float4)(add(mul(add(x,y), sub(x,y)), 1.0f), mul(y,x)*2.0f); /* (x^2 - y^2 + 1, 2xy) as two extended pairs */
+            float4 rz2p1 = MATH_PRIVATE(epcsqrtep)(z2p1);
+            float4 s = (float4)(add(rz2p1.lo, x), add(rz2p1.hi, y));
+            l2 = add(sqr(s.lo), sqr(s.hi));
+            t = (float2)(s.s1, s.s3); /* .hi parts of the two pairs in s */
+        } else {
+            b = false;
+            float r = MATH_SQRT(BUILTIN_FMA_F32(-y, y, 1.0f)); /* sqrt(1 - y*y) */
+            l2 = con(MATH_DIV(x, r), 0.0f);
+            t = (float2)(r, y);
+        }
+    } else {
+        t = (float2)(x, y); /* captured before x and y are rescaled below */
+        e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); /* max taken on the bit patterns of the two non-negative values */
+        x = BUILTIN_FLDEXP_F32(x, -e);
+        y = BUILTIN_FLDEXP_F32(y, -e);
+        l2 = add(sqr(x), sqr(y));
+        e = 2*e + 2; /* NOTE(review): presumably 2*e undoes the squared scaling and +2 adds a factor of 4 inside the log - confirm against lnep's contract */
+    }
+
+    float rr;
+    if (b) {
+        rr = 0.5f * MATH_PRIVATE(lnep)(l2, e);
+    } else {
+        rr = l2.hi;
+    }
+    /* Restore the input signs on both result components. */
+    rr = BUILTIN_COPYSIGN_F32(rr, z.x);
+    float ri = BUILTIN_COPYSIGN_F32(MATH_MANGLE(atan2)(t.y, t.x), z.y);
+    /* When FINITE_ONLY_OPT() is false, an infinite component replaces rr with the PINFBITPATT_SP32 value carrying the sign of z.x. */
+    if (!FINITE_ONLY_OPT()) {
+        float i = BUILTIN_COPYSIGN_F32(AS_FLOAT(PINFBITPATT_SP32), z.x);
+        rr = (BUILTIN_ISINF_F32(z.x) | BUILTIN_ISINF_F32(z.y)) ? i : rr;
+    }
+
+    return (float2)(rr, ri);
+}
+
diff --git a/ocml/src/catanD.cl b/ocml/src/catanD.cl
new file mode 100644
index 00000000..0c3cf43c
--- /dev/null
+++ b/ocml/src/catanD.cl
@@ -0,0 +1,16 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+/* catan(z), double: evaluates a = catanh((-z.y, z.x)) and returns (a.y, -a.x). */
+CONSTATTR double2
+MATH_MANGLE(catan)(double2 z)
+{
+    double2 a = MATH_MANGLE(catanh)((double2)(-z.y, z.x));
+    return (double2)(a.y, -a.x);
+}
+
diff --git a/ocml/src/catanF.cl b/ocml/src/catanF.cl
new file mode 100644
index 00000000..55715a59
--- /dev/null
+++ b/ocml/src/catanF.cl
@@ -0,0 +1,16 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+/* catan(z), float: evaluates a = catanh((-z.y, z.x)) and returns (a.y, -a.x). */
+CONSTATTR float2
+MATH_MANGLE(catan)(float2 z)
+{
+    float2 a = MATH_MANGLE(catanh)((float2)(-z.y, z.x));
+    return (float2)(a.y, -a.x);
+}
+
diff --git a/ocml/src/catanhD.cl b/ocml/src/catanhD.cl
new file mode 100644
index 00000000..1539e12b
--- /dev/null
+++ b/ocml/src/catanhD.cl
@@ -0,0 +1,59 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+
+#define DOUBLE_SPECIALIZATION
+#include "ep.h"
+
+extern CONSTATTR double MATH_PRIVATE(lnep)(double2 a, int ea);
+/* catanh(z), double: works on x = |z.x|, y = |z.y| and copies the signs of z.x / z.y back onto (rr, ri) just before returning. */
+CONSTATTR double2
+MATH_MANGLE(catanh)(double2 z)
+{
+    double x = BUILTIN_ABS_F64(z.x);
+    double y = BUILTIN_ABS_F64(z.y);
+    double rr, ri;
+    /* Both magnitudes below 2^54: extended-precision formula. Otherwise: rescaled large-argument path with ri set to a constant. */
+    if (x < 0x1.0p+54 && y < 0x1.0p+54) {
+        double2 omx = sub(1.0, x);
+        double2 opx = add(1.0, x);
+        double2 y2 = sqr(y);
+        double2 b = sub(mul(omx, opx), y2); /* (1 - x)(1 + x) - y^2 */
+        ri = 0.5 * MATH_MANGLE(atan2)(2.0 * y, b.hi);
+        /* a is the lnep() argument; one of two forms is chosen by comparing x with d.hi / 8 */
+        double2 a;
+        double2 d = add(sqr(opx), y2); /* (1 + x)^2 + y^2 */
+        if (x < 0x1.0p-3 * d.hi) {
+            a = fsub(1.0, div(4.0*x, d));
+        } else {
+            a = div(add(sqr(omx), y2), d);
+        }
+        rr = -0.25 * MATH_PRIVATE(lnep)(a, 0);
+    } else {
+        int e = BUILTIN_FREXP_EXP_F64(BUILTIN_MAX_F64(x, y));
+        x = BUILTIN_FLDEXP_F64(x, -e);
+        y = BUILTIN_FLDEXP_F64(y, -e);
+        rr = BUILTIN_FLDEXP_F64(MATH_DIV(x, MATH_MAD(x, x, y*y)), -e); /* x / (x^2 + y^2) on the rescaled values, then scaled by 2^-e */
+        ri = 0x1.921fb54442d18p+0; /* pi/2 */
+    }
+    /* Overrides applied only when FINITE_ONLY_OPT() is false; on the large-argument path x and y have already been rescaled when these tests run. */
+    if (!FINITE_ONLY_OPT()) {
+        rr = ((x == 1.0) & (y == 0.0)) ? AS_DOUBLE(PINFBITPATT_DP64) : rr;
+        rr = x == 0.0 ? 0.0 : rr;
+        rr = BUILTIN_ISINF_F64(x) ? 0.0 : rr;
+        rr = (BUILTIN_ISNAN_F64(x) & BUILTIN_ISINF_F64(y)) ? 0.0 : rr;
+        ri = (BUILTIN_ISNAN_F64(x) & BUILTIN_ISFINITE_F64(y)) ? AS_DOUBLE(QNANBITPATT_DP64) : ri;
+        ri = BUILTIN_ISNAN_F64(y) ? y : ri;
+    }
+    /* Re-apply the input signs. */
+    rr = BUILTIN_COPYSIGN_F64(rr, z.x);
+    ri = BUILTIN_COPYSIGN_F64(ri, z.y);
+
+    return (double2)(rr, ri);
+}
+
diff --git a/ocml/src/catanhF.cl b/ocml/src/catanhF.cl
new file mode 100644
index 00000000..e0267eed
--- /dev/null
+++ b/ocml/src/catanhF.cl
@@ -0,0 +1,59 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+
+#define FLOAT_SPECIALIZATION
+#include "ep.h"
+
+extern CONSTATTR float MATH_PRIVATE(lnep)(float2 a, int ea);
+/* catanh(z), float: works on x = |z.x|, y = |z.y| and copies the signs of z.x / z.y back onto (rr, ri) just before returning. */
+CONSTATTR float2
+MATH_MANGLE(catanh)(float2 z)
+{
+    float x = BUILTIN_ABS_F32(z.x);
+    float y = BUILTIN_ABS_F32(z.y);
+    float rr, ri;
+    /* Both magnitudes below 2^25: extended-precision formula. Otherwise: rescaled large-argument path with ri set to a constant. */
+    if (x < 0x1.0p+25f && y < 0x1.0p+25f) {
+        float2 omx = sub(1.0f, x);
+        float2 opx = add(1.0f, x);
+        float2 y2 = sqr(y);
+        float2 b = sub(mul(omx, opx), y2); /* (1 - x)(1 + x) - y^2 */
+        ri = 0.5f * MATH_MANGLE(atan2)(2.0f * y, b.hi);
+        /* a is the lnep() argument; one of two forms is chosen by comparing x with d.hi / 8 */
+        float2 a;
+        float2 d = add(sqr(opx), y2); /* (1 + x)^2 + y^2 */
+        if (x < 0x1.0p-3f * d.hi) {
+            a = fsub(1.0f, div(4.0f*x, d));
+        } else {
+            a = div(add(sqr(omx), y2), d);
+        }
+        rr = -0.25f * MATH_PRIVATE(lnep)(a, 0);
+    } else {
+        int e = BUILTIN_FREXP_EXP_F32(AS_FLOAT(BUILTIN_MAX_U32(AS_UINT(x), AS_UINT(y)))); /* max taken on the bit patterns of the two non-negative values */
+        x = BUILTIN_FLDEXP_F32(x, -e);
+        y = BUILTIN_FLDEXP_F32(y, -e);
+        rr = BUILTIN_FLDEXP_F32(MATH_DIV(x, MATH_MAD(x, x, y*y)), -e); /* x / (x^2 + y^2) on the rescaled values, then scaled by 2^-e */
+        ri = 0x1.921fb6p+0f; /* pi/2 rounded to float */
+    }
+    /* Overrides applied only when FINITE_ONLY_OPT() is false; on the large-argument path x and y have already been rescaled when these tests run. */
+    if (!FINITE_ONLY_OPT()) {
+        rr = ((x == 1.0f) & (y == 0.0f)) ? AS_FLOAT(PINFBITPATT_SP32) : rr;
+        rr = x == 0.0f ? 0.0f : rr;
+        rr = BUILTIN_ISINF_F32(x) ? 0.0f : rr;
+        rr = (BUILTIN_ISNAN_F32(x) & BUILTIN_ISINF_F32(y)) ? 0.0f : rr;
+        ri = (BUILTIN_ISNAN_F32(x) & BUILTIN_ISFINITE_F32(y)) ? AS_FLOAT(QNANBITPATT_SP32) : ri;
+        ri = BUILTIN_ISNAN_F32(y) ? y : ri;
+    }
+    /* Re-apply the input signs. */
+    rr = BUILTIN_COPYSIGN_F32(rr, z.x);
+    ri = BUILTIN_COPYSIGN_F32(ri, z.y);
+
+    return (float2)(rr, ri);
+}
+
diff --git a/ocml/src/ep.h b/ocml/src/ep.h
index e3bc73bc..3313a129 100644
--- a/ocml/src/ep.h
+++ b/ocml/src/ep.h
@@ -12,7 +12,8 @@
 #define ISINF(X) BUILTIN_ISINF_F32(X)
 #define USE_FMA HAVE_FAST_FMA32()
 #define HIGH(X) AS_FLOAT(AS_UINT(X) & 0xfffff000U)
-#define COPYSIGN BUILTIN_COPYSIGN_F64
+#define SIGNBIT(X) (AS_INT(X) < 0)
+#define SAMESIGN(X,Y) ((AS_INT(X)& 0x80000000) == (AS_INT(Y) & 0x80000000))
 #endif
 
 #if defined DOUBLE_SPECIALIZATION
@@ -26,7 +27,8 @@
 #define ISINF(X) BUILTIN_ISINF_F64(X)
 #define USE_FMA true
 #define HIGH(X) AS_DOUBLE(AS_ULONG(X) & 0xfffffffff8000000UL)
-#define COPYSIGN BUILTIN_COPYSIGN_F32
+#define SIGNBIT(X) (AS_INT2(X).hi < 0)
+#define SAMESIGN(X,Y) ((AS_INT2(X).hi & 0x80000000) == (AS_INT2(Y).hi & 0x80000000))
 #endif
 
 #if defined HALF_SPECIALIZATION
@@ -40,25 +42,26 @@
 #define ISINF(X) BUILTIN_ISINF_F16(X)
 #define USE_FMA true
 #define HIGH(X) AS_HALF(AS_USHORT(X) & (ushort)0xffc0U)
-#define COPYSIGN BUILTIN_COPYSIGN_F16
+#define SIGNBIT(X) (AS_SHORT(X) < (short)0)
+#define SAMESIGN(X,Y) ((AS_USHORT(X) & (ushort)0x8000) == (AS_USHORT(Y) & (ushort)0x8000))
 #endif
 
 static ATTR T2
-con(T a, T b)
+absv(T2 a)
 {
-    return (T2)(b, a);
+    return SIGNBIT(a.hi) ? -a : a; /* negate the whole pair when its .hi part has the sign bit set */
 }
 
 static ATTR T2
-csgn(T2 a, T b)
+csgn(T2 a, T2 b)
 {
-    return con(COPYSIGN(a.hi, b), COPYSIGN(a.lo, b));
+    return SAMESIGN(a.hi, b.hi) ? a : -a; /* keep a when a.hi and b.hi share a sign bit, otherwise negate the whole pair */
 }
 
 static ATTR T2
-csgn(T2 a, T2 b)
+con(T a, T b)
 {
-    return con(COPYSIGN(a.hi, b.hi), COPYSIGN(a.lo, b.lo));
+    return (T2)(b, a); /* first argument lands in the second vector component, second argument in the first */
 }
 
 static ATTR T2
@@ -430,7 +433,7 @@ root2(T a)
     T shi = SQRT(a);
     T2 e = fsub(a, sqr(shi));
     T slo = DIV(e.hi, (T)2 * shi);
-    return fadd(shi, slo);
+    return fadd(shi, a == (T)0 ? (T)0 : slo); /* for a == 0 the correction term is replaced by 0; slo divides by 2*shi, presumably 0/0 in that case */
 }
 
 static ATTR T2
@@ -439,7 +442,7 @@ root2(T2 a)
     T shi = SQRT(a.hi);
     T2 e = fsub(a, sqr(shi));
     T slo = DIV(e.hi, (T)2 * shi);
-    return fadd(shi, slo);
+    return fadd(shi, a.hi == (T)0 ? (T)0 : slo); /* for a.hi == 0 the correction term is replaced by 0; slo divides by 2*shi, presumably 0/0 in that case */
 }
 
 #undef ATTR
@@ -454,4 +457,6 @@ root2(T2 a)
 #undef USE_FMA
 #undef HIGH
 #undef COPYSIGN
+#undef SIGNBIT
+#undef SAMESIGN /* NOTE(review): the COPYSIGN #undef kept above no longer has a matching #define in the hunks shown here - harmless, but looks stale */
 
diff --git a/ocml/src/epcsqrtepD.cl b/ocml/src/epcsqrtepD.cl
new file mode 100644
index 00000000..ce95a7f9
--- /dev/null
+++ b/ocml/src/epcsqrtepD.cl
@@ -0,0 +1,26 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathD.h"
+
+#define DOUBLE_SPECIALIZATION
+#include "ep.h"
+/* epcsqrtep(z), double: z.lo and z.hi are read as two extended pairs x and y, and two pairs (s, t) are returned. With u = sqrt((sqrt(x^2 + y^2) + |x|)/2) and v = |y/(2u)|, the result is (u, v) when x.hi >= 0 and (v, u) otherwise, with t given the sign of y - presumably the complex square root of x + i*y (confirm). */
+CONSTATTR double4
+MATH_PRIVATE(epcsqrtep)(double4 z)
+{
+    double2 x = z.lo;
+    double2 y = z.hi;
+    double2 u = root2(fadd(root2(add(sqr(x), sqr(y))), absv(x)) * 0.5);
+    double2 v = absv(fdiv(y, u) * 0.5);
+    v = ((y.hi == 0.0) & (u.hi == 0.0)) ? y : v; /* when y.hi and u.hi are both zero, v is replaced by y (the fdiv above is then a division by zero) */
+    bool b = x.hi >= 0.0;
+    double2 s = b ? u : v;
+    double2 t = csgn(b ? v : u, y);
+    return (double4)(s, t);
+}
+
diff --git a/ocml/src/epcsqrtepF.cl b/ocml/src/epcsqrtepF.cl
new file mode 100644
index 00000000..d8dcbd35
--- /dev/null
+++ b/ocml/src/epcsqrtepF.cl
@@ -0,0 +1,26 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "mathF.h"
+
+#define FLOAT_SPECIALIZATION
+#include "ep.h"
+/* epcsqrtep(z), float: z.lo and z.hi are read as two extended pairs x and y, and two pairs (s, t) are returned. With u = sqrt((sqrt(x^2 + y^2) + |x|)/2) and v = |y/(2u)|, the result is (u, v) when x.hi >= 0 and (v, u) otherwise, with t given the sign of y - presumably the complex square root of x + i*y (confirm). */
+CONSTATTR float4
+MATH_PRIVATE(epcsqrtep)(float4 z)
+{
+    float2 x = z.lo;
+    float2 y = z.hi;
+    float2 u = root2(fadd(root2(add(sqr(x), sqr(y))), absv(x)) * 0.5f);
+    float2 v = absv(fdiv(y, u) * 0.5f);
+    v = ((y.hi == 0.0f) & (u.hi == 0.0f)) ? y : v; /* when y.hi and u.hi are both zero, v is replaced by y (the fdiv above is then a division by zero) */
+    bool b = x.hi >= 0.0f;
+    float2 s = b ? u : v;
+    float2 t = csgn(b ? v : u, y);
+    return (float4)(s, t);
+}
+
diff --git a/ocml/src/hypotD.cl b/ocml/src/hypotD.cl
index 8c4f7db5..dffa6b70 100644
--- a/ocml/src/hypotD.cl
+++ b/ocml/src/hypotD.cl
@@ -17,7 +17,6 @@ MATH_MANGLE(hypot)(double x, double y)
     a = BUILTIN_FLDEXP_F64(a, -e);
     b = BUILTIN_FLDEXP_F64(b, -e);
     double ret = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(MATH_MAD(a, a, b*b)), e);
-    ret = t == 0.0 ? 0.0 : ret;
 
     if (!FINITE_ONLY_OPT()) {
         ret = BUILTIN_ISNAN_F64(x) |
diff --git a/ocml/src/len3D.cl b/ocml/src/len3D.cl
index 3a95b261..dbe747b8 100644
--- a/ocml/src/len3D.cl
+++ b/ocml/src/len3D.cl
@@ -29,7 +29,6 @@ MATH_MANGLE(len3)(double x, double y, double z)
     c = BUILTIN_FLDEXP_F64(c, -e);
 
     double ret = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(MATH_MAD(a, a, MATH_MAD(b, b, c*c))), e);
-    ret = a == 0.0 ? 0.0 : ret;
 
     if (!FINITE_ONLY_OPT()) {
         ret = (BUILTIN_ISNAN_F64(x) |
diff --git a/ocml/src/len4D.cl b/ocml/src/len4D.cl
index 4047a729..71c559a5 100644
--- a/ocml/src/len4D.cl
+++ b/ocml/src/len4D.cl
@@ -37,7 +37,6 @@ MATH_MANGLE(len4)(double x, double y, double z, double w)
     d = BUILTIN_FLDEXP_F64(d, -e);
 
     double ret = BUILTIN_FLDEXP_F64(MATH_FAST_SQRT(MATH_MAD(a, a, MATH_MAD(b, b, MATH_MAD(c, c, d*d)))), e);
-    ret = a == 0.0 ? 0.0 : ret;
 
     if (!FINITE_ONLY_OPT()) {
         ret = (BUILTIN_ISNAN_F64(x) | BUILTIN_ISNAN_F64(y) |
diff --git a/ocml/src/privD.h b/ocml/src/privD.h
index ff6cad46..d73bfc10 100644
--- a/ocml/src/privD.h
+++ b/ocml/src/privD.h
@@ -57,6 +57,7 @@
     double _fsqrt_s1 = BUILTIN_FMA_F64(_fsqrt_s0, _fsqrt_r0, _fsqrt_s0); \
     double _fsqrt_d0 = BUILTIN_FMA_F64(-_fsqrt_s1, _fsqrt_s1, _fsqrt_x); \
     double _fsqrt_ret = BUILTIN_FMA_F64(_fsqrt_d0, _fsqrt_h1, _fsqrt_s1); \
+    _fsqrt_ret = _fsqrt_x == 0.0 ? _fsqrt_x : _fsqrt_ret; \
     _fsqrt_ret; \
 })
 