diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2015-05-12 17:18:46 +0000 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2015-05-12 17:18:46 +0000 |
commit | 6657f6da7f8508146c64b031a1478db056b15c94 (patch) | |
tree | fda3c0d89e9c130bfdf50ff94430dcef836b6aba | |
parent | 16b7abe342e5aa5c54e411ab3edefe493ccfc8a5 (diff) |
Implement cos for double types
This implementation was ported from the AMD builtin library
and has been tested with piglit, OpenCV, and the ocl conformance tests.
git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@237154 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | generic/lib/math/cos.cl | 24 | ||||
-rw-r--r-- | generic/lib/math/sincos_helpers.cl | 241 | ||||
-rw-r--r-- | generic/lib/math/sincos_helpers.h | 10 | ||||
-rw-r--r-- | generic/lib/math/tables.cl | 20 | ||||
-rw-r--r-- | generic/lib/math/tables.h | 1 |
5 files changed, 289 insertions, 7 deletions
diff --git a/generic/lib/math/cos.cl b/generic/lib/math/cos.cl index cbf7d59..157447f 100644 --- a/generic/lib/math/cos.cl +++ b/generic/lib/math/cos.cl @@ -52,14 +52,24 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float); #pragma OPENCL EXTENSION cl_khr_fp64 : enable -#define __CLC_FUNCTION __clc_cos_intrinsic -#define __CLC_INTRINSIC "llvm.cos" -#include <clc/math/unary_intrin.inc> -#undef __CLC_FUNCTION -#undef __CLC_INTRINSIC - _CLC_OVERLOAD _CLC_DEF double cos(double x) { - return __clc_cos_intrinsic(x); + x = fabs(x); + + double r, rr; + int regn; + + if (x < 0x1.0p+47) + __clc_remainder_piby2_medium(x, &r, &rr, ®n); + else + __clc_remainder_piby2_large(x, &r, &rr, ®n); + + double2 sc = __clc_sincos_piby4(r, rr); + sc.lo = -sc.lo; + + int2 c = as_int2(regn & 1 ? sc.lo : sc.hi); + c.hi ^= (regn > 1) << 31; + + return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c); } _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double); diff --git a/generic/lib/math/sincos_helpers.cl b/generic/lib/math/sincos_helpers.cl index 8619b34..251b7f9 100644 --- a/generic/lib/math/sincos_helpers.cl +++ b/generic/lib/math/sincos_helpers.cl @@ -23,11 +23,15 @@ #include <clc/clc.h> #include "math.h" +#include "tables.h" #include "sincos_helpers.h" #define bitalign(hi, lo, shift) \ ((hi) << (32 - (shift))) | ((lo) >> (shift)); +#define bytealign(src0, src1, src2) \ + ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8))) + _CLC_DEF float __clc_sinf_piby4(float x, float y) { // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... @@ -302,3 +306,240 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) return __clc_argReductionLargeS(r, rr, x); } +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +// Reduction for medium sized arguments +_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) { + // How many pi/2 is x a multiple of? + const double two_by_pi = 0x1.45f306dc9c883p-1; + double dnpi2 = trunc(fma(x, two_by_pi, 0.5)); + + const double piby2_h = -7074237752028440.0 / 0x1.0p+52; + const double piby2_m = -2483878800010755.0 / 0x1.0p+105; + const double piby2_t = -3956492004828932.0 / 0x1.0p+158; + + // Compute product of npi2 with 159 bits of 2/pi + double p_hh = piby2_h * dnpi2; + double p_ht = fma(piby2_h, dnpi2, -p_hh); + double p_mh = piby2_m * dnpi2; + double p_mt = fma(piby2_m, dnpi2, -p_mh); + double p_th = piby2_t * dnpi2; + double p_tt = fma(piby2_t, dnpi2, -p_th); + + // Reduce to 159 bits + double ph = p_hh; + double pm = p_ht + p_mh; + double t = p_mh - (pm - p_ht); + double pt = p_th + t + p_mt + p_tt; + t = ph + pm; pm = pm - (t - ph); ph = t; + t = pm + pt; pt = pt - (t - pm); pm = t; + + // Subtract from x + t = x + ph; + double qh = t + pm; + double qt = pm - (qh - t) + pt; + + *r = qh; + *rr = qt; + *regn = (int)(long)dnpi2 & 0x3; +} + +// Given positive argument x, reduce it to the range [-pi/4,pi/4] using +// extra precision, and return the result in r, rr. +// Return value "regn" tells how many lots of pi/2 were subtracted +// from x to put it in the range [-pi/4,pi/4], mod 4. + +_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) { + + long ux = as_long(x); + int e = (int)(ux >> 52) - 1023; + int i = max(23, (e >> 3) + 17); + int j = 150 - i; + int j16 = j & ~0xf; + double fract_temp; + + // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary + uint4 q0 = USE_TABLE(pibits_tbl, j16); + uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); + uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); + + int k = (j >> 2) & 0x3; + int4 c = (int4)k == (int4)(0, 1, 2, 3); + + uint u0, u1, u2, u3, u4, u5, u6; + + u0 = c.s1 ? q0.s1 : q0.s0; + u0 = c.s2 ? q0.s2 : u0; + u0 = c.s3 ? q0.s3 : u0; + + u1 = c.s1 ? q0.s2 : q0.s1; + u1 = c.s2 ? q0.s3 : u1; + u1 = c.s3 ? q1.s0 : u1; + + u2 = c.s1 ? q0.s3 : q0.s2; + u2 = c.s2 ? q1.s0 : u2; + u2 = c.s3 ? q1.s1 : u2; + + u3 = c.s1 ? q1.s0 : q0.s3; + u3 = c.s2 ? q1.s1 : u3; + u3 = c.s3 ? q1.s2 : u3; + + u4 = c.s1 ? q1.s1 : q1.s0; + u4 = c.s2 ? q1.s2 : u4; + u4 = c.s3 ? q1.s3 : u4; + + u5 = c.s1 ? q1.s2 : q1.s1; + u5 = c.s2 ? q1.s3 : u5; + u5 = c.s3 ? q2.s0 : u5; + + u6 = c.s1 ? q1.s3 : q1.s2; + u6 = c.s2 ? q2.s0 : u6; + u6 = c.s3 ? q2.s1 : u6; + + uint v0 = bytealign(u1, u0, j); + uint v1 = bytealign(u2, u1, j); + uint v2 = bytealign(u3, u2, j); + uint v3 = bytealign(u4, u3, j); + uint v4 = bytealign(u5, u4, j); + uint v5 = bytealign(u6, u5, j); + + // Place those 192 bits in 4 48-bit doubles along with correct exponent + // If i > 1018 we would get subnormals so we scale p up and x down to get the same product + i = 2 + 8*i; + x *= i > 1018 ? 0x1.0p-136 : 1.0; + i -= i > 1018 ? 136 : 0; + + uint ua = (uint)(1023 + 52 - i) << 20; + double a = as_double((uint2)(0, ua)); + double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; + ua += 0x03000000U; + a = as_double((uint2)(0, ua)); + double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; + + // Exact multiply + double f0h = p0 * x; + double f0l = fma(p0, x, -f0h); + double f1h = p1 * x; + double f1l = fma(p1, x, -f1h); + double f2h = p2 * x; + double f2l = fma(p2, x, -f2h); + double f3h = p3 * x; + double f3l = fma(p3, x, -f3h); + + // Accumulate product into 4 doubles + double s, t; + + double f3 = f3h + f2h; + t = f2h - (f3 - f3h); + s = f3l + t; + t = t - (s - f3l); + + double f2 = s + f1h; + t = f1h - (f2 - s) + t; + s = f2l + t; + t = t - (s - f2l); + + double f1 = s + f0h; + t = f0h - (f1 - s) + t; + s = f1l + t; + + double f0 = s + f0l; + + // Strip off unwanted large integer bits + f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); + f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; + + // Compute least significant integer bits + t = f3 + f2; + double di = t - fract(t, &fract_temp); + i = (float)di; + + // Shift out remaining integer part + f3 -= di; + s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t; + s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t; + f1 += f0; + + // Subtract 1 if fraction is >= 0.5, and update regn + int g = f3 >= 0.5; + i += g; + f3 -= (float)g; + + // Shift up bits + s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1; + + // Multiply precise fraction by pi/2 to get radians + const double p2h = 7074237752028440.0 / 0x1.0p+52; + const double p2t = 4967757600021510.0 / 0x1.0p+106; + + double rhi = f3 * p2h; + double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi))); + + *r = rhi + rlo; + *rr = rlo - (*r - rhi); + *regn = i & 0x3; +} + + +_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) { + // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... + // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... + // = x * f(w) + // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... + // We use a minimax approximation of (f(w) - 1) / w + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we add a correction + // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) + // is an approximation to cos(x)*sin(xx) valid because + // xx is tiny relative to x. + + // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... + // = f(w) + // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... + // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) + // because this produces an expansion in even powers of x. + // If xx (the tail of x) is non-zero, we subtract a correction + // term g(x,xx) = x*xx to the result, where g(x,xx) + // is an approximation to sin(x)*sin(xx) valid because + // xx is tiny relative to x. + + const double sc1 = -0.166666666666666646259241729; + const double sc2 = 0.833333333333095043065222816e-2; + const double sc3 = -0.19841269836761125688538679e-3; + const double sc4 = 0.275573161037288022676895908448e-5; + const double sc5 = -0.25051132068021699772257377197e-7; + const double sc6 = 0.159181443044859136852668200e-9; + + const double cc1 = 0.41666666666666665390037e-1; + const double cc2 = -0.13888888888887398280412e-2; + const double cc3 = 0.248015872987670414957399e-4; + const double cc4 = -0.275573172723441909470836e-6; + const double cc5 = 0.208761463822329611076335e-8; + const double cc6 = -0.113826398067944859590880e-10; + + double x2 = x * x; + double x3 = x2 * x; + double r = 0.5 * x2; + double t = 1.0 - r; + + double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); + + double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1), + x2*x2, fma(x, xx, (1.0 - t) - r)); + + double2 ret; + ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx)); + ret.hi = cp; + + return ret; +} + +#endif diff --git a/generic/lib/math/sincos_helpers.h b/generic/lib/math/sincos_helpers.h index f936d66..2565d44 100644 --- a/generic/lib/math/sincos_helpers.h +++ b/generic/lib/math/sincos_helpers.h @@ -23,3 +23,13 @@ _CLC_DECL float __clc_sinf_piby4(float x, float y); _CLC_DECL float __clc_cosf_piby4(float x, float y); _CLC_DECL int __clc_argReductionS(float *r, float *rr, float x); + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn); +_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn); +_CLC_DECL double2 __clc_sincos_piby4(double x, double xx); + +#endif diff --git a/generic/lib/math/tables.cl b/generic/lib/math/tables.cl index f22a6af..090e64a 100644 --- a/generic/lib/math/tables.cl +++ b/generic/lib/math/tables.cl @@ -288,9 +288,29 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = { 0x1.000000p+0f, }; + +DECLARE_TABLE(uchar, PIBITS_TBL, ) = { + 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, + 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, + 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, + 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188, + 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247, + 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249, + 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8, + 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39, + 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21, + 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109, + 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, + 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 +}; + TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl); TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); +uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) { + return *(__constant uint4 *)(PIBITS_TBL + idx); +} + #ifdef cl_khr_fp64 DECLARE_TABLE(double2, LN_TBL, 65) = { diff --git a/generic/lib/math/tables.h b/generic/lib/math/tables.h index 1e82901..d09adf1 100644 --- a/generic/lib/math/tables.h +++ b/generic/lib/math/tables.h @@ -40,6 +40,7 @@ TABLE_FUNCTION_DECL(float2, loge_tbl); TABLE_FUNCTION_DECL(float, log_inv_tbl); +TABLE_FUNCTION_DECL(uint4, pibits_tbl); #ifdef cl_khr_fp64 |