summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2015-05-12 17:18:46 +0000
committerTom Stellard <thomas.stellard@amd.com>2015-05-12 17:18:46 +0000
commit6657f6da7f8508146c64b031a1478db056b15c94 (patch)
treefda3c0d89e9c130bfdf50ff94430dcef836b6aba
parent16b7abe342e5aa5c54e411ab3edefe493ccfc8a5 (diff)
Implement cos for double types
This implementation was ported from the AMD builtin library and has been tested with piglit, OpenCV, and the ocl conformance tests. git-svn-id: https://llvm.org/svn/llvm-project/libclc/trunk@237154 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--generic/lib/math/cos.cl24
-rw-r--r--generic/lib/math/sincos_helpers.cl241
-rw-r--r--generic/lib/math/sincos_helpers.h10
-rw-r--r--generic/lib/math/tables.cl20
-rw-r--r--generic/lib/math/tables.h1
5 files changed, 289 insertions, 7 deletions
diff --git a/generic/lib/math/cos.cl b/generic/lib/math/cos.cl
index cbf7d59..157447f 100644
--- a/generic/lib/math/cos.cl
+++ b/generic/lib/math/cos.cl
@@ -52,14 +52,24 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cos, float);
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-#define __CLC_FUNCTION __clc_cos_intrinsic
-#define __CLC_INTRINSIC "llvm.cos"
-#include <clc/math/unary_intrin.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-
_CLC_OVERLOAD _CLC_DEF double cos(double x) {
- return __clc_cos_intrinsic(x);
+ x = fabs(x);
+
+ double r, rr;
+ int regn;
+
+ if (x < 0x1.0p+47)
+ __clc_remainder_piby2_medium(x, &r, &rr, &regn);
+ else
+ __clc_remainder_piby2_large(x, &r, &rr, &regn);
+
+ double2 sc = __clc_sincos_piby4(r, rr);
+ sc.lo = -sc.lo;
+
+ int2 c = as_int2(regn & 1 ? sc.lo : sc.hi);
+ c.hi ^= (regn > 1) << 31;
+
+ return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c);
}
_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double);
diff --git a/generic/lib/math/sincos_helpers.cl b/generic/lib/math/sincos_helpers.cl
index 8619b34..251b7f9 100644
--- a/generic/lib/math/sincos_helpers.cl
+++ b/generic/lib/math/sincos_helpers.cl
@@ -23,11 +23,15 @@
#include <clc/clc.h>
#include "math.h"
+#include "tables.h"
#include "sincos_helpers.h"
#define bitalign(hi, lo, shift) \
((hi) << (32 - (shift))) | ((lo) >> (shift));
+#define bytealign(src0, src1, src2) \
+ ((uint) (((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3)*8)))
+
_CLC_DEF float __clc_sinf_piby4(float x, float y) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
// = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
@@ -302,3 +306,240 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x)
return __clc_argReductionLargeS(r, rr, x);
}
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+// Reduction for medium sized arguments
+_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn) {
+ // How many pi/2 is x a multiple of?
+ const double two_by_pi = 0x1.45f306dc9c883p-1;
+ double dnpi2 = trunc(fma(x, two_by_pi, 0.5));
+
+ const double piby2_h = -7074237752028440.0 / 0x1.0p+52;
+ const double piby2_m = -2483878800010755.0 / 0x1.0p+105;
+ const double piby2_t = -3956492004828932.0 / 0x1.0p+158;
+
+ // Compute product of npi2 with 159 bits of 2/pi
+ double p_hh = piby2_h * dnpi2;
+ double p_ht = fma(piby2_h, dnpi2, -p_hh);
+ double p_mh = piby2_m * dnpi2;
+ double p_mt = fma(piby2_m, dnpi2, -p_mh);
+ double p_th = piby2_t * dnpi2;
+ double p_tt = fma(piby2_t, dnpi2, -p_th);
+
+ // Reduce to 159 bits
+ double ph = p_hh;
+ double pm = p_ht + p_mh;
+ double t = p_mh - (pm - p_ht);
+ double pt = p_th + t + p_mt + p_tt;
+ t = ph + pm; pm = pm - (t - ph); ph = t;
+ t = pm + pt; pt = pt - (t - pm); pm = t;
+
+ // Subtract from x
+ t = x + ph;
+ double qh = t + pm;
+ double qt = pm - (qh - t) + pt;
+
+ *r = qh;
+ *rr = qt;
+ *regn = (int)(long)dnpi2 & 0x3;
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+
+_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn) {
+
+ long ux = as_long(x);
+ int e = (int)(ux >> 52) - 1023;
+ int i = max(23, (e >> 3) + 17);
+ int j = 150 - i;
+ int j16 = j & ~0xf;
+ double fract_temp;
+
+ // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+ uint4 q0 = USE_TABLE(pibits_tbl, j16);
+ uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16));
+ uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32));
+
+ int k = (j >> 2) & 0x3;
+ int4 c = (int4)k == (int4)(0, 1, 2, 3);
+
+ uint u0, u1, u2, u3, u4, u5, u6;
+
+ u0 = c.s1 ? q0.s1 : q0.s0;
+ u0 = c.s2 ? q0.s2 : u0;
+ u0 = c.s3 ? q0.s3 : u0;
+
+ u1 = c.s1 ? q0.s2 : q0.s1;
+ u1 = c.s2 ? q0.s3 : u1;
+ u1 = c.s3 ? q1.s0 : u1;
+
+ u2 = c.s1 ? q0.s3 : q0.s2;
+ u2 = c.s2 ? q1.s0 : u2;
+ u2 = c.s3 ? q1.s1 : u2;
+
+ u3 = c.s1 ? q1.s0 : q0.s3;
+ u3 = c.s2 ? q1.s1 : u3;
+ u3 = c.s3 ? q1.s2 : u3;
+
+ u4 = c.s1 ? q1.s1 : q1.s0;
+ u4 = c.s2 ? q1.s2 : u4;
+ u4 = c.s3 ? q1.s3 : u4;
+
+ u5 = c.s1 ? q1.s2 : q1.s1;
+ u5 = c.s2 ? q1.s3 : u5;
+ u5 = c.s3 ? q2.s0 : u5;
+
+ u6 = c.s1 ? q1.s3 : q1.s2;
+ u6 = c.s2 ? q2.s0 : u6;
+ u6 = c.s3 ? q2.s1 : u6;
+
+ uint v0 = bytealign(u1, u0, j);
+ uint v1 = bytealign(u2, u1, j);
+ uint v2 = bytealign(u3, u2, j);
+ uint v3 = bytealign(u4, u3, j);
+ uint v4 = bytealign(u5, u4, j);
+ uint v5 = bytealign(u6, u5, j);
+
+ // Place those 192 bits in 4 48-bit doubles along with correct exponent
+ // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+ i = 2 + 8*i;
+ x *= i > 1018 ? 0x1.0p-136 : 1.0;
+ i -= i > 1018 ? 136 : 0;
+
+ uint ua = (uint)(1023 + 52 - i) << 20;
+ double a = as_double((uint2)(0, ua));
+ double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a;
+ ua += 0x03000000U;
+ a = as_double((uint2)(0, ua));
+ double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a;
+
+ // Exact multiply
+ double f0h = p0 * x;
+ double f0l = fma(p0, x, -f0h);
+ double f1h = p1 * x;
+ double f1l = fma(p1, x, -f1h);
+ double f2h = p2 * x;
+ double f2l = fma(p2, x, -f2h);
+ double f3h = p3 * x;
+ double f3l = fma(p3, x, -f3h);
+
+ // Accumulate product into 4 doubles
+ double s, t;
+
+ double f3 = f3h + f2h;
+ t = f2h - (f3 - f3h);
+ s = f3l + t;
+ t = t - (s - f3l);
+
+ double f2 = s + f1h;
+ t = f1h - (f2 - s) + t;
+ s = f2l + t;
+ t = t - (s - f2l);
+
+ double f1 = s + f0h;
+ t = f0h - (f1 - s) + t;
+ s = f1l + t;
+
+ double f0 = s + f0l;
+
+ // Strip off unwanted large integer bits
+ f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp);
+ f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0;
+
+ // Compute least significant integer bits
+ t = f3 + f2;
+ double di = t - fract(t, &fract_temp);
+ i = (float)di;
+
+ // Shift out remaining integer part
+ f3 -= di;
+ s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+ s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+ f1 += f0;
+
+ // Subtract 1 if fraction is >= 0.5, and update regn
+ int g = f3 >= 0.5;
+ i += g;
+ f3 -= (float)g;
+
+ // Shift up bits
+ s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+ // Multiply precise fraction by pi/2 to get radians
+ const double p2h = 7074237752028440.0 / 0x1.0p+52;
+ const double p2t = 4967757600021510.0 / 0x1.0p+106;
+
+ double rhi = f3 * p2h;
+ double rlo = fma(f2, p2h, fma(f3, p2t, fma(f3, p2h, -rhi)));
+
+ *r = rhi + rlo;
+ *rr = rlo - (*r - rhi);
+ *regn = i & 0x3;
+}
+
+
+_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) {
+ // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+ // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+ // = x * f(w)
+ // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+ // We use a minimax approximation of (f(w) - 1) / w
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we add a correction
+ // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+ // is an approximation to cos(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+ // = f(w)
+ // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+ // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+ // because this produces an expansion in even powers of x.
+ // If xx (the tail of x) is non-zero, we subtract a correction
+ // term g(x,xx) = x*xx to the result, where g(x,xx)
+ // is an approximation to sin(x)*sin(xx) valid because
+ // xx is tiny relative to x.
+
+ const double sc1 = -0.166666666666666646259241729;
+ const double sc2 = 0.833333333333095043065222816e-2;
+ const double sc3 = -0.19841269836761125688538679e-3;
+ const double sc4 = 0.275573161037288022676895908448e-5;
+ const double sc5 = -0.25051132068021699772257377197e-7;
+ const double sc6 = 0.159181443044859136852668200e-9;
+
+ const double cc1 = 0.41666666666666665390037e-1;
+ const double cc2 = -0.13888888888887398280412e-2;
+ const double cc3 = 0.248015872987670414957399e-4;
+ const double cc4 = -0.275573172723441909470836e-6;
+ const double cc5 = 0.208761463822329611076335e-8;
+ const double cc6 = -0.113826398067944859590880e-10;
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ double r = 0.5 * x2;
+ double t = 1.0 - r;
+
+ double sp = fma(fma(fma(fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2);
+
+ double cp = t + fma(fma(fma(fma(fma(fma(cc6, x2, cc5), x2, cc4), x2, cc3), x2, cc2), x2, cc1),
+ x2*x2, fma(x, xx, (1.0 - t) - r));
+
+ double2 ret;
+ ret.lo = x - fma(-x3, sc1, fma(fma(-x3, sp, 0.5*xx), x2, -xx));
+ ret.hi = cp;
+
+ return ret;
+}
+
+#endif
diff --git a/generic/lib/math/sincos_helpers.h b/generic/lib/math/sincos_helpers.h
index f936d66..2565d44 100644
--- a/generic/lib/math/sincos_helpers.h
+++ b/generic/lib/math/sincos_helpers.h
@@ -23,3 +23,13 @@
_CLC_DECL float __clc_sinf_piby4(float x, float y);
_CLC_DECL float __clc_cosf_piby4(float x, float y);
_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x);
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, int *regn);
+_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, int *regn);
+_CLC_DECL double2 __clc_sincos_piby4(double x, double xx);
+
+#endif
diff --git a/generic/lib/math/tables.cl b/generic/lib/math/tables.cl
index f22a6af..090e64a 100644
--- a/generic/lib/math/tables.cl
+++ b/generic/lib/math/tables.cl
@@ -288,9 +288,29 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = {
0x1.000000p+0f,
};
+
+DECLARE_TABLE(uchar, PIBITS_TBL, ) = {
+ 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+ 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+ 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+ 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+ 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+ 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+ 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+ 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+ 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+ 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+ 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+ 230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+};
+
TABLE_FUNCTION(float2, LOGE_TBL, loge_tbl);
TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
+uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) {
+ return *(__constant uint4 *)(PIBITS_TBL + idx);
+}
+
#ifdef cl_khr_fp64
DECLARE_TABLE(double2, LN_TBL, 65) = {
diff --git a/generic/lib/math/tables.h b/generic/lib/math/tables.h
index 1e82901..d09adf1 100644
--- a/generic/lib/math/tables.h
+++ b/generic/lib/math/tables.h
@@ -40,6 +40,7 @@
TABLE_FUNCTION_DECL(float2, loge_tbl);
TABLE_FUNCTION_DECL(float, log_inv_tbl);
+TABLE_FUNCTION_DECL(uint4, pibits_tbl);
#ifdef cl_khr_fp64