libocl: Improve precision of pow/powr.

pow: When splitting a float into two floats. normally, we use 0xfffff000 as the mask. This leaves 12bit effective mantissa in high bits. after some calculation, it seems lost some bit in high bits. so I change the mask to 0xffffe000, which only leave 11bit mantissa in the high bits. Then the precision can meet OpenCL Spec requirement. powr: powr() defined different edge case behavior in OpenCL Spec 7.5 Signed-off-by: Ruiling Song <ruiling.song@intel.com> Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
author: Ruiling Song <ruiling.song@intel.com> 2014-12-22 10:00:38 +0800
committer: Zhigang Gong <zhigang.gong@intel.com> 2015-01-09 14:12:58 +0800
commit: 00d547eb4a2a114b8ebf1966650c61f3995bbec4 (patch)
tree: 0d14be23de81ccbb79f6d9a640a2246fab864017 /backend/src/libocl
parent: ca7a44c93ae2dab020a78d76c4f8f9b616a2dfc5 (diff)
1 files changed, 67 insertions, 14 deletions
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index c0b20766..4d05c07b 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -1926,7 +1926,6 @@ OVERLOADABLE float __gen_ocl_internal_round(float x) {
   return y;
 }
 OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); }
 OVERLOADABLE float __gen_ocl_internal_rint(float x) {
   return __gen_ocl_rnde(x);
 }
@@ -3016,6 +3015,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
   }
    /* y==zero: x**0 = 1 */
   if(iy==0) return one;
+  /* pow(+1, y) returns 1 for any y, even a NAN */
   if(hx==0x3f800000) return one;
   /* +-NaN return x+y */
   if(ix > 0x7f800000 || iy > 0x7f800000)
@@ -3115,6 +3115,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
     t_l = ax - (t_h-bp[k]);
     s_l = v*((u-s_h*t_h)-s_h*t_l);
+
     /* compute log(ax) */
     s2 = s*s;
     r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
@@ -3122,7 +3123,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     s2  = s_h*s_h;
     t_h = 3.0f+s2+r;
     GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
     t_l = r-((t_h-3.0f)-s2);
     /* u+v = s*(1+...) */
     u = s_h*t_h;
@@ -3130,7 +3131,7 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     /* 2/(3log2)*(s+...) */
     p_h = u+v;
     GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
     p_l = v-(p_h-u);
     z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
     z_l = cp_l*p_h+p_l*cp+dp_l[k];
@@ -3138,13 +3139,13 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
     t = (float)n;
     t1 = (((z_h+z_l)+dp_h[k])+t);
     GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
     t2 = z_l-(((t1-t)-dp_h[k])-z_h);
   }
 
   /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
   GEN_OCL_GET_FLOAT_WORD(is,y);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
   p_l = (y-y1)*t1+y*t2;
   p_h = y1*t1;
   z = p_l+p_h;
@@ -3320,6 +3321,54 @@ OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
 OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
 #undef BODY
 
+OVERLOADABLE float powr(float x, float y) {
+  unsigned int hx, sx, hy, sy;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_pow(x,y);
+  else {
+    if (isnan(x) || isnan(y)) return NAN;
+    GEN_OCL_GET_FLOAT_WORD(hx,x);
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    sx = (hx & 0x80000000) >> 31;
+    sy = (hy & 0x80000000) >> 31;
+
+    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
+      x = 0.0f;/* Gen does not support subnormal number now */
+      hx = hx &0x80000000;
+    }
+    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
+      y = 0.0;/* Gen does not support subnormal number now */
+      hy = hy &0x80000000;
+    }
+
+    // (x < 0) ** y = NAN (y!=0)
+    if ((sx && (hx & 0x7fffffff))) return NAN;
+
+    // +/-0 ** +/-0 = NAN
+    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
+
+    // +inf ** +/-0 = NAN
+    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
+
+    // others except nan/inf/0 ** 0 = 1.0
+    if (!(hy&0x7fffffff)) return 1.0f;
+
+    // +1 ** inf = NAN; +1 ** finite = 1;
+    if (hx == 0x3f800000) {
+      return isinf(y) ? NAN : 1.0f;
+    }
+
+    if ( !(hx & 0x7fffffff)) {
+        // +/-0 ** y<0 = +inf
+        // +/-0 ** y>0 = +0
+      return sy ? INFINITY : 0.0f;
+    }
+
+    return __gen_ocl_internal_pow(x,y);
+  }
+}
+
 OVERLOADABLE float pown(float x, int n) {
   if (x == 0.f && n == 0)
     return 1.f;
@@ -3329,15 +3378,19 @@ OVERLOADABLE float pown(float x, int n) {
 }
 
 OVERLOADABLE float pow(float x, float y) {
-  int n;
-  if (x == 0.f && y == 0.f)
-    return 1.f;
-  if (x >= 0.f)
-    return powr(x, y);
-  n = y;
-  if ((float)n == y)//is exact integer
-    return pown(x, n);
-  return NAN;
+  if (!__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_pow(x,y);
+  else {
+    int n;
+    if (x == 0.f && y == 0.f)
+      return 1.f;
+    if (x >= 0.f)
+      return powr(x, y);
+    n = y;
+    if ((float)n == y)//is exact integer
+      return pown(x, n);
+    return NAN;
+  }
 }
 
 OVERLOADABLE float rootn(float x, int n) {
author	Ruiling Song <ruiling.song@intel.com>	2014-12-22 10:00:38 +0800
committer	Zhigang Gong <zhigang.gong@intel.com>	2015-01-09 14:12:58 +0800
commit	00d547eb4a2a114b8ebf1966650c61f3995bbec4 (patch)
tree	0d14be23de81ccbb79f6d9a640a2246fab864017 /backend/src/libocl
parent	ca7a44c93ae2dab020a78d76c4f8f9b616a2dfc5 (diff)