diff options
author | rander <rander.wang@intel.com> | 2017-06-22 17:41:17 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2017-07-04 09:57:57 +0800 |
commit | 7e1e128818c96736a5953e9ff9a566e680bec98a (patch) | |
tree | 9f679eecd1078ece365b0d0ad4c44c39f5ba11d4 | |
parent | 6fde3e082897975bacde1134c4a8096420fa5ab9 (diff) |
backend: refine pow function
Now save 40% time than before
(1) group many branches which deal with corner case to one branch.
(2) using HW exp2 and log2 to replace some instructions
pass conformance tests and utest
Signed-off-by: rander.wang <rander.wang@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
-rw-r--r-- | backend/src/libocl/tmpl/ocl_math_common.tmpl.cl | 294 |
1 files changed, 148 insertions, 146 deletions
diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl index b4764ee3..c0ab2514 100644 --- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl @@ -2382,7 +2382,8 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) { float z,ax,z_h,z_l,p_h,p_l; float y1,t1,t2,r,s,sn,t,u,v,w; int i,j,k,yisint,n; - int hx,hy,ix,iy,is; + int hy,ix,iy,is; + unsigned int hx; float bp,dp_h,dp_l, zero = 0.0, one = 1.0, @@ -2412,17 +2413,17 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) { float retVal = 0.0f; bool bRet = false; - GEN_OCL_GET_FLOAT_WORD(hx,x); + hx = as_uint(x); GEN_OCL_GET_FLOAT_WORD(hy,y); ax = __gen_ocl_fabs(x); ix = as_int(ax); iy = as_int(fabs(y)); - if(iy < 0x00800000 || hx==0x3f800000) + if(iy < 0x00800000) { bRet = true; retVal = one; } - else if (ix > 0x7f800000 || iy > 0x7f800000) + else if (iy > 0x7f800000) { bRet = true; retVal = NAN; @@ -2433,120 +2434,152 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) { * yisint = 1 ... y is an odd int * yisint = 2 ... y is an even int */ - yisint = 0; - if(hx<0) { - k = (iy>>23)-0x7f; /* exponent */ - j = iy>>(23-k); - yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint; - yisint = (iy>=0x4b800000) ? 2:yisint; - } - - /* special value of x */ - if(ix==0x7f800000||ix==0||ix==0x3f800000){ - z = ax; /*x is +-0,+-inf,+-1*/ + sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */ - z = (hy < 0)? one/z:z; - z = ((hx<0) && (((ix-0x3f800000)|yisint)==0))? NAN:z; - z = ((hx<0) && (yisint==1))? -z:z; + if(hx >= 0x7f800000) + { + yisint = 0; + n = (hx>>31)-1; - retVal = (bRet)? retVal:z; - bRet = true; - } + if (!retVal && ix > 0x7f800000) + { + bRet = true; + retVal = NAN; + } - n = ((uint)hx>>31)-1; + if(hx >= 0x80000000) { + k = (iy>>23)-0x7f; /* exponent */ + j = iy>>(23-k); + yisint = (iy>=0x3f800000 && (j<<(23-k))==iy)? 2-(j&1):yisint; + yisint = (iy>=0x4b800000) ? 2:yisint; + } - /* (x<0)**(non-int) is NaN */ - if(!bRet && (n|yisint)==0) - { - bRet= true; - retVal = NAN; - } + /* special value of x */ + if(ix==0x7f800000||ix==0||ix==0x3f800000){ + z = ax; /*x is +-0,+-inf,+-1*/ + z = (hy < 0)? one/z:z; + z = (((ix-0x3f800000)|yisint)==0)? NAN:z; + z = (yisint==1)? -z:z; + retVal = (bRet)? retVal:z; + bRet = true; + } - sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */ - if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */ + /* (x<0)**(non-int) is NaN */ + if(!bRet && (n|yisint)==0) + { + bRet= true; + retVal = NAN; + } - /* |y| is huge */ - if(iy>0x4d000000) - { /* if |y| > 2**27 */ - /* over/underflow if x is not close to one */ - /* special value of y */ - float b1 = (hy>=0)? y: zero; - float b2 = (hy<0)?-y: zero; - b1 = (ix > 0x3f800000)? b1:b2; - retVal = (iy==0x7f800000 && !bRet)? b1:retVal; - bRet = (iy==0x7f800000 && !bRet)? true: bRet; + if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */ + } + /* special value of x */ + if((ix&0x7fffff) == 0) { + if(hx == 0x3f800000) + { + retVal = one; + bRet = true; + } - b1 = (hy>0)? sn*huge*huge:0; - retVal = (ix>0x3f800007 && !bRet)? b1:retVal; - bRet = (ix>0x3f800007 && !bRet)? true:bRet; + if(ix==0x7f800000||ix==0) { + z = ax; /*x is +0,+inf*/ + z = (hy < 0)? one/z:z; + retVal = (bRet)? retVal:z; + bRet = true; + } + } - /* now |1-x| is tiny <= 2**-20, suffice to compute - log(x) by x-x^2/2+x^3/3-x^4/4 */ - t = ax-1; /* t has 20 trailing zeros */ - w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f)); - u = ivln2_h*t; /* ivln2_h has 16 sig. bits */ - v = t*ivln2_l-w*ivln2; - t1 = u+v; - GEN_OCL_GET_FLOAT_WORD(is,t1); - GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000); - t2 = v-(t1-u); - } - else + /* |y| is not huge */ + if(iy <= 0x4d000000) { - float s2,s_h,s_l,t_h,t_l; - n = 0; - n += ((ix)>>23)-0x7f; - j = ix&0x007fffff; - /* determine interval */ - ix = j|0x3f800000; /* normalize ix */ - - n = (j >= 0x5db3d7)?n+1:n; - ix = (j >= 0x5db3d7)? ix - 0x00800000:ix; - k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1; + float s2,s_h,s_l,t_h,t_l; + n = ((ix)>>23)-0x7f; + j = ix&0x007fffff; + /* determine interval */ + ix = j|0x3f800000; /* normalize ix */ - GEN_OCL_SET_FLOAT_WORD(ax,ix); + if(x > 0x1.0p-6 && x < 0x1.2p7 && fabs(y) < 0x1.0p4) + { + GEN_OCL_SET_FLOAT_WORD(ax,ix); + t1 = n; + t2 = native_log2(ax); + } + else + { + n = (j >= 0x5db3d7)?n+1:n; + ix = (j >= 0x5db3d7)? ix - 0x00800000:ix; + k = (j<=0x1cc471 || j >= 0x5db3d7)? 0:1; + + GEN_OCL_SET_FLOAT_WORD(ax,ix); + + bp = k? 1.5:bp; + dp_h = k? 5.84960938e-01:dp_h; + dp_l = k? 1.56322085e-06:dp_l; + + /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ + u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */ + v = one/(ax+bp); + s = u*v; + s_h = s; + GEN_OCL_GET_FLOAT_WORD(is,s_h); + GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000); + /* t_h=ax+bp[k] High */ + is = ((ix>>1)&0xfffff000)|0x20000000; + GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21)); + t_l = ax - (t_h-bp); + s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u))); + + /* compute log(ax) */ + s2 = s*s; + r = s2*s2*(mad(s2, L2, L1)); + r = mad(s_l, (s_h+s), r); + t_h = mad(s_h, s_h, 3.0f)+r; + GEN_OCL_GET_FLOAT_WORD(is,t_h); + GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000); + t_l = r-(mad(-s_h, s_h, (t_h-3.0f))); + /* u+v = s*(1+...) */ + u = s_h*t_h; + v = mad(s_l, t_h, t_l*s); + /* 2/(3log2)*(s+...) */ + p_h = mad(s_h, t_h, v); + GEN_OCL_GET_FLOAT_WORD(is,p_h); + GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000); + p_l = v-(mad(-s_h, t_h, p_h)); + z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l)); + /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ + t = (float)n; + t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t); + GEN_OCL_GET_FLOAT_WORD(is,t1); + GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000); + t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h)); + } + } + else + { /* if |y| > 2**27 */ + /* over/underflow if x is not close to one */ + /* special value of y */ + float b1 = (hy>=0)? y: zero; + float b2 = (hy<0)?-y: zero; + b1 = (ix > 0x3f800000)? b1:b2; + retVal = (iy==0x7f800000 && !bRet)? b1:retVal; + bRet = (iy==0x7f800000 && !bRet)? true: bRet; - bp = k? 1.5:bp; - dp_h = k? 5.84960938e-01:dp_h; - dp_l = k? 1.56322085e-06:dp_l; - /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */ - u = ax-bp; /* bp[0]=1.0, bp[1]=1.5 */ - v = one/(ax+bp); - s = u*v; - s_h = s; - GEN_OCL_GET_FLOAT_WORD(is,s_h); - GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000); - /* t_h=ax+bp[k] High */ - is = ((ix>>1)&0xfffff000)|0x20000000; - GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21)); - t_l = ax - (t_h-bp); - s_l = v*(mad(-s_h, t_l, mad(-s_h, t_h, u))); + b1 = (hy>0)? sn*huge*huge:0; + retVal = (ix>0x3f800007 && !bRet)? b1:retVal; + bRet = (ix>0x3f800007 && !bRet)? true:bRet; - /* compute log(ax) */ - s2 = s*s; - r = s2*s2*(mad(s2, L2, L1)); - r = mad(s_l, (s_h+s), r); - t_h = mad(s_h, s_h, 3.0f)+r; - GEN_OCL_GET_FLOAT_WORD(is,t_h); - GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000); - t_l = r-(mad(-s_h, s_h, (t_h-3.0f))); - /* u+v = s*(1+...) */ - u = s_h*t_h; - v = mad(s_l, t_h, t_l*s); - /* 2/(3log2)*(s+...) */ - p_h = mad(s_h, t_h, v); - GEN_OCL_GET_FLOAT_WORD(is,p_h); - GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000); - p_l = v-(mad(-s_h, t_h, p_h)); - z_l = mad(cp_l, p_h, mad(p_l, cp, dp_l)); - /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */ - t = (float)n; - t1 = ((mad(cp_h, p_h, z_l)+dp_h)+t); - GEN_OCL_GET_FLOAT_WORD(is,t1); - GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000); - t2 = z_l-mad(-cp_h, p_h, ((t1-t)-dp_h)); + /* now |1-x| is tiny <= 2**-20, suffice to compute + log(x) by x-x^2/2+x^3/3-x^4/4 */ + t = ax-1; /* t has 20 trailing zeros */ + w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f)); + u = ivln2_h*t; /* ivln2_h has 16 sig. bits */ + v = t*ivln2_l-w*ivln2; + t1 = u+v; + GEN_OCL_GET_FLOAT_WORD(is,t1); + GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000); + t2 = v-(t1-u); } /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */ @@ -2554,52 +2587,21 @@ OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) { GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000); p_l = mad((y-y1), t1, y*t2); p_h = y1*t1; - z = mad(y1, t1, p_l); - GEN_OCL_GET_FLOAT_WORD(j,z); - if((j&0x7fffffff) >= 0x43000000 && !bRet) + z = mad(y1, t1, p_l); + if(fabs(z) >= 128.0f) { - retVal = ((j&0x7fffffff)>0x43160000)? 0.0:retVal; - retVal = (j > 0x43000000) ? sn*huge*huge:retVal; - retVal = (j == 0x43000000 && p_l+ovt>z-p_h)? sn*huge*huge:retVal; - retVal = (j == 0xc3160000 && p_l<=z-p_h)? 0.0:retVal; - bRet = (((j&0x7fffffff)>0x43160000) || (j == 0xc3160000 && p_l<=z-p_h) || retVal == sn*huge*huge)? true:false; - } - - /* - * compute 2**(p_h+p_l) - */ - i = j&0x7fffffff; - k = (i>>23)-0x7f; - n = 0; - if(i>0x3f000000) { /* if |z| > 0.5, set n = [z+0.5] */ - n = j+(0x00800000>>(k+1)); - k = ((n&0x7fffffff)>>23)-0x7f; /* new k for n */ - GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k)); - n = ((n&0x007fffff)|0x00800000)>>(23-k); - n = (j<0)?-n:n; - p_h = mad(y1, t1, -t); + if(!bRet) + { + retVal = (fabs(z) > 150.0f)? 0.0:retVal; + retVal = (z > 128.0f) ? sn*INFINITY:retVal; + retVal = (z == 128.0f && p_l+ovt>z-p_h)? sn*INFINITY:retVal; + retVal = (z == -150.0f && p_l<=z-p_h)? 0.0:retVal; + bRet = ((fabs(z) > 150.0f) || (z == -150.0f && p_l<=z-p_h) || retVal == sn*INFINITY)? true:false; + } } - t = p_l+p_h; - GEN_OCL_GET_FLOAT_WORD(is,t); - GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000); - - v = mad((p_l-(t-p_h)), lg2, t*lg2_l); - z = mad(t, lg2_h, v); - w = v-mad(-t, lg2_h, z); - t = z*z; - t1 = mad(-t, (mad(t, P2, P1)), z); - r = (z*t1)/(t1-two)-(mad(z, w, w)); - z = one-(r-z); - - GEN_OCL_GET_FLOAT_WORD(j,z); - j += (n<<23); - - if((j>>23)<=0) - z = 0;//__gen_ocl_scalbnf(z,n); /* subnormal output */ - else - GEN_OCL_SET_FLOAT_WORD(z,j); + z = exp2(p_l)*exp2(p_h); return bRet ? retVal:sn*z; } |