backend: refine the code structure of math

mov all the common math function to match_common.cl. it is easy to maitain Signed-off-by: rander.wang <rander.wang@intel.com> Tested-by: Yang Rong <rong.r.yang@intel.com>
author: rander.wang <rander.wang@intel.com> 2017-05-15 14:11:18 +0800
committer: Yang Rong <rong.r.yang@intel.com> 2017-05-17 18:09:25 +0800
commit: acd53cffa43da9f92b7e656f1314edc0b4044c0b (patch)
tree: ed63012fbb24a36e521d98ada4a2ce4ddc721d34
parent: 98eeab036fc30a036d92bd0e3b63d093aae002c8 (diff)
9 files changed, 4073 insertions, 7538 deletions
diff --git a/backend/src/libocl/script/ocl_math.def b/backend/src/libocl/script/ocl_math.def
index ea4ae8ae..a4e99c35 100644
--- a/backend/src/libocl/script/ocl_math.def
+++ b/backend/src/libocl/script/ocl_math.def
@@ -1,41 +1,4 @@
 ##math
-gentype acos (gentype)
-gentype acosh (gentype)
-gentype acospi (gentype x)
-gentype asin (gentype)
-gentype asinh (gentype)
-gentype asinpi (gentype x)
-gentype atan (gentype y_over_x)
-gentype atan2 (gentype y, gentype x)
-gentype atanh (gentype)
-gentype atanpi (gentype x)
-gentype atan2pi (gentype y, gentype x)
-gentype cbrt (gentype)
-gentype ceil (gentype)
-gentype copysign (gentype x, gentype y)
-gentype cos (gentype)
-gentype cosh (gentype)
-gentype cospi (gentype x)
-gentype erfc (gentype)
-gentype erf (gentype)
-gentype exp (gentype x)
-gentype exp2 (gentype)
-gentype exp10 (gentype)
-gentype expm1 (gentype x)
-gentype fabs (gentype)
-gentype fdim (gentype x, gentype y)
-gentype floor (gentype)
-# XXX we use madd for fma
-gentype fma (gentype a, gentype b, gentype c)
-gentype fmax (gentype x, gentype y)
-gentypef fmax (gentypef x, float y)
-gentypeh fmax (gentypeh x, half y)
-gentyped fmax (gentyped x, double y)
-gentype fmin (gentype x, gentype y)
-gentypef fmin (gentypef x, float y)
-gentypeh fmin (gentypeh x, half y)
-gentyped fmin (gentyped x, double y)
-gentype fmod (gentype x, gentype y)
 gentype fract (gentype x, __global gentype *iptr)
 gentype fract (gentype x, __local gentype *iptr)
 gentype fract (gentype x, __private gentype *iptr)
@@ -57,20 +20,6 @@ doublen frexp (doublen x, __private intn *exp)
 double frexp (double x, __global int *exp)
 double frexp (double x, __local int *exp)
 double frexp (double x, __private int *exp)
-gentype hypot (gentype x, gentype y)
-intn ilogb (floatn x)
-int ilogb (float x)
-shortn ilogb (halfn x)
-short ilogb (half x)
-int ilogb (double x)
-floatn ldexp (floatn x, intn k)
-floatn ldexp (floatn x, int k)
-float ldexp (float x, int k)
-halfn ldexp (halfn x, intn k)
-halfn ldexp (halfn x, int k)
-half ldexp (half x, int k)
-double ldexp (double x, int k)
-gentype lgamma (gentype x)
 floatn lgamma_r (floatn x, __global intn *signp)
 floatn lgamma_r (floatn x, __local intn *signp)
 floatn lgamma_r (floatn x, __private intn *signp)
@@ -89,31 +38,9 @@ doublen lgamma_r (doublen x, __private intn *signp)
 double lgamma_r (double x, __global int *signp)
 double lgamma_r (double x, __local int *signp)
 double lgamma_r (double x, __private int *signp)
-gentype log (gentype)
-gentype log2 (gentype)
-gentype log10 (gentype)
-gentype log1p (gentype x)
-gentype logb (gentype x)
-gentype mad (gentype a, gentype b, gentype c)
-gentype maxmag (gentype x, gentype y)
-gentype minmag (gentype x, gentype y)
 gentype modf (gentype x, __global gentype *iptr)
 gentype modf (gentype x, __local gentype *iptr)
 gentype modf (gentype x, __private gentype *iptr)
-floatn nan (uintn nancode)
-float nan (uint nancode)
-halfn nan (ushortn nancode)
-half nan (ushort nancode)
-double nan (ulong nancode)
-gentype nextafter (gentype x, gentype y)
-gentype pow (gentype x, gentype y)
-floatn pown (floatn x, intn y)
-float pown (float x, int y)
-halfn pown (halfn x, intn y)
-half pown (half x, int y)
-double pown (double x, int y)
-gentype powr (gentype x, gentype y)
-gentype remainder (gentype x, gentype y)
 floatn remquo (floatn x, floatn y, __global intn *quo)
 floatn remquo (floatn x, floatn y, __local intn *quo)
 floatn remquo (floatn x, floatn y, __private intn *quo)
@@ -132,55 +59,7 @@ doublen remquo (doublen x, doublen y, __private intn *quo)
 double remquo (double x, double y, __global int *quo)
 double remquo (double x, double y, __local int *quo)
 double remquo (double x, double y, __private int *quo)
-gentype rint (gentype)
-floatn rootn (floatn x, intn y)
-halfn rootn (halfn x, intn y)
-gentype round (gentype x)
-gentype rsqrt (gentype)
-gentype sin (gentype)
 gentype sincos (gentype x, __global gentype *cosval)
 gentype sincos (gentype x, __local gentype *cosval)
 gentype sincos (gentype x, __private gentype *cosval)
-gentype sinh (gentype)
-gentype sinpi (gentype x)
-gentype sqrt (gentype)
-gentype tan (gentype)
-gentype tanh (gentype)
-gentype tanpi (gentype x)
-gentype tgamma (gentype)
-gentype trunc (gentype)
 
-
-# XXX we already defined all native and non-native
-# functions to the same one.
-gentype native_cos (gentype x)
-gentype native_divide (gentype x, gentype y)
-gentype native_exp (gentype x)
-gentype native_exp2 (gentype x)
-gentype native_exp10 (gentype x)
-gentype native_log (gentype x)
-gentype native_log2 (gentype x)
-gentype native_log10 (gentype x)
-gentype native_powr (gentype x, gentype y)
-gentype native_recip (gentype x)
-gentype native_rsqrt (gentype x)
-gentype native_sin (gentype x)
-gentype native_sqrt (gentype x)
-gentype native_tan (gentype x)
-
-
-##half_native_math
-gentype half_cos (gentype x)
-gentype half_divide (gentype x, gentype y)
-gentype half_exp (gentype x)
-gentype half_exp2 (gentype x)
-gentype half_exp10 (gentype x)
-gentype half_log (gentype x)
-gentype half_log2 (gentype x)
-gentype half_log10 (gentype x)
-gentype half_powr (gentype x, gentype y)
-gentype half_recip (gentype x)
-gentype half_rsqrt (gentype x)
-gentype half_sin (gentype x)
-gentype half_sqrt (gentype x)
-gentype half_tan (gentype x)
diff --git a/backend/src/libocl/script/ocl_math_20.def b/backend/src/libocl/script/ocl_math_20.def
index b0e9d890..71558d16 100644
--- a/backend/src/libocl/script/ocl_math_20.def
+++ b/backend/src/libocl/script/ocl_math_20.def
@@ -1,39 +1,4 @@
 ##math
-gentype acos (gentype)
-gentype acosh (gentype)
-gentype acospi (gentype x)
-gentype asin (gentype)
-gentype asinh (gentype)
-gentype asinpi (gentype x)
-gentype atan (gentype y_over_x)
-gentype atan2 (gentype y, gentype x)
-gentype atanh (gentype)
-gentype atanpi (gentype x)
-gentype atan2pi (gentype y, gentype x)
-gentype cbrt (gentype)
-gentype ceil (gentype)
-gentype copysign (gentype x, gentype y)
-gentype cos (gentype)
-gentype cosh (gentype)
-gentype cospi (gentype x)
-gentype erfc (gentype)
-gentype erf (gentype)
-gentype exp (gentype x)
-gentype exp2 (gentype)
-gentype exp10 (gentype)
-gentype expm1 (gentype x)
-gentype fabs (gentype)
-gentype fdim (gentype x, gentype y)
-gentype floor (gentype)
-# XXX we use madd for fma
-gentype fma (gentype a, gentype b, gentype c)
-gentype fmax (gentype x, gentype y)
-gentypef fmax (gentypef x, float y)
-gentypeh fmax (gentypeh x, half y)
-gentype fmin (gentype x, gentype y)
-gentypef fmin (gentypef x, float y)
-gentypeh fmin (gentypeh x, half y)
-gentype fmod (gentype x, gentype y)
 gentype fract (gentype x, __generic gentype *iptr)
 floatn frexp (floatn x, __generic intn *exp)
 float frexp (float x, __generic int *exp)
@@ -41,98 +6,18 @@ halfn frexp (halfn x, __generic intn *exp)
 half frexp (half x, __generic int *exp)
 doublen frexp (doublen x, __generic intn *exp)
 double frexp (double x, __generic int *exp)
-gentype hypot (gentype x, gentype y)
-intn ilogb (floatn x)
-int ilogb (float x)
-shortn ilogb (halfn x)
-short ilogb (half x)
-floatn ldexp (floatn x, intn k)
-floatn ldexp (floatn x, int k)
-float ldexp (float x, int k)
-halfn ldexp (halfn x, intn k)
-halfn ldexp (halfn x, int k)
-half ldexp (half x, int k)
-gentype lgamma (gentype x)
 floatn lgamma_r (floatn x, __generic intn *signp)
 float lgamma_r (float x, __generic int *signp)
 halfn lgamma_r (halfn x, __generic intn *signp)
 half lgamma_r (half x, __generic int *signp)
 doublen lgamma_r (doublen x, __generic intn *signp)
 double lgamma_r (double x, __generic int *signp)
-gentype log (gentype)
-gentype log2 (gentype)
-gentype log10 (gentype)
-gentype log1p (gentype x)
-gentype logb (gentype x)
-gentype mad (gentype a, gentype b, gentype c)
-gentype maxmag (gentype x, gentype y)
-gentype minmag (gentype x, gentype y)
 gentype modf (gentype x, __generic gentype *iptr)
-floatn nan (uintn nancode)
-float nan (uint nancode)
-halfn nan (ushortn nancode)
-half nan (ushort nancode)
-gentype nextafter (gentype x, gentype y)
-gentype pow (gentype x, gentype y)
-floatn pown (floatn x, intn y)
-float pown (float x, int y)
-halfn pown (halfn x, intn y)
-half pown (half x, int y)
-gentype powr (gentype x, gentype y)
-gentype remainder (gentype x, gentype y)
 floatn remquo (floatn x, floatn y, __generic intn *quo)
 float remquo (float x, float y, __generic int *quo)
 halfn remquo (halfn x, halfn y, __generic intn *quo)
 half remquo (half x, half y, __generic int *quo)
 doublen remquo (doublen x, doublen y, __generic intn *quo)
 double remquo (double x, double y, __generic int *quo)
-gentype rint (gentype)
-floatn rootn (floatn x, intn y)
-halfn rootn (halfn x, intn y)
-gentype round (gentype x)
-gentype rsqrt (gentype)
-gentype sin (gentype)
 gentype sincos (gentype x, __generic gentype *cosval)
-gentype sinh (gentype)
-gentype sinpi (gentype x)
-gentype sqrt (gentype)
-gentype tan (gentype)
-gentype tanh (gentype)
-gentype tanpi (gentype x)
-gentype tgamma (gentype)
-gentype trunc (gentype)
 
-
-# XXX we already defined all native and non-native
-# functions to the same one.
-gentype native_cos (gentype x)
-gentype native_divide (gentype x, gentype y)
-gentype native_exp (gentype x)
-gentype native_exp2 (gentype x)
-gentype native_exp10 (gentype x)
-gentype native_log (gentype x)
-gentype native_log2 (gentype x)
-gentype native_log10 (gentype x)
-gentype native_powr (gentype x, gentype y)
-gentype native_recip (gentype x)
-gentype native_rsqrt (gentype x)
-gentype native_sin (gentype x)
-gentype native_sqrt (gentype x)
-gentype native_tan (gentype x)
-
-
-##half_native_math
-gentype half_cos (gentype x)
-gentype half_divide (gentype x, gentype y)
-gentype half_exp (gentype x)
-gentype half_exp2 (gentype x)
-gentype half_exp10 (gentype x)
-gentype half_log (gentype x)
-gentype half_log2 (gentype x)
-gentype half_log10 (gentype x)
-gentype half_powr (gentype x, gentype y)
-gentype half_recip (gentype x)
-gentype half_rsqrt (gentype x)
-gentype half_sin (gentype x)
-gentype half_sqrt (gentype x)
-gentype half_tan (gentype x)
diff --git a/backend/src/libocl/script/ocl_math_common.def b/backend/src/libocl/script/ocl_math_common.def
index b28f8af4..77829235 100644
--- a/backend/src/libocl/script/ocl_math_common.def
+++ b/backend/src/libocl/script/ocl_math_common.def
@@ -1,4 +1,127 @@
 ##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentypeh fmax (gentypeh x, half y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentypeh fmin (gentypeh x, half y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+shortn ilogb (halfn x)
+short ilogb (half x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+halfn ldexp (halfn x, intn k)
+halfn ldexp (halfn x, int k)
+half ldexp (half x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+halfn nan (ushortn nancode)
+half nan (ushort nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+halfn pown (halfn x, intn y)
+half pown (half x, int y)
+double pown (double x, int y)
+gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+halfn rootn (halfn x, intn y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+
+# XXX we already defined all native and non-native
+# functions to the same one.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+
+
+##half_native_math
+gentype half_cos (gentype x)
+gentype half_divide (gentype x, gentype y)
+gentype half_exp (gentype x)
+gentype half_exp2 (gentype x)
+gentype half_exp10 (gentype x)
+gentype half_log (gentype x)
+gentype half_log2 (gentype x)
+gentype half_log10 (gentype x)
+gentype half_powr (gentype x, gentype y)
+gentype half_recip (gentype x)
+gentype half_rsqrt (gentype x)
+gentype half_sin (gentype x)
+gentype half_sqrt (gentype x)
+gentype half_tan (gentype x)
+
+
 double acos (double)
 double acosh (double)
 double acospi (double x)
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
index 98e295a1..594b7125 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -21,7 +21,6 @@
 #include "ocl_common.h"
 #include "ocl_integer.h"
 #include "ocl_convert.h"
-#include "ocl_printf.h"
 
 extern constant int __ocl_math_fastpath_flag;
 
@@ -39,96 +38,6 @@ CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
 CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
 CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
 
-
-/* native functions */
-OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-OVERLOADABLE float native_log(float x) {
-  return native_log2(x) * 0.6931472002f;
-}
-OVERLOADABLE float native_log10(float x) {
-  return native_log2(x) * 0.3010299956f;
-}
-OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-OVERLOADABLE float native_tan(float x) {
-  return native_sin(x) / native_cos(x);
-}
-OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
-OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
-OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
-OVERLOADABLE float native_divide(float x, float y) { return x/y; }
-
-/* Fast path */
-OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
-    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
-    return native_log(x + native_sqrt(x * x + 1));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
-    return 0.5f * native_log((1 + x) / (1 - x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
-    return __gen_ocl_pow(x, 0.3333333333f);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
-    return native_cos(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
-    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
-    return __gen_ocl_cos(x * M_PI_F);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
-    return native_exp(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
-    return native_exp10(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
-    return __gen_ocl_pow(M_E_F, x) - 1;
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
-    return x-y*__gen_ocl_rndz(x/y);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
-    return __gen_ocl_sqrt(x*x + y*y);
-}
-OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
-    return __gen_ocl_rndd(native_log2(x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
-    return __gen_ocl_pow(2, n) * x;
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
-    return native_log(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
-    return native_log2(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
-    return native_log10(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
-    return native_log(x + 1);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
-    return __gen_ocl_rndd(native_log2(x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
-    return x-y*__gen_ocl_rnde(x/y);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
-    return __gen_ocl_pow(x, 1.f / n);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
-    return native_sin(x);
-}
 OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval) {
     *cosval = native_cos(x);
     return native_sin(x);
@@ -141,320 +50,8 @@ OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float
     *cosval = native_cos(x);
     return native_sin(x);
 }
-OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
-    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
-    return __gen_ocl_sin(x * M_PI_F);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
-    return native_tan(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
-    float y = native_exp(-2 * x);
-    return (1 - y) / (1 + y);
-}
 
-
-/* Internal implement, high accuracy. */
-OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
-  union { unsigned u; float f; } ux, uy;
-  ux.f = x;
-  uy.f = y;
-  ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
-  return ux.f;
-}
-
-OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  union { unsigned int i; float f; } u;
-  const float
-  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =  3.355443200e+07, /* 0x4c000000 */
-  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lg3 = 2.8571429849e-01, /* 3E924925 */
-  Lg4 = 2.2222198546e-01; /* 3E638E29 */
-
-  const float zero   =  0.0;
-  float fsq, f, s, z, R, w, t1, t2, partial;
-  int k, ix, i, j;
-
-  u.f = x;  ix = u.i;
-  k = 0;
-
-  k += (ix>>23) - 127;
-  ix &= 0x007fffff;
-  i = (ix + (0x95f64<<3)) & 0x800000;
-  u.i = ix | (i^0x3f800000); x = u.f;
-  k += (i>>23);
-  f = x - 1.0f;
-  fsq = f * f;
-
-  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
-      R = fsq * (0.5f - 0.33333333333333333f * f);
-      return k * ln2_hi + k * ln2_lo + f - R;
-  }
-
-  s = f / (2.0f + f);
-  z = s * s;
-  i = ix - (0x6147a << 3);
-  w = z * z;
-  j = (0x6b851 << 3) - ix;
-  t1= w * mad(w, Lg4, Lg2);
-  t2= z * mad(w, Lg3, Lg1);
-  i |= j;
-  R = t2 + t1;
-  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
-
-  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
-}
-
-OVERLOADABLE float __gen_ocl_internal_log(float x)
-{
-  union { unsigned int i; float f; } u;
-  u.f = x;
-  int ix = u.i;
-
-  if (ix < 0 )
-	return NAN;  /* log(-#) = NaN */
-  if (ix >= 0x7f800000)
-    return NAN;
-
-  return __gen_ocl_internal_log_valid(x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_log10(float x)
-{
-  union { float f; unsigned i; } u;
-  const float
-  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
-  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
-  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
-
-  float y, z;
-  int i, k, hx;
-
-  u.f = x; hx = u.i;
-
-  if (hx<0)
-    return NAN; /* log(-#) = NaN */
-  if (hx >= 0x7f800000)
-    return NAN;
-
-  k = (hx >> 23) - 127;
-  i  = ((unsigned)k & 0x80000000) >> 31;
-  hx = (hx&0x007fffff) | ((0x7f-i) << 23);
-  y  = (float)(k + i);
-  u.i = hx; x = u.f;
-
-  return  y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
-}
-
-
-OVERLOADABLE float __gen_ocl_internal_log2(float x)
-{
-  const float zero   =  0.0,
-  invln2 = 0x1.715476p+0f;
-  int ix;
-
-  union { float f; int i; } u;
-  u.f = x; ix = u.i;
-
-  if (ix < 0)
-	return NAN;    /** log(-#) = NaN */
-  if (ix >= 0x7f800000)
-	return NAN;
-
-  return invln2 * __gen_ocl_internal_log_valid(x);
-}
-
-
-float __gen_ocl_scalbnf (float x, int n){
-  /* copy from fdlibm */
-  float two25 = 3.355443200e+07,	/* 0x4c000000 */
-  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
-  huge = 1.0e+30,
-  tiny = 1.0e-30;
-  int k,ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  k = (ix&0x7f800000)>>23; /* extract exponent */
-  if (k==0) {	/* 0 or subnormal x */
-    if ((ix&0x7fffffff)==0) return x; /* +-0 */
-    x *= two25;
-    GEN_OCL_GET_FLOAT_WORD(ix,x);
-    k = ((ix&0x7f800000)>>23) - 25;
-  }
-  if (k==0xff) return x+x;	/* NaN or Inf */
-  if (n< -50000)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  if (n> 50000 || k+n > 0xfe)
-    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
-  /* Now k and n are bounded we know that k = k+n does not overflow. */
-  k = k+n;
-  if (k > 0) { /* normal result */
-    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-    return x;
-  }
-  if (k <= -25)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  k += 25;				/* subnormal result */
-  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-  return x*twom25;
-}
-
-const __constant unsigned int two_over_pi[] = {
-0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
-0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
-0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
-};
-
-// The main idea is from "Radian Reduction for Trigonometric Functions"
-// written by Mary H. Payne and Robert N. Hanek. Also another reference
-// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
-// written by Roger Alan Smith, who gave the worst case in this paper.
-// for single float, worst x = 0x1.47d0fep34, and there are 29 bit
-// leading zeros in the fraction part of x*(2.0/pi). so we need at least
-// 29 (leading zero)+ 24 (fraction )+12 (integer) + guard bits. that is,
-// 65 + guard bits, as we calculate in 12*7 = 84bits, which means we have
-// about 19 guard bits. If we need further precision, we may need more
-// guard bits
-// Note we place two 0 in two_over_pi, which is used to handle input less
-// than 0x1.0p23
-
-int payne_hanek(float x, float *y) {
-  union { float f; unsigned u;} ieee;
-  ieee.f = x;
-  unsigned u = ieee.u;
-  int k = ((u & 0x7f800000) >> 23)-127;
-  int ma = (u & 0x7fffff) | 0x800000;
-  unsigned  high, low;
-  high = (ma & 0xfff000) >> 12;
-  low = ma & 0xfff;
-
-  // Two tune below macro, you need to fully understand the algorithm
-#define CALC_BLOCKS 7
-#define ZERO_BITS 2
-
-  unsigned result[CALC_BLOCKS];
-
-  // round down, note we need 2 bits integer precision
-  int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
-
-  for (int i = 0; i < CALC_BLOCKS; i++) {
-    result[i] =  low * two_over_pi[index+i+ZERO_BITS] ;
-    result[i] +=  high * two_over_pi[index+i+1+ZERO_BITS];
-  }
-
-  for (int i = CALC_BLOCKS-1; i > 0; i--) {
-    int temp = result[i] >> 12;
-    result[i]  -= temp << 12;
-    result[i-1] += temp;
-  }
-#undef CALC_BLOCKS
-#undef ZERO_BITS
-
-  // get number of integer digits in result[0], note we only consider 12 valid bits
-  // and also it means the fraction digits in result[0] is (12-intDigit)
-
-  int intDigit = index*(-12) + (k-23);
-
-  // As the integer bits may be all included in result[0], and also maybe
-  // some bits in result[0], and some in result[1]. So we merge succesive bits,
-  // which makes easy coding.
-
-  unsigned b0 = (result[0] << 12) | result[1];
-  unsigned b1 = (result[2] << 12) | result[3];
-  unsigned b2 = (result[4] << 12) | result[5];
-  unsigned b3 = (result[6] << 12);
-
-  unsigned intPart = b0 >> (24-intDigit);
-
-  unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
-  unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
-  unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
-
-  // larger than 0.5? which mean larger than pi/4, we need
-  // transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
-  int largerPiBy4 = ((fract1 & 0x800000) != 0);
-  int sign = largerPiBy4 ? 1 : 0;
-  intPart = largerPiBy4 ? (intPart+1) : intPart;
-
-  fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
-  fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
-  fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
-
-  int leadingZero = (fract1 == 0);
-
-  // +1 is for the hidden bit 1 in floating-point format
-  int exponent = leadingZero ? -(24+1) : -(0+1);
-
-  fract1 = leadingZero ? fract2 : fract1;
-  fract2 = leadingZero ? fract3 : fract2;
-
-  // fract1 may have leading zeros, add it
-  int shift = clz(fract1)-8;
-  exponent += -shift;
-
-  float pio2 = 0x1.921fb6p+0;
-  unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
-
-  // we know that denormal number will not appear here
-  ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
-  *y = ieee.f * pio2;
-  return intPart;
-}
-
-int argumentReduceSmall(float x, float * remainder) {
-  union {
-    float f;
-    unsigned u;
-  } ieee;
-
-  float twoByPi = 2.0f/3.14159265f;
-  float piBy2_1h = (float) 0xc90/0x1.0p11,
-        piBy2_1l = (float) 0xfda/0x1.0p23,
-        piBy2_2h = (float) 0xa22/0x1.0p35,
-        piBy2_2l = (float) 0x168/0x1.0p47,
-        piBy2_3h = (float) 0xc23/0x1.0p59,
-        piBy2_3l = (float) 0x4c4/0x1.0p71;
-
-  float y = (float)(int)(twoByPi * x + 0.5f);
-  ieee.f = y;
-  ieee.u = ieee.u & 0xfffff000;
-
-  float yh = ieee.f;
-  float yl = y - yh;
-  float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
-  rem = rem - yh*piBy2_2h - yh*piBy2_2l + yl*piBy2_2h + yl*piBy2_2l;
-  rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
-
-  *remainder = rem;
-  return (int)y;
-}
-
-
-int __ieee754_rem_pio2f(float x, float *y) {
-  if (x < 4000.0f) {
-    return argumentReduceSmall(x, y);
-  } else {
-    return payne_hanek(x, y);
-  }
-}
-
-OVERLOADABLE float __kernel_sinf(float x)
+OVERLOADABLE float __kernel_sinf_12(float x)
 {
   /* copied from fdlibm */
   const float
@@ -470,7 +67,7 @@ OVERLOADABLE float __kernel_sinf(float x)
   return mad(v, r, x);
 }
 
-float __kernel_cosf(float x, float y)
+float __kernel_cosf_12(float x, float y)
 {
   /* copied from fdlibm */
   const float
@@ -495,402 +92,153 @@ float __kernel_cosf(float x, float y)
   }
 }
 
-OVERLOADABLE float sin(float x)
-{
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sin(x);
-
-  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
-  float y,z=0.0;
-  int n, ix;
-
-  float negative = x < 0.0f? -1.0f : 1.0f;
-  x = fabs(x);
-
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;
-
-    /* sin(Inf or NaN) is NaN */
-  if (ix >= 0x7f800000) return x-x;
-
-  if(x <= pio4)
-	  return negative * __kernel_sinf(x);
-  /* argument reduction needed */
-  else {
-      n = __ieee754_rem_pio2f(x,&y);
-      float s = __kernel_sinf(y);
-      float c = __kernel_cosf(y,0.0f);
-      float ret = (n&1) ? negative*c : negative*s;
-      return (n&3)> 1? -1.0f*ret : ret;
-  }
-}
-
-OVERLOADABLE float cos(float x)
-{
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cos(x);
-
-  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
-  float y,z=0.0;
-  int n, ix;
-  x = __gen_ocl_fabs(x);
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-
-  ix &= 0x7fffffff;
-
-    /* cos(Inf or NaN) is NaN */
-  if (ix >= 0x7f800000) return x-x;
-
-  if(x <= pio4)
-	  return __kernel_cosf(x, 0.f);
-  /* argument reduction needed */
-  else {
-      n = __ieee754_rem_pio2f(x,&y);
-      n &= 3;
-      float c = __kernel_cosf(y, 0.0f);
-      float s = __kernel_sinf(y);
-      float v = (n&1) ? s : c;
-      /* n&3   return
-          0    cos(y)
-          1   -sin(y)
-          2   -cos(y)
-          3    sin(y)
-      */
-      int mask = (n>>1) ^ n;
-      float sign = (mask&1) ? -1.0f : 1.0f;
-      return sign * v;
-  }
-}
-
-float __kernel_tanf(float x, float y, int iy)
-{
-  /* copied from fdlibm */
-        float z,r,v,w,s;
-        int ix,hx;
-        const float
-        one   =  1.0000000000e+00, /* 0x3f800000 */
-        pio4  =  7.8539812565e-01, /* 0x3f490fda */
-        pio4lo=  3.7748947079e-08; /* 0x33222168 */
-        float T[13];// =  {
-         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
-         T[1] = 1.3333334029e-01; /* 0x3e088889 */
-         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
-         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
-         T[4] = 8.8632395491e-03; /* 0x3c11371f */
-         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
-         T[6] = 1.4562094584e-03; /* 0x3abede48 */
-         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
-
-        GEN_OCL_GET_FLOAT_WORD(hx,x);
-        ix = hx&0x7fffffff;     /* high word of |x| */
-        if(ix<0x31800000)                       /* x < 2**-28 */
-            {if((int)x==0) {                    /* generate inexact */
-                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
-                else return (iy==1)? x: -one/x;
-            }
-            }
-        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
-            if(hx<0) {x = -x; y = -y;}
-            z = pio4-x;
-            w = pio4lo-y;
-            x = z+w; y = 0.0;
-        }
-        z       =  x*x;
-        w       =  z*z;
-		/* Break x^5*(T[1]+x^2*T[2]+...) into
-		 *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
-		 *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
-		 */
-
-        r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
-        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
-
-        s = z*x;
-        r = mad(z, mad(s, r + v, y), y);
-        r += T[0]*s;
-        w = x+r;
-        if(ix>=0x3f2ca140) {
-            v = (float)iy;
-            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
-        }
-        if(iy==1) return w;
-        else
-        	return -1.0/(x+r);
-}
-
-OVERLOADABLE float tan(float x)
-{
-    if (__ocl_math_fastpath_flag)
-      return __gen_ocl_internal_fastpath_tan(x);
-
-    float y,z=0.0;
-    int n, ix;
-    float negative = x < 0.0f? -1.0f : 1.0f;
-    x = negative * x;
-
-    GEN_OCL_GET_FLOAT_WORD(ix,x);
-
-    ix &= 0x7fffffff;
-
-    /* tan(Inf or NaN) is NaN */
-    if (ix>=0x7f800000) return x-x;            /* NaN */
-
-    /* argument reduction needed */
-    else {
-      n = __ieee754_rem_pio2f(x,&y);
-      return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /*   1 -- n even
-                                                              -1 -- n odd */
-    }
-}
-
-OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
-  int ix;
-  if(isinf(x) || isnan(x)) { return NAN; }
-  if(x < 0.0f) { x = -x; }
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(x> 0x1.0p24) return 1.0f;
-  float m = __gen_ocl_internal_floor(x);
-  ix = (int)m;
-  m = x-m;
-  if((ix&0x1) != 0) m+=1.0f;
-    ix = __gen_ocl_internal_floor(m*4.0f);
-
-  switch(ix) {
-   case 0:
-    return __kernel_cosf(m*M_PI_F, 0.0f);
-   case 1:
-   case 2:
-    return __kernel_sinf((0.5f-m)*M_PI_F);
-   case 3:
-   case 4:
-    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
-   case 5:
-   case 6:
-    return __kernel_sinf((m-1.5f)*M_PI_F);
-   default:
-    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
-   }
-}
-
-OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+OVERLOADABLE float __gen_ocl_internal_floor_12(float x) { return __gen_ocl_rndd(x); }
+OVERLOADABLE float __gen_ocl_internal_sinpi_12(float x) {
   float sign = 1.0f;
   int ix;
   if(isinf(x)) return NAN;
   if(x < 0.0f) { x = -x; sign = -1.0f; }
   GEN_OCL_GET_FLOAT_WORD(ix, x);
   if(x> 0x1.0p24) return 0.0f;
-  float m = __gen_ocl_internal_floor(x);
+  float m = __gen_ocl_internal_floor_12(x);
   ix = (int)m;
   m = x-m;
   if((ix&0x1) != 0) m+=1.0f;
-    ix = __gen_ocl_internal_floor(m*4.0f);
+    ix = __gen_ocl_internal_floor_12(m*4.0f);
 
   switch(ix) {
    case 0:
-    return sign*__kernel_sinf(m*M_PI_F);
+    return sign*__kernel_sinf_12(m*M_PI_F);
    case 1:
    case 2:
-    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+    return sign*__kernel_cosf_12((m-0.5f)*M_PI_F, 0.0f);
    case 3:
    case 4:
-    return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
+    return -sign*__kernel_sinf_12((m-1.0f)*M_PI_F);
    case 5:
    case 6:
-    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+    return -sign*__kernel_cosf_12((m-1.5f)*M_PI_F, 0.0f);
    default:
-    return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
+    return -sign*__kernel_sinf_12((2.0f-m)*M_PI_F);
    }
 
 }
 
-OVERLOADABLE float lgamma(float x) {
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    const float
-        zero=  0.,
-        one =  1.0000000000e+00,
-        pi  =  3.1415927410e+00,
-        a0  =  7.7215664089e-02,
-        a1  =  3.2246702909e-01,
-        a2  =  6.7352302372e-02,
-        a3  =  2.0580807701e-02,
-        a4  =  7.3855509982e-03,
-        a5  =  2.8905137442e-03,
-        a6  =  1.1927076848e-03,
-        a7  =  5.1006977446e-04,
-        a8  =  2.2086278477e-04,
-        a9  =  1.0801156895e-04,
-        a10 =  2.5214456400e-05,
-        a11 =  4.4864096708e-05,
-        tc  =  1.4616321325e+00,
-        tf  = -1.2148628384e-01,
-        tt  =  6.6971006518e-09,
-        t0  =  4.8383611441e-01,
-        t1  = -1.4758771658e-01,
-        t2  =  6.4624942839e-02,
-        t3  = -3.2788541168e-02,
-        t4  =  1.7970675603e-02,
-        t5  = -1.0314224288e-02,
-        t6  =  6.1005386524e-03,
-        t7  = -3.6845202558e-03,
-        t8  =  2.2596477065e-03,
-        t9  = -1.4034647029e-03,
-        t10 =  8.8108185446e-04,
-        t11 = -5.3859531181e-04,
-        t12 =  3.1563205994e-04,
-        t13 = -3.1275415677e-04,
-        t14 =  3.3552918467e-04,
-        u0  = -7.7215664089e-02,
-        u1  =  6.3282704353e-01,
-        u2  =  1.4549225569e+00,
-        u3  =  9.7771751881e-01,
-        u4  =  2.2896373272e-01,
-        u5  =  1.3381091878e-02,
-        v1  =  2.4559779167e+00,
-        v2  =  2.1284897327e+00,
-        v3  =  7.6928514242e-01,
-        v4  =  1.0422264785e-01,
-        v5  =  3.2170924824e-03,
-        s0  = -7.7215664089e-02,
-        s1  =  2.1498242021e-01,
-        s2  =  3.2577878237e-01,
-        s3  =  1.4635047317e-01,
-        s4  =  2.6642270386e-02,
-        s5  =  1.8402845599e-03,
-        s6  =  3.1947532989e-05,
-        r1  =  1.3920053244e+00,
-        r2  =  7.2193557024e-01,
-        r3  =  1.7193385959e-01,
-        r4  =  1.8645919859e-02,
-        r5  =  7.7794247773e-04,
-        r6  =  7.3266842264e-06,
-        w0  =  4.1893854737e-01,
-        w1  =  8.3333335817e-02,
-        w2  = -2.7777778450e-03,
-        w3  =  7.9365057172e-04,
-        w4  = -5.9518753551e-04,
-        w5  =  8.3633989561e-04,
-        w6  = -1.6309292987e-03;
-	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
-	int i, hx, ix;
-	nadj = 0;
-	hx = *(int *)&x;
-	ix = hx & 0x7fffffff;
-	if (ix >= 0x7f800000)
-		return x * x;
-	if (ix == 0)
-		return ((x + one) / zero);
-	if (ix < 0x1c800000) {
-		if (hx < 0) {
-			return -native_log(-x);
-		} else
-			return -native_log(x);
-	}
-	if (hx < 0) {
-		if (ix >= 0x4b000000)
-			return ((-x) / zero);
-		t = __gen_ocl_internal_sinpi(x);
-		if (t == zero)
-			return ((-x) / zero);
-		nadj = native_log(pi / __gen_ocl_fabs(t * x));
-		x = -x;
-	}
-	if (ix == 0x3f800000 || ix == 0x40000000)
-		r = 0;
-	else if (ix < 0x40000000) {
-		if (ix <= 0x3f666666) {
-			r = -native_log(x);
-			if (ix >= 0x3f3b4a20) {
-				y = one - x;
-				i = 0;
-			} else if (ix >= 0x3e6d3308) {
-				y = x - (tc - one);
-				i = 1;
-			} else {
-				y = x;
-				i = 2;
-			}
-		} else {
-			r = zero;
-			if (ix >= 0x3fdda618) {
-				y = (float) 2.0 - x;
-				i = 0;
-			}
-			else if (ix >= 0x3F9da620) {
-				y = x - tc;
-				i = 1;
-			}
-			else {
-				y = x - one;
-				i = 2;
-			}
-		}
-		switch (i) {
-		case 0:
-			z = y * y;
-			p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
-			p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
-			p = mad(y, p1, p2);
-			r += (p - (float) 0.5 * y);
-			break;
-		case 1:
-			z = y * y;
-			w = z * y;
-			p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
-			p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
-			p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
-			p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
-			r += (tf + p);
-			break;
-		case 2:
-			p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
-			p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
-			r += (-(float) 0.5 * y + p1 / p2);
-		}
-	} else if (ix < 0x41000000) {
-		i = (int) x;
-		t = zero;
-		y = x - (float) i;
+#define BODY \
+  if (isnan(x)) { \
+    *p = x; \
+    return x; \
+  } \
+  *p = __gen_ocl_rndd(x); \
+  if (isinf(x)) { \
+    return x > 0 ? +0. : -0.; \
+  } \
+  return min(x - *p, 0x1.FFFFFep-1F);
+OVERLOADABLE float fract(float x, global float *p) { BODY; }
+OVERLOADABLE float fract(float x, local float *p) { BODY; }
+OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+OVERLOADABLE half fract(half x, global half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
+OVERLOADABLE half fract(half x, local half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
+OVERLOADABLE half fract(half x, private half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
+}
 
-		p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
-		q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
-		r = .5f * y + p / q;
-		z = one;
+#define BODY \
+  if (isnan(x) || isinf(x)) { \
+    *exp = 0; \
+    return x; \
+  } \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    *exp = 0; \
+    return x; \
+  } \
+  if (a >= 0x800000) { \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
 
-		switch (i) {
-		case 7:
-			z *= (y + 6.0f);
-		case 6:
-			z *= (y + 5.0f);
-		case 5:
-			z *= (y + 4.0f);
-		case 4:
-			z *= (y + 3.0f);
-		case 3:
-			z *= (y + 2.0f);
-			r += native_log(z);
-			break;
-		}
+OVERLOADABLE half frexp(half x, global int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
+OVERLOADABLE half frexp(half x, local int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
+OVERLOADABLE half frexp(half x, private int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
+}
 
-	} else if (ix < 0x5c800000) {
-		t = native_log(x);
-		z = one / x;
-		y = z * z;
-		w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
-		r = (x - .5f) * (t - one) + w;
-	} else
-		r = x * (native_log(x) - one);
-	if (hx < 0)
-		r = nadj - r;
-	return r;
+#define BODY \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+  if (ix > 0x7F800000) { \
+    *i = nan(0u); \
+    return nan(0u); \
+  } \
+  if (ix == 0x7F800000) { \
+    *i = x; \
+    return as_float(hx & 0x80000000u); \
+  } \
+  *i = __gen_ocl_rndz(x); \
+  return x - *i;
+OVERLOADABLE float modf(float x, global float *i) { BODY; }
+OVERLOADABLE float modf(float x, local float *i) { BODY; }
+OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
+
+OVERLOADABLE half modf(half x, global half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+}
+OVERLOADABLE half modf(half x, local half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
+}
+OVERLOADABLE half modf(half x, private half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
 }
 
 /*
@@ -989,7 +337,7 @@ OVERLOADABLE float lgamma(float x) {
 	if (hx < 0) {  \
 		if (ix >= 0x4b000000)  \
 			return ((-x) / zero);  \
-		t = __gen_ocl_internal_sinpi(x);  \
+		t = __gen_ocl_internal_sinpi_12(x);  \
 		if (t == zero)  \
 			return ((-x) / zero);  \
 		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
@@ -1088,2089 +436,20 @@ OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
 OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
 #undef BODY
 
-OVERLOADABLE float log1p(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log1p(x);
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =    3.355443200e+07, /* 0x4c000000 */
-  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lp3 = 2.8571429849e-01, /* 3E924925 */
-  Lp4 = 2.2222198546e-01; /* 3E638E29 */
-  const float zero = 0.0;
-  float hfsq,f,c,s,z,R,u;
-  int k,hx,hu,ax;
-  union {float f; unsigned i;} un;
-  un.f = x;  hx = un.i;
-  ax = hx&0x7fffffff;
-
-  k = 1;
-  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
-      if(ax>=0x3f800000) {    /* x <= -1.0 */
-    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
-    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
-      }
-      if(ax<0x31000000) {     /* |x| < 2**-29 */
-    if(two25+x>zero     /* raise inexact */
-              &&ax<0x24800000)    /* |x| < 2**-54 */
-        return x;
-    else
-        return x - x*x*(float)0.5;
-      }
-      if(hx>0||hx<=((int)0xbe95f61f)) {
-    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
-  }
-  if (hx >= 0x7f800000) return x+x;
-  if(k!=0) {
-      if(hx<0x5a000000) {
-    u  = (float)1.0+x;
-
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    /* correction term */
-          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
-    c /= u;
-      } else {
-    u  = x;
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    c  = 0;
-      }
-      hu &= 0x007fffff;
-      if(hu<0x3504f7) {
-          un.i = hu|0x3f800000; u = un.f;/* normalize u */
-      } else {
-          k += 1;
-          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
-          hu = (0x00800000-hu)>>2;
-      }
-      f = u-(float)1.0;
-  }
-  hfsq=(float)0.5*f*f;
-  if(hu==0)
-  { /* |f| < 2**-20 */
-      if(f==zero)
-      {
-    	  if(k==0) return zero;
-    	  else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
-      }
-      R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
-      if(k==0) return f-R; else
-    	  return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
-  }
-  s = f/((float)2.0+f);
-  z = s*s;
-  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
-  if(k==0)
-	  return f + mad(hfsq + R, s, -hfsq);
-  else
-	  return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
-}
-
-OVERLOADABLE float logb(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_logb(x);
-
-  union {float f; unsigned i;} u;
-  u.f = x;
-  int e =  ((u.i & 0x7f800000) >> 23);
-  float r1 = e-127;
-  float r2 = -INFINITY;
-  float r3 = x*x;
-    /* sub normal or +/-0 */
-  float r = e == 0 ? r2 : r1;
-    /* inf & nan */
-  return e == 0xff ? r3 : r;
-}
-
-OVERLOADABLE int ilogb(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_ilogb(x);
-
-  union { int i; float f; } u;
-  if (isnan(x))
-    return FP_ILOGBNAN;
-  if (isinf(x))
-    return 0x7FFFFFFF;
-  u.f = x;
-  u.i &= 0x7fffffff;
-  if (u.i == 0)
-    return FP_ILOGB0;
-  if (u.i >= 0x800000)
-    return (u.i >> 23) - 127;
-  int r = -126;
-  int a = u.i & 0x7FFFFF;
-  while(a < 0x800000) {
-    a <<= 1;
-    r --;
-  }
-  return r;
-}
-OVERLOADABLE float nan(uint code) {
-  return NAN;
-}
-OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  float sign = 1.0f;
-  int ix;
-  if(isinf(x)) return NAN;
-  if(x < 0.0f) { x = -x; sign = -1.0f; }
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(x> 0x1.0p24) return 0.0f;
-  float m = __gen_ocl_internal_floor(x);
-  ix = (int)m;
-  m = x-m;
-  int n = __gen_ocl_internal_floor(m*4.0f);
-  if(m == 0.5f) {
-    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
-  }
-  if(m == 0.0f) {
-    return (ix&0x1) == 0 ? 0.0f : -0.0f;
-  }
-
-  switch(n) {
-    case 0:
-      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
-    case 1:
-      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
-    case 2:
-      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
-    default:
-      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
-  }
-}
-OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
-  /* copied from fdlibm */
-  const unsigned
-  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
-  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
-
-  const float
-  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
-  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
-  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
-  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
-
-  float r,s,t, w;
-  int hx;
-  uint sign;
-  uint high;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  sign=hx&0x80000000;     /* sign= sign(x) */
-  hx  ^=sign;
-  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
-  if(hx==0)
-      return(x);    /* cbrt(0) is itself */
-
-  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
-    /* rough cbrt to 5 bits */
-  if(hx<0x00800000)     /* subnormal number */
-    {
-    //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
-     //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
-      t = (sign = 0) ? 0.0f : -0.0f;
-      return t;
-    }
-  else
-    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
-
-
-    /* new cbrt to 23 bits */
-  r=t*t/x;
-  s=mad(r, t, C);
-  t*=G+F/(s+E+D/s);
-    /* one step newton iteration to 53 bits with error less than 0.667 ulps */
-  s=t*t;    /* t*t is exact */
-  r=x/s;
-  w=t+t;
-  r=(r-t)/(w+r);  /* r-s is exact */
-  t=mad(t, r, t);
-
-    /* retore the sign bit */
-  GEN_OCL_GET_FLOAT_WORD(high,t);
-  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
-  return(t);
-}
-
-#define BODY \
-  *cosval = cos(x); \
-  return sin(x);
-
-OVERLOADABLE float sincos(float x, global float *cosval) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sincos(x, cosval);
-  BODY;
-}
-OVERLOADABLE float sincos(float x, local float *cosval) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sincos(x, cosval);
-  BODY;
-}
-OVERLOADABLE float sincos(float x, private float *cosval) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sincos(x, cosval);
-  BODY;
-}
-#undef BODY
-
-INLINE float __gen_ocl_asin_util(float x) {
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  float
-  pS0 =  1.66666666666666657415e-01,
-  pS1 = -3.25565818622400915405e-01,
-  pS2 =  2.01212532134862925881e-01,
-  pS3 = -4.00555345006794114027e-02,
-  pS4 =  7.91534994289814532176e-04,
-  qS1 = -2.40339491173441421878e+00,
-  qS2 =  2.02094576023350569471e+00,
-  qS3 = -6.88283971605453293030e-01,
-  qS4 =  7.70381505559019352791e-02;
-
-  float t = x*x;
-  float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
-  float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
-  float w = p / q;
-  return mad(x, w, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-  uint ix;
-  union { uint i; float f; } u;
-  u.f = x;
-  ix = u.i & 0x7fffffff;
-  if(ix == 0x3f800000) {
-    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
-  }
-  if(ix > 0x3f800000) {            /* |x|>= 1 */
-    return  NAN;          /* asin(|x|>1) is NaN */
-  }
-
-  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
-    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
-  }
-
-  if(x < -0.5) {
-    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
-  } else if(x > 0.5) {
-    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
-  } else {
-    return __gen_ocl_asin_util(x);
-  }
-}
-OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
-  return __gen_ocl_internal_asin(x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_acos(float x) {
-  if(x > 0.5)
-    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
-  else
-    return M_PI_2_F - __gen_ocl_internal_asin(x);
-}
-OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
-  return __gen_ocl_internal_acos(x) / M_PI_F;
-}
-__constant float atanhi[4] = {
-  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
-  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
-  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
-  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
-};
-__constant float atanlo[4] = {
-  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
-  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
-  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
-  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
-};
-
-OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  /* copied from fdlibm */
-  float aT[11];
-  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
-  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
-  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
-  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
-  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
-  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
-  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
-  const float one = 1.0, huge = 1.0e30;
-
-  float w,s1,s2,z;
-  int ix,hx,id;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
-      if(ix>0x7f800000)
-    return x+x;   /* NaN */
-      if(hx>0) return  atanhi[3]+atanlo[3];
-      else     return -atanhi[3]-atanlo[3];
-  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
-      if (ix < 0x31000000) {  /* |x| < 2^-29 */
-    if(huge+x>one) return x;  /* raise inexact */
-      }
-      id = -1;
-  } else {
-  x = __gen_ocl_fabs(x);
-  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
-      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
-    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
-      } else {      /* 11/16<=|x|< 19/16 */
-    id = 1; x  = (x-one)/(x+one);
-      }
-  } else {
-      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
-    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
-      } else {      /* 2.4375 <= |x| < 2^66 */
-    id = 3; x  = -(float)1.0/x;
-      }
-  }}
-    /* end of argument reduction */
-  z = x*x;
-  w = z*z;
-    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
-  s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
-  s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
-  if (id<0) return x - x*(s1+s2);
-  else {
-      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
-      return (hx<0)? -z:z;
-  }
-
-}
-OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
-  return __gen_ocl_internal_atan(x) / M_PI_F;
-}
-
-// XXX work-around PTX profile
-OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
-OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
-  /* copied from fdlibm */
-  float z;
-  int k,m,hx,hy,ix,iy;
-  const float
-  tiny  = 1.0e-30,
-  zero  = 0.0,
-  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
-  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
-  pi      = 3.1415927410e+00, /* 0x40490fdb */
-  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  iy = hy&0x7fffffff;
-
-  if((ix>0x7f800000)||
-     (iy>0x7f800000)) /* x or y is NaN */
-     return x+y;
-  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
-  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
-
-    /* when y = 0 */
-  if(iy==0) {
-      switch(m) {
-    case 0:
-    case 1: return y;   /* atan(+-0,+anything)=+-0 */
-    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-      }
-  }
-    /* when x = 0 */
-  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
-
-  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
-  if(ix <= 0x7fffff && iy <= 0x7fffff) {
-    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
-    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
-  }
-
-    /* when x is INF */
-  if(ix==0x7f800000) {
-      if(iy==0x7f800000) {
-    switch(m) {
-        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-    }
-      } else {
-    switch(m) {
-        case 0: return  zero  ; /* atan(+...,+INF) */
-        case 1: return -zero  ; /* atan(-...,+INF) */
-        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
-        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
-    }
-      }
-  }
-    /* when y is INF */
-  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* compute y/x */
-  k = (iy-ix)>>23;
-  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
-  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
-  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
-  switch (m) {
-      case 0: return       z  ; /* atan(+,+) */
-      case 1: {
-              uint zh;
-          GEN_OCL_GET_FLOAT_WORD(zh,z);
-          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
-        }
-        return       z  ; /* atan(-,+) */
-      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-      default: /* case 3 */
-            return  (z-pi_lo)-pi;/* atan(-,-) */
-  }
-}
-
-OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
-  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-OVERLOADABLE float __gen_ocl_internal_round(float x) {
-  float y = __gen_ocl_rndz(x);
-  if (__gen_ocl_fabs(x - y) >= 0.5f)
-    y += __gen_ocl_internal_copysign(1.f, x);
-  return y;
-}
-OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-OVERLOADABLE float __gen_ocl_internal_rint(float x) {
-  return __gen_ocl_rnde(x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
-  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
-  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
-  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one = 1.0,
-  huge = 1.0e+30,
-  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
-  P2 = -2.7777778450e-03; /* 0xbb360b61 */
-  float y,hi=0.0,lo=0.0,c,t;
-  int k=0,xsb;
-  unsigned hx;
-  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
-  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
-  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
-  float half_0 = 0.5;
-  float half_1 =	-0.5;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = (hx>>31)&1;		/* sign bit of x */
-  hx &= 0x7fffffff;		/* high word of |x| */
-
-  /* filter out non-finite argument */
-  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    if(hx>0x7f800000)
-      return x+x;			/* NaN */
-    if(hx==0x7f800000)
-      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
-    if(x > o_threshold) return huge*huge; /* overflow */
-    if(x < u_threshold) return twom100*twom100; /* underflow */
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
-    } else {
-      float tmp = xsb == 1 ? half_1 : half_0;
-      k  = ivln2*x+tmp;
-      t  = k;
-      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
-      lo = t*ln2LO_0;
-    }
-    x  = hi - lo;
-  }
-  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
-    if(huge+x>one) return one+x;/* trigger inexact */
-  }
-  else k = 0;
-
-  /* x is now in primary range */
-  t  = x*x;
-  c  = x - t*(P1+t*P2);
-  if(k==0)
-    return one-((x*c)/(c-(float)2.0)-x);
-  else
-    y = one-((lo-(x*c)/((float)2.0-c))-hi);
-  if(k >= -125) {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
-    return y;
-  } else {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
-    return y*twom100;
-  }
-}
-
-/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
-/*...*/
-const float
-tiny = 1.0e-30,
-half_val=  5.0000000000e-01, /* 0x3F000000 */
-one =  1.0000000000e+00, /* 0x3F800000 */
-two =  2.0000000000e+00, /* 0x40000000 */
-	/* c = (subfloat)0.84506291151 */
-erx =  8.4506291151e-01, /* 0x3f58560b */
-/*
- * Coefficients for approximation to  erf on [0,0.84375]
- */
-efx =  1.2837916613e-01, /* 0x3e0375d4 */
-efx8=  1.0270333290e+00, /* 0x3f8375d4 */
-pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
-pp1  = -3.2504209876e-01, /* 0xbea66beb */
-pp2  = -2.8481749818e-02, /* 0xbce9528f */
-pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
-pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
-qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
-qq2  =  6.5022252500e-02, /* 0x3d852a63 */
-qq3  =  5.0813062117e-03, /* 0x3ba68116 */
-qq4  =  1.3249473704e-04, /* 0x390aee49 */
-qq5  = -3.9602282413e-06, /* 0xb684e21a */
-/*
- * Coefficients for approximation to  erf  in [0.84375,1.25]
- */
-pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
-pa1  =  4.1485610604e-01, /* 0x3ed46805 */
-pa2  = -3.7220788002e-01, /* 0xbebe9208 */
-pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
-pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
-pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
-pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
-qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
-qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
-qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
-qa4  =  1.2617121637e-01, /* 0x3e013307 */
-qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
-qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
- /*
- * Coefficients for approximation to  erfc in [1.25,1/0.35]
- */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
-ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
-ra2  = -1.0558626175e+01, /* 0xc128f022 */
-ra3  = -6.2375331879e+01, /* 0xc2798057 */
-ra4  = -1.6239666748e+02, /* 0xc322658c */
-ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
-ra6  = -8.1287437439e+01, /* 0xc2a2932b */
-ra7  = -9.8143291473e+00, /* 0xc11d077e */
-sa1  =  1.9651271820e+01, /* 0x419d35ce */
-sa2  =  1.3765776062e+02, /* 0x4309a863 */
-sa3  =  4.3456588745e+02, /* 0x43d9486f */
-sa4  =  6.4538726807e+02, /* 0x442158c9 */
-sa5  =  4.2900814819e+02, /* 0x43d6810b */
-sa6  =  1.0863500214e+02, /* 0x42d9451f */
-sa7  =  6.5702495575e+00, /* 0x40d23f7c */
-sa8  = -6.0424413532e-02, /* 0xbd777f97 */
-/*
- * Coefficients for approximation to  erfc in [1/.35,28]
- */
-rb0  = -9.8649431020e-03, /* 0xbc21a092 */
-rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
-rb2  = -1.7757955551e+01, /* 0xc18e104b */
-rb3  = -1.6063638306e+02, /* 0xc320a2ea */
-rb4  = -6.3756646729e+02, /* 0xc41f6441 */
-rb5  = -1.0250950928e+03, /* 0xc480230b */
-rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
-sb1  =  3.0338060379e+01, /* 0x41f2b459 */
-sb2  =  3.2579251099e+02, /* 0x43a2e571 */
-sb3  =  1.5367296143e+03, /* 0x44c01759 */
-sb4  =  3.1998581543e+03, /* 0x4547fdbb */
-sb5  =  2.5530502930e+03, /* 0x451f90ce */
-sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
-sb7  = -2.2440952301e+01; /* 0xc1b38712 */
-
-	int hx,ix,i;
-	float R,S,P,Q,s,y,z,r;
-	GEN_OCL_GET_FLOAT_WORD(hx,x);
-	ix = hx&0x7fffffff;
-	if(ix>=0x7f800000) {		/* erf(nan)=nan */
-	    i = ((unsigned int)hx>>31)<<1;
-	    return (float)(1-i)+one/x;	/* erf(+-inf)=+-1 */
-	}
-
-	if(ix < 0x3f580000) {		/* |x|<0.84375 */
-	    if(ix < 0x31800000) { 	/* |x|<2**-28 */
-	        if (ix < 0x04000000)
-		    /*avoid underflow */
-		    return (float)0.125*((float)8.0*x+efx8*x);
-		return x + efx*x;
-	    }
-	    z = x*x;
-	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
-	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
-	    y = r / s;
-	    return mad(x, y, x);
-	}
-	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
-	    s = __gen_ocl_internal_fabs(x)-one;
-	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
-	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
-	    if(hx>=0) return erx + P/Q; else return -erx - P/Q;
-	}
-	if (ix >= 0x40c00000) {		/* inf>|x|>=6 */
-	    if(hx>=0) return one-tiny; else return tiny-one;
-	}
-	x = __gen_ocl_internal_fabs(x);
-    s = one/(x*x);
-	if(ix< 0x4036DB6E) {	/* |x| < 1/0.35 */
-	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
-	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
-	} else {	/* |x| >= 1/0.35 */
-	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
-	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
-	}
-	GEN_OCL_GET_FLOAT_WORD(ix,x);
-	GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
-	r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
-	if(hx>=0) return one-r/x; else return  r/x-one;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
-/*...*/
-const float
-tiny = 1.0e-30,
-half_val=  5.0000000000e-01, /* 0x3F000000 */
-one =  1.0000000000e+00, /* 0x3F800000 */
-two =  2.0000000000e+00, /* 0x40000000 */
-	/* c = (subfloat)0.84506291151 */
-erx =  8.4506291151e-01, /* 0x3f58560b */
-/*
- * Coefficients for approximation to  erf on [0,0.84375]
- */
-efx =  1.2837916613e-01, /* 0x3e0375d4 */
-efx8=  1.0270333290e+00, /* 0x3f8375d4 */
-pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
-pp1  = -3.2504209876e-01, /* 0xbea66beb */
-pp2  = -2.8481749818e-02, /* 0xbce9528f */
-pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
-pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
-qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
-qq2  =  6.5022252500e-02, /* 0x3d852a63 */
-qq3  =  5.0813062117e-03, /* 0x3ba68116 */
-qq4  =  1.3249473704e-04, /* 0x390aee49 */
-qq5  = -3.9602282413e-06, /* 0xb684e21a */
-/*
- * Coefficients for approximation to  erf  in [0.84375,1.25]
- */
-pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
-pa1  =  4.1485610604e-01, /* 0x3ed46805 */
-pa2  = -3.7220788002e-01, /* 0xbebe9208 */
-pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
-pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
-pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
-pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
-qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
-qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
-qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
-qa4  =  1.2617121637e-01, /* 0x3e013307 */
-qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
-qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
- /*
- * Coefficients for approximation to  erfc in [1.25,1/0.35]
- */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
-ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
-ra2  = -1.0558626175e+01, /* 0xc128f022 */
-ra3  = -6.2375331879e+01, /* 0xc2798057 */
-ra4  = -1.6239666748e+02, /* 0xc322658c */
-ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
-ra6  = -8.1287437439e+01, /* 0xc2a2932b */
-ra7  = -9.8143291473e+00, /* 0xc11d077e */
-sa1  =  1.9651271820e+01, /* 0x419d35ce */
-sa2  =  1.3765776062e+02, /* 0x4309a863 */
-sa3  =  4.3456588745e+02, /* 0x43d9486f */
-sa4  =  6.4538726807e+02, /* 0x442158c9 */
-sa5  =  4.2900814819e+02, /* 0x43d6810b */
-sa6  =  1.0863500214e+02, /* 0x42d9451f */
-sa7  =  6.5702495575e+00, /* 0x40d23f7c */
-sa8  = -6.0424413532e-02, /* 0xbd777f97 */
-/*
- * Coefficients for approximation to  erfc in [1/.35,28]
- */
-rb0  = -9.8649431020e-03, /* 0xbc21a092 */
-rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
-rb2  = -1.7757955551e+01, /* 0xc18e104b */
-rb3  = -1.6063638306e+02, /* 0xc320a2ea */
-rb4  = -6.3756646729e+02, /* 0xc41f6441 */
-rb5  = -1.0250950928e+03, /* 0xc480230b */
-rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
-sb1  =  3.0338060379e+01, /* 0x41f2b459 */
-sb2  =  3.2579251099e+02, /* 0x43a2e571 */
-sb3  =  1.5367296143e+03, /* 0x44c01759 */
-sb4  =  3.1998581543e+03, /* 0x4547fdbb */
-sb5  =  2.5530502930e+03, /* 0x451f90ce */
-sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
-sb7  = -2.2440952301e+01; /* 0xc1b38712 */
-	int hx,ix;
-	float R,S,P,Q,s,y,z,r;
-	GEN_OCL_GET_FLOAT_WORD(hx,x);
-	ix = hx&0x7fffffff;
-	if(ix>=0x7f800000) {			/* erfc(nan)=nan */
-						/* erfc(+-inf)=0,2 */
-	    return (float)(((unsigned int)hx>>31)<<1)+one/x;
-	}
-
-	if(ix < 0x3f580000) {		/* |x|<0.84375 */
-	    if(ix < 0x23800000)  	/* |x|<2**-56 */
-		return one-x;
-	    z = x*x;
-	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
-	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
-	    y = r/s;
-	    if(hx < 0x3e800000) {  	/* x<1/4 */
-		return one-(x+x*y);
-	    } else {
-		r = x*y;
-		r += (x-half_val);
-	        return half_val - r ;
-	    }
-	}
-	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
-	    s = __gen_ocl_internal_fabs(x)-one;
-	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
-	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
-	    if(hx>=0) {
-	        z  = one-erx; return z - P/Q;
-	    } else {
-		z = erx+P/Q; return one+z;
-	    }
-	}
-	if (ix < 0x41e00000) {		/* |x|<28 */
-	    x = __gen_ocl_internal_fabs(x);
-        s = one/(x*x);
-	    if(ix< 0x4036DB6D) {	/* |x| < 1/.35 ~ 2.857143*/
-		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
-		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
-	    } else {			/* |x| >= 1/.35 ~ 2.857143 */
-		if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
-		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
-		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
-	    }
-	    GEN_OCL_GET_FLOAT_WORD(ix,x);
-	    GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
-	    r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*
-			__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
-	    if(hx>0) {
-		float ret = r/x;
-		return ret;
-	    } else
-		return two-r/x;
-	} else {
-	    if(hx>0) {
-		return tiny*tiny;
-	    } else
-		return two-tiny;
-	}
-}
-
-OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
-  //return x-y*__gen_ocl_rndz(x/y);
-  float one = 1.0;
-  float Zero[2];
-  int n,hx,hy,hz,ix,iy,sx,i;
-  Zero[0] = 0.0;
-  Zero[1] = -0.0;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  sx = hx&0x80000000;		/* sign of x */
-  hx ^=sx;		/* |x| */
-  hy &= 0x7fffffff;	/* |y| */
-  /* purge off exception values */
-  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
-  (hy>0x7f800000))			/* or y is NaN */
-    return (x*y)/(x*y);
-  if(hx<hy) return x;			/* |x|<|y| return x */
-  if(hx==hy)
-    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
-
-  /* determine ix = ilogb(x) */
-  if(hx<0x00800000) {	/* subnormal x */
-    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
-  } else ix = (hx>>23)-127;
-
-  /* determine iy = ilogb(y) */
-  if(hy<0x00800000) {	/* subnormal y */
-    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
-  } else iy = (hy>>23)-127;
-
-  /* set up {hx,lx}, {hy,ly} and align y to x */
-  if(ix >= -126)
-    hx = 0x00800000|(0x007fffff&hx);
-  else {		/* subnormal x, shift x to normal */
-    n = -126-ix;
-    hx = hx<<n;
-  }
-  if(iy >= -126)
-    hy = 0x00800000|(0x007fffff&hy);
-  else {		/* subnormal y, shift y to normal */
-    n = -126-iy;
-    hy = hy<<n;
-  }
-  /* fix point fmod */
-  n = ix - iy;
-  while(n--) {
-    hz=hx-hy;
-    if(hz<0){hx = hx+hx;}
-    else {
-      if(hz==0)		/* return sign(x)*0 */
-        return Zero[(unsigned)sx>>31];
-      hx = hz+hz;
-    }
-  }
-  hz=hx-hy;
-  if(hz>=0) {hx=hz;}
-
-    /* convert back to floating value and restore the sign */
-  if(hx==0)			/* return sign(x)*0 */
-    return Zero[(unsigned)sx>>31];
-  while(hx<0x00800000) {		/* normalize x */
-    hx = hx+hx;
-    iy -= 1;
-  }
-  if(iy>= -126) {		/* normalize output */
-    hx = ((hx-0x00800000)|((iy+127)<<23));
-	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-   } else {		/* subnormal output */
-     n = -126 - iy;
-     hx >>= n;
-     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-     x *= one;		/* create necessary signal */
-  }
-  return x;		/* exact output */
-}
-
-OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
-  //return __gen_ocl_pow(M_E_F, x) - 1;
-  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
-  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
-  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
-  huge = 1.0e30,
-  tiny = 1.0e-30,
-  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one	=  1.0,
-  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
-  float y,hi,lo,c,t,e,hxs,hfx,r1;
-  int k,xsb;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = hx&0x80000000;
-  /* sign bit of x */
-  //if(xsb==0)
-  //y=x;
-  //else
-  //y= -x; /* y = |x| */
-  y = __gen_ocl_internal_fabs(x);
-  hx &= 0x7fffffff;		/* high word of |x| */
-  /* filter out huge and non-finite argument */
-  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
-    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
-      if(hx>0x7f800000)
-        return x+x; 	 /* NaN */
-      if(hx==0x7f800000)
-        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
-      if(x > o_threshold)
-        return huge*huge; /* overflow */
-    }
-    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
-      if(x+tiny<(float)0.0)	/* raise inexact */
-        return tiny-one;	/* return -1 */
-    }
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
-      if(xsb==0){
-        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
-      }	else {
-        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
-      }
-    } else {
-      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
-      t  = k;
-      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
-      lo = t*ln2_lo;
-    }
-    x  = hi - lo;
-    c  = (hi-x)-lo;
-  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
-    //t = huge+x; /* return x with inexact flags when x!=0 */
-    //return x - (t-(huge+x));
-    return x;
-  } else k = 0;
-  /* x is now in primary range */
-  hfx = (float)0.5*x;
-  hxs = x*hfx;
-  r1 = one+hxs*(Q1+hxs*Q2);
-  t = (float)3.0-r1*hfx;
-  e = hxs*((r1-t)/((float)6.0 - x*t));
-  if(k==0)
-    return x - (x*e-hxs);		/* c is 0 */
-  else{
-    e = (x*(e-c)-c);
-    e -= hxs;
-    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
-    if(k==1){
-      if(x < (float)-0.25)
-        return -(float)2.0*(e-(x+(float)0.5));
-      else
-        return  (one+(float)2.0*(x-e));
-    }
-    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
-      int i;
-      y = one-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-      return y-one;
-    }
-    t = one;
-    if(k<23) {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
-      y = t-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    } else {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
-      y = x-(e+t);
-      y += one;
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    }
-  }
-  return y;
-}
-
-OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-  float one	= 1.0,
-  ln2	= 6.9314718246e-01;/* 0x3f317218 */
-  float t;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if(hx<0x3f800000) {	/* x < 1 */
-    return (x-x)/(x-x);
-  } else if(hx >=0x4d800000) {	/* x > 2**28 */
-    if(hx >=0x7f800000) {/* x is inf of NaN */
-      return x+x;
-    } else
-      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
-  } else if (hx==0x3f800000) {
-    return 0.0;			/* acosh(1) = 0 */
-  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
-    t=x*x;
-    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
-  } else {			/* 1<x<2 */
-    t = x-one;
-    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
-  }
-}
-
-OVERLOADABLE float __gen_ocl_internal_asinh(float x){
-  //return native_log(x + native_sqrt(x * x + 1));
-  float one =  1.0000000000e+00, /* 0x3F800000 */
-  ln2 =  6.9314718246e-01, /* 0x3f317218 */
-  huge=  1.0000000000e+30;
-  float w;
-  int hx,ix;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix< 0x38000000) {	/* |x|<2**-14 */
-    if(huge+x>one) return x;	/* return x inexact except 0 */
-  }
-  if(ix>0x47000000) {/* |x| > 2**14 */
-    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
-    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
-  } else {
-    float xa = __gen_ocl_internal_fabs(x);
-    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
-      w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
-    } else {		/* 2.0 > |x| > 2**-14 */
-      float t = xa*xa;
-      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
-    }
-  }
-  return __gen_ocl_internal_copysign(w, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_sinh(float x){
-  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-  float one = 1.0,
-  shuge = 1.0e37;
-  float t,w,h;
-  int ix,jx;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x+x;
-  h = 0.5;
-  if (jx<0) h = -h;
-  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
-  if (ix < 0x41b00000) {		/* |x|<22 */
-    if (ix<0x31800000)	/* |x|<2**-28 */
-      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
-    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
-    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
-      return h*(t+t/(t+one));
-  }
-  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
-  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
-  /* |x| in [log(maxdouble), overflowthresold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
-    t = h*w;
-    return t*w;
-  }
-  /* |x| > overflowthresold, sinh(x) overflow */
-  return x*shuge;
-}
-
-OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  //float y = native_exp(-2 * x);
-  //return (1 - y) / (1 + y);
-  float one=1.0, two=2.0, tiny = 1.0e-30;
-  float t,z;
-  int jx,ix;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) {
-    if (jx>=0)
-      return one/x+one; /* tanh(+-inf)=+-1 */
-    else
-      return one/x-one; /* tanh(NaN) = NaN */
-  }
-
-  if (ix < 0x41b00000) { /* |x|<22 */
-    if (ix == 0)
-      return x;		/* x == +-0 */
-    if (ix<0x24000000) 	/* |x|<2**-55 */
-      return x*(one+x);    	/* tanh(small) = small */
-    if (ix>=0x3f800000) {	/* |x|>=1  */
-      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
-      z = one - two/(t+two);
-    } else {
-      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
-      z= -t/(t+two);
-    }
-  } else { /* |x| > 22, return +-1 */
-    z = one - tiny;		/* raised inexact flag */
-  }
-  return (jx>=0)? z: -z;
-}
-
-OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
-  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-  float halF = 0.5,
-  huge = 1.0e+30,
-  tiny = 1.0e-30,
-  one = 1.0;
-  float t,w;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;
-  /* |x| in [0,22] */
-  if (ix < 0x41b00000) {
-    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
-    if(ix<0x3eb17218) {
-      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
-      w = one+t;
-      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
-      return one+(t*t)/(w+w);
-    }
-    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
-    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
-    return halF*t+halF/t;
-  }
-  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
-  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
-  /* |x| in [log(maxdouble), overflowthresold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
-    t = halF*w;
-    return t*w;
-  }
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x*x;
-  /* |x| > overflowthresold, cosh(x) overflow */
-  return huge*huge;
-}
-
-OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
-  //return x-y*__gen_ocl_rnde(x/y);
-  float zero = 0.0;
-  int hx,hp;
-  unsigned sx;
-  float p_half;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hp,p);
-  sx = hx&0x80000000;
-  hp &= 0x7fffffff;
-  hx &= 0x7fffffff;
-  /* purge off exception values */
-  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
-  if((hx>=0x7f800000)||               /* x not finite */
-    ((hp>0x7f800000)))	               /* p is NaN */
-    return (x*p)/(x*p);
-  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
-  if ((hx-hp)==0) return zero*x;
-  x = __gen_ocl_fabs(x);
-  p = __gen_ocl_fabs(p);
-  if (hp<0x01000000) {
-    if(x+x>p) {
-      x-=p;
-      if(x+x>=p) x -= p;
-    }
-  } else {
-    p_half = (float)0.5*p;
-    if(x>p_half) {
-      x-=p;
-      if(x>=p_half) x -= p;
-    }
-  }
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
-  return x;
-}
-
-OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  x = __gen_ocl_scalbnf(x,n);
-  return x;
-}
-
-OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  //return 0.5f * native_sqrt((1 + x) / (1 - x));
-  float xa = __gen_ocl_fabs (x);
-  float t;
-  if (isless (xa, 0.5f)){
-    if (xa < 0x1.0p-28f) return x;
-    t = xa + xa;
-    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
-  } else if (isless (xa, 1.0f)){
-    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
-  } else{
-    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
-    return x / 0.0f;
-  }
-  return __gen_ocl_internal_copysign(t, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp10(float x){
-  float px, qx,ans;
-  short n;
-  int i;
-  float*p;
-  float MAXL10 = 38.230809449325611792;
-  float LOG210 = 3.32192809488736234787e0;
-  float LG102A = 3.00781250000000000000E-1;
-  float LG102B = 2.48745663981195213739E-4;
-  float P[6];
-  P[0] = 2.063216740311022E-001;
-  P[1] = 5.420251702225484E-001;
-  P[2] = 1.171292686296281E+000;
-  P[3] = 2.034649854009453E+000;
-  P[4] = 2.650948748208892E+000;
-  P[5] = 2.302585167056758E+000;
-
-  if( x < -MAXL10 ) return 0.0;
-
-  if( isinf(x))  return INFINITY;
-  /* The following is necessary because range reduction blows up: */
-  if( x == 0 )return 1.0;
-
-  /* Express 10**x = 10**g 2**n
-    *	 = 10**g 10**( n log10(2) )
-    *	 = 10**( g + n log10(2) )
-    */
-  px = x * LOG210;
-  qx = __gen_ocl_internal_floor( px + 0.5 );
-  n = qx;
-  x -= qx * LG102A;
-  x -= qx * LG102B;
-
-  /* rational approximation for exponential
-    * of the fractional part:
-    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
-    */
-  p = P;
-  ans = *p++;
-  i = 5;
-  do{
-    ans = ans * x  +  *p++;
-  }
-  while( --i );
-  px = 1.0 + x * ans;
-
-  /* multiply by power of 2 */
-  x = __gen_ocl_internal_ldexp( px, n );
-  return x;
-}
-
-OVERLOADABLE float cospi(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cospi(x);
-
-  return __gen_ocl_internal_cospi(x);
-}
-
-OVERLOADABLE float cosh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cosh(x);
-
-  return  __gen_ocl_internal_cosh(x);
-}
-
-OVERLOADABLE float acos(float x) {
-  return __gen_ocl_internal_acos(x);
-}
-
-OVERLOADABLE float acospi(float x) {
-  return __gen_ocl_internal_acospi(x);
-}
-
-OVERLOADABLE float acosh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_acosh(x);
-
-  return __gen_ocl_internal_acosh(x);
-}
-
-OVERLOADABLE float sinpi(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sinpi(x);
-
-  return __gen_ocl_internal_sinpi(x);
-}
-
-OVERLOADABLE float sinh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sinh(x);
-
-  return __gen_ocl_internal_sinh(x);
-}
-
-OVERLOADABLE float asin(float x) {
-  return __gen_ocl_internal_asin(x);
-}
-
-OVERLOADABLE float asinpi(float x) {
-  return __gen_ocl_internal_asinpi(x);
-}
-
-OVERLOADABLE float asinh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_asinh(x);
-
-  return __gen_ocl_internal_asinh(x);
-}
-
-OVERLOADABLE float tanpi(float x) {
-  return __gen_ocl_internal_tanpi(x);
-}
-
-OVERLOADABLE float tanh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_tanh(x);
-
-  return __gen_ocl_internal_tanh(x);
-}
-
-OVERLOADABLE float atan(float x) {
-  return __gen_ocl_internal_atan(x);
-}
-
-OVERLOADABLE float atan2(float y, float x) {
-  return __gen_ocl_internal_atan2(y, x);
-}
-
-OVERLOADABLE float atan2pi(float y, float x) {
-  return __gen_ocl_internal_atan2pi(y, x);
-}
-
-OVERLOADABLE float atanpi(float x) {
-  return __gen_ocl_internal_atanpi(x);
-}
-
-OVERLOADABLE float atanh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_atanh(x);
-
-  return __gen_ocl_internal_atanh(x);
-}
-
-OVERLOADABLE float cbrt(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cbrt(x);
-
-  return __gen_ocl_internal_cbrt(x);
-}
-
-OVERLOADABLE float rint(float x) {
-  return __gen_ocl_internal_rint(x);
-}
-
-OVERLOADABLE float copysign(float x, float y) {
-  return __gen_ocl_internal_copysign(x, y);
-}
-
-OVERLOADABLE float erf(float x) {
-  return __gen_ocl_internal_erf(x);
-}
-
-OVERLOADABLE float erfc(float x) {
-  return __gen_ocl_internal_erfc(x);
-}
-
-OVERLOADABLE float fmod (float x, float y) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_fmod(x, y);
-
-  return __gen_ocl_internal_fmod(x, y);
-}
-
-OVERLOADABLE float remainder(float x, float p) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_remainder(x, p);
-
-  return __gen_ocl_internal_remainder(x, p);
-}
-
-OVERLOADABLE float ldexp(float x, int n) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_ldexp(x, n);
-
-  if (x == (float)0.0f) x = 0.0f;
-  return __gen_ocl_internal_ldexp(x, n);
-}
-
-CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
-CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
-PURE CONST float __gen_ocl_fmax(float a, float b);
-PURE CONST float __gen_ocl_fmin(float a, float b);
-
-OVERLOADABLE float mad(float a, float b, float c) {
-  return __gen_ocl_mad(a, b, c);
-}
-
-
-#define BODY \
-  if (isnan(x) || isinf(x)) { \
-    *exp = 0; \
-    return x; \
-  } \
-  uint u = as_uint(x); \
-  uint a = u & 0x7FFFFFFFu; \
-  if (a == 0) { \
-    *exp = 0; \
-    return x; \
-  } \
-  if (a >= 0x800000) { \
-    *exp = (a >> 23) - 126; \
-    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
-  } \
-  int e = -126; \
-  while (a < 0x400000) { \
-    e --; \
-    a <<= 1; \
-  } \
-  a <<= 1; \
-  *exp = e; \
-  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
-OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
-OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
-OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
-#undef BODY
-
-OVERLOADABLE float nextafter(float x, float y) {
-  int hx, hy, ix, iy;
-  hx = as_int(x);
-  hy = as_int(y);
-  ix = hx & 0x7fffffff;
-  iy = hy & 0x7fffffff;
-  if(ix == 0)
-    ix = hx & 0x7fffff;
-  if(iy == 0)
-    iy = hy & 0x7fffff;
-  if(ix>0x7f800000 || iy>0x7f800000)
-    return x+y;
-  if(hx == hy)
-    return y;
-  if(ix == 0) {
-    if(iy == 0)
-      return y;
-    else
-      return as_float((hy&0x80000000) | 1);
-  }
-  if(hx >= 0) {
-    if(hx > hy) {
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  } else {
-    if(hy >= 0 || hx > hy){
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  }
-  return as_float(hx);
-}
-
-#define BODY \
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
-  if (ix > 0x7F800000) { \
-    *i = nan(0u); \
-    return nan(0u); \
-  } \
-  if (ix == 0x7F800000) { \
-    *i = x; \
-    return as_float(hx & 0x80000000u); \
-  } \
-  *i = __gen_ocl_rndz(x); \
-  return x - *i;
-OVERLOADABLE float modf(float x, global float *i) { BODY; }
-OVERLOADABLE float modf(float x, local float *i) { BODY; }
-OVERLOADABLE float modf(float x, private float *i) { BODY; }
-#undef BODY
-
-OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
-OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a > b ? x : b > a ? y : max(x, y);
-}
-OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a < b ? x : b < a ? y : min(x, y);
-}
-OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
-  if(isnan(x))
-    return x;
-  if(isnan(y))
-    return y;
-  return x > y ? (x - y) : +0.f;
-}
-/*
- * the pow/pown high precision implementation are copied from msun library.
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
-  float z,ax,z_h,z_l,p_h,p_l;
-  float y1,t1,t2,r,s,sn,t,u,v,w;
-  int i,j,k,yisint,n;
-  int hx,hy,ix,iy,is;
-  float bp[2],dp_h[2],dp_l[2],
-  zero    =  0.0,
-  one	=  1.0,
-  two	=  2.0,
-  two24	=  16777216.0,	/* 0x4b800000 */
-  huge	=  1.0e30,
-  tiny    =  1.0e-30,
-  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
-  L1  =  6.0000002384e-01, /* 0x3f19999a */
-  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
-  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
-  P2   = -2.7777778450e-03, /* 0xbb360b61 */
-  lg2  =  6.9314718246e-01, /* 0x3f317218 */
-  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
-  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
-  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
-  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
-  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
-  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
-  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
-  bp[0] = 1.0,bp[1] = 1.5,
-  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
-  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
-  if (ix < 0x00800000) {	   /* x < 2**-126  */
-    ix = 0;/* Gen does not support subnormal number now */
-  }
-  if (iy < 0x00800000) {	  /* y < 2**-126  */
-    iy = 0;/* Gen does not support subnormal number now */
-  }
-   /* y==zero: x**0 = 1 */
-  if(iy==0) return one;
-  /* pow(+1, y) returns 1 for any y, even a NAN */
-  if(hx==0x3f800000) return one;
-  /* +-NaN return x+y */
-  if(ix > 0x7f800000 || iy > 0x7f800000)
-    return (x+0.0f)+y+(0.0f);
-  /* determine if y is an odd int when x < 0
-     * yisint = 0	... y is not an integer
-     * yisint = 1	... y is an odd int
-     * yisint = 2	... y is an even int
-     */
-  yisint  = 0;
-  if(hx<0) {
-    if(iy>=0x4b800000) yisint = 2; /* even integer y */
-    else if(iy>=0x3f800000) {
-      k = (iy>>23)-0x7f;	   /* exponent */
-      j = iy>>(23-k);
-      if((j<<(23-k))==iy) yisint = 2-(j&1);
-    }
-  }
-  /* special value of y */
-  if (iy==0x7f800000) {	/* y is +-inf */
-    if (ix==0x3f800000)
-      //return  y - y;	/* inf**+-1 is NaN */
-      return one;
-    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
-      return (hy>=0)? y: zero;
-    else			/* (|x|<1)**-,+inf = inf,0 */
-      return (hy<0)?-y: zero;
-  }
-  if(iy==0x3f800000) {	/* y is  +-1 */
-    if(hy<0) return one/x; else return x;
-  }
-  if(hy==0x40000000) return x*x; /* y is  2 */
-  if(hy==0x3f000000) {	/* y is  0.5 */
-    if(hx>=0)return __gen_ocl_sqrt(x);
-  }
-
-  ax   = __gen_ocl_fabs(x);
-    /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-    z = ax;			/*x is +-0,+-inf,+-1*/
-    if(hy<0) z = one/z;	/* z = (1/|x|) */
-    if(hx<0) {
-      if(((ix-0x3f800000)|yisint)==0) {
-        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
-      } else if(yisint==1)
-        z = -z;		/* (x<0)**odd = -(|x|**odd) */
-    }
-    return z;
-  }
-  n = ((uint)hx>>31)-1;
-
-  /* (x<0)**(non-int) is NaN */
-  if((n|yisint)==0) return (x-x)/(x-x);
-
-  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
-
-  /* |y| is huge */
-  if(iy>0x4d000000) { /* if |y| > 2**27 */
-    /* over/underflow if x is not close to one */
-    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
-    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
-    /* now |1-x| is tiny <= 2**-20, suffice to compute
-          log(x) by x-x^2/2+x^3/3-x^4/4 */
-    t = ax-1;		/* t has 20 trailing zeros */
-    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
-    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-    v = t*ivln2_l-w*ivln2;
-    t1 = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-    t2 = v-(t1-u);
-  } else {
-    float s2,s_h,s_l,t_h,t_l;
-    n = 0;
-	/* take care subnormal number */
-    //if(ix<0x00800000)
-      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
-    n  += ((ix)>>23)-0x7f;
-    j  = ix&0x007fffff;
-	/* determine interval */
-    ix = j|0x3f800000;		/* normalize ix */
-    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
-    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
-    else {k=0;n+=1;ix -= 0x00800000;}
-    GEN_OCL_SET_FLOAT_WORD(ax,ix);
-
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
-    v = one/(ax+bp[k]);
-    s = u*v;
-    s_h = s;
-    GEN_OCL_GET_FLOAT_WORD(is,s_h);
-    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-    /* t_h=ax+bp[k] High */
-    is = ((ix>>1)&0xfffff000)|0x20000000;
-    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
-    t_l = ax - (t_h-bp[k]);
-    s_l = v*((u-s_h*t_h)-s_h*t_l);
-
-    /* compute log(ax) */
-    s2 = s*s;
-    r = s2*s2*(L1+s2*L2);
-    r += s_l*(s_h+s);
-    s2  = s_h*s_h;
-    t_h = 3.0f+s2+r;
-    GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-    t_l = r-((t_h-3.0f)-s2);
-    /* u+v = s*(1+...) */
-    u = s_h*t_h;
-    v = s_l*t_h+t_l*s;
-    /* 2/(3log2)*(s+...) */
-    p_h = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-    p_l = v-(p_h-u);
-    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
-    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-    t = (float)n;
-    t1 = (((z_h+z_l)+dp_h[k])+t);
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
-  }
-
-  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-  GEN_OCL_GET_FLOAT_WORD(is,y);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
-  p_l = (y-y1)*t1+y*t2;
-  p_h = y1*t1;
-  z = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if (j>0x43000000)				/* if z > 128 */
-    return sn*huge*huge;			/* overflow */
-  else if (j==0x43000000) {			/* if z == 128 */
-    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
-  }
-  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
-    return sn*tiny*tiny;			/* underflow */
-  else if (j==0xc3160000){			/* z == -150 */
-    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
-  }
-
-  /*
-    * compute 2**(p_h+p_l)
-    */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
-    n = j+(0x00800000>>(k+1));
-    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-    n = ((n&0x007fffff)|0x00800000)>>(23-k);
-    if(j<0) n = -n;
-    p_h -= t;
-  }
-  t = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
-  u = t*lg2_h;
-  v = (p_l-(t-p_h))*lg2+t*lg2_l;
-  z = u+v;
-  w = v-(z-u);
-  t  = z*z;
-  t1  = z - t*(P1+t*P2);
-  r  = (z*t1)/(t1-two)-(w+z*w);
-  z  = one-(r-z);
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
-  else GEN_OCL_SET_FLOAT_WORD(z,j);
-  return sn*z;
-}
-
-OVERLOADABLE float tgamma (float x)
-{
-  /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper@cygnus.com> */
-
-  unsigned int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if (hx == 0xff800000)
-    {
-      /* x == -Inf.  According to ISO this is NaN.  */
-      return NAN;
-    }
-  if ((hx & 0x7f800000) == 0x7f800000)
-    {
-      /* Positive infinity (return positive infinity) or NaN (return
-	 NaN).  */
-      return x;
-    }
-  if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
-    {
-      /* integer x < 0 */
-      return NAN;
-    }
-
-  if (x >= 36.0f)
-    {
-      /* Overflow.  */
-      return INFINITY;
-    }
-  else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
-    {
-      return 1.0f / x;
-    }
-  else
-    {
-      float sinpix = __gen_ocl_internal_sinpi(x);
-      if (x <= -42.0f)
-	/* Underflow.  */
-	{return 0.0f * sinpix /*for sign*/;}
-      int exp2_adj = 0;
-      float x_abs = __gen_ocl_fabs(x);
-      float gam0;
-
-      if (x_abs < 4.0f) {
-        /* gamma = exp(lgamma) is only accurate for small lgamma */
-        float prod,x_adj;
-        if (x_abs < 0.5f) {
-          prod = 1.0f / x_abs;
-          x_adj = x_abs + 1.0f;
-        } else if (x_abs <= 1.5f) {
-          prod = 1.0f;
-          x_adj = x_abs;
-        } else if (x_abs < 2.5f) {
-          x_adj = x_abs - 1.0f;
-          prod = x_adj;
-        } else {
-          x_adj = x_abs - 2.0f;
-          prod = x_adj * (x_abs - 1.0f);
-        }
-        gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
-      }
-      else {
-        /* Compute gamma (X) using Stirling's approximation,
-  	 starting by computing pow (X, X) with a power of 2
-  	 factored out to avoid intermediate overflow.  */
-        float x_int = __gen_ocl_internal_round (x_abs);
-        float x_frac = x_abs - x_int;
-        int x_log2;
-        float x_mant = frexp (x_abs, &x_log2);
-        if (x_mant < M_SQRT1_2_F)
-          {
-          x_log2--;
-          x_mant *= 2.0f;
-          }
-        exp2_adj = x_log2 * (int) x_int;
-        float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
-  		   * exp2 (x_log2 * x_frac)
-  		   * __gen_ocl_internal_exp (-x_abs)
-  		   * sqrt (2.0f * M_PI_F / x_abs) );
-
-        float x2 = x_abs * x_abs;
-        float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
-        gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
-      }
-      if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
-      float gam1 = M_PI_F / (-x * sinpix * gam0);
-      return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
-    }
+OVERLOADABLE half lgamma_r(half x, global int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
 }
-
-float __gen_ocl_internal_pown(float x, int y) {
-  const float
-  bp[] = {1.0, 1.5,},
-  dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
-  dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
-  zero    =  0.0,
-  one =  1.0,
-  two =  2.0,
-  two24 =  16777216.0,  /* 0x4b800000 */
-  huge  =  1.0e30,
-  tiny    =  1.0e-30,
-    /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
-  L1  =  6.0000002384e-01, /* 0x3f19999a */
-  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
-  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
-  P2   = -2.7777778450e-03, /* 0xbb360b61 */
-  lg2  =  6.9314718246e-01, /* 0x3f317218 */
-  lg2_h  =  0x1.62ep-1,
-  lg2_l  =  0x1.0bfbe8p-15,
-  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
-  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
-  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
-  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
-  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
-
-  float z,ax,z_h,z_l,p_h,p_l;
-  float y1,t1,t2,r,s,t,u,v,w;
-  int i,j,k,yisint,n;
-  int hx,ix,iy,is;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
-    /* y==zero: x**0 = 1 */
-  if(y==0) return one;
-
-    /* +-NaN return NAN */
-  if(ix > 0x7f800000)
-    return NAN;
-
-    /* determine if y is an odd int
-     * yisint = 1 ... y is an odd int
-     * yisint = 2 ... y is an even int
-     */
-    yisint = y&1 ? 1 : 2;
-
-  if (y == 1) return x;
-  if (y == -1) return one/x;
-  if (y == 2) return x*x;
-
-  ax   = __gen_ocl_fabs(x);
-
-   /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-      z = ax;     /*x is +-0,+-inf,+-1*/
-      if(y<0) z = one/z; /* z = (1/|x|) */
-      if(hx<0) {
-      if(yisint==1)
-        z = -z;   /* (x<0)**odd = -(|x|**odd) */
-      }
-      return z;
-  }
-
-  float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
-      sn = -one; /* (-ve)**(odd int) */
-
-    /* |y| is huge */
-  if(iy>0x08000000) { /* if |y| > 2**27 */
-    /* over/underflow if x is not close to one */
-      if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
-      if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
-    /* now |1-x| is tiny <= 2**-20, suffice to compute
-     log(x) by x-x^2/2+x^3/3-x^4/4 */
-      t = ax-1;   /* t has 20 trailing zeros */
-      w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
-      u = ivln2_h*t;  /* ivln2_h has 16 sig. bits */
-      v = t*ivln2_l-w*ivln2;
-      t1 = u+v;
-      GEN_OCL_GET_FLOAT_WORD(is,t1);
-      GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-      t2 = v-(t1-u);
-  } else {
-    float s2,s_h,s_l,t_h,t_l;
-    n = 0;
-    /* take care subnormal number */
-//      if(ix<0x00800000)
-//    {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
-    n  += ((ix)>>23)-0x7f;
-    j  = ix&0x007fffff;
-    /* determine interval */
-    ix = j|0x3f800000;    /* normalize ix */
-    if(j<=0x1cc471) k=0;  /* |x|<sqrt(3/2) */
-    else if(j<0x5db3d7) k=1;  /* |x|<sqrt(3)   */
-    else {k=0;n+=1;ix -= 0x00800000;}
-    GEN_OCL_SET_FLOAT_WORD(ax,ix);
-
-    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-    u = ax-bp[k];   /* bp[0]=1.0, bp[1]=1.5 */
-    v = one/(ax+bp[k]);
-    s = u*v;
-    s_h = s;
-    GEN_OCL_GET_FLOAT_WORD(is,s_h);
-    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-
-    /* t_h=ax+bp[k] High */
-    GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
-    t_l = ax - (t_h-bp[k]);
-    s_l = v*((u-s_h*t_h)-s_h*t_l);
-
-
-    /* compute log(ax) */
-    s2 = s*s;
-    r = s2*s2*(L1+s2*L2);
-    r += s_l*(s_h+s);
-    s2  = s_h*s_h;
-    t_h = (float)3.0+s2+r;
-    GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-    t_l = r-((t_h-(float)3.0)-s2);
-    /* u+v = s*(1+...) */
-    u = s_h*t_h;
-    v = s_l*t_h+t_l*s;
-    /* 2/(3log2)*(s+...) */
-    p_h = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-    p_l = v-(p_h-u);
-    z_h = cp_h*p_h;   /* cp_h+cp_l = 2/(3*log2) */
-    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-    t = (float)n;
-    t1 = (((z_h+z_l)+dp_h[k])+t);
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
-  }
-
-  /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
-
-  float fy = (float)y;
-  float y3 = (float)(y-(int)fy);
-  GEN_OCL_GET_FLOAT_WORD(is,fy);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
-
-  p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
-  p_h = y1*t1;
-  z = p_l+p_h;
-
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if (j>0x43000000)       /* if z > 128 */
-      return sn*huge*huge;       /* overflow */
-  else if (j==0x43000000) {     /* if z == 128 */
-      if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
-  }
-  else if ((j&0x7fffffff)>0x43160000)   /* z <= -150 */
-      return sn*tiny*tiny;       /* underflow */
-  else if (j==0xc3160000){      /* z == -150 */
-      if(p_l<=z-p_h) return sn*tiny*tiny;    /* underflow */
-  }
-    /*
-     * compute 2**(p_h+p_l)
-     */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {    /* if |z| > 0.5, set n = [z+0.5] */
-      n = j+(0x00800000>>(k+1));
-      k = ((n&0x7fffffff)>>23)-0x7f;  /* new k for n */
-      GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-      n = ((n&0x007fffff)|0x00800000)>>(23-k);
-      if(j<0) n = -n;
-      p_h -= t;
-
-      z -= n;
-  }
-
-  t = z;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
-  u = t*lg2_h;
-  v = (p_l-(t-p_h))*lg2+t*lg2_l;
-  z = u+v;
-  w = v-(z-u);
-  t  = z*z;
-  t1  = z - t*(P1+t*P2);
-  r  = (z*t1)/(t1-two)-(w+z*w);
-  z  = one-(r-z);
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);  /* subnormal output */
-  else GEN_OCL_SET_FLOAT_WORD(z,j);
-  return sn*z;
+OVERLOADABLE half lgamma_r(half x, local int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
 }
-
-OVERLOADABLE float hypot(float x, float y) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_hypot(x, y);
-
-  //return __gen_ocl_sqrt(x*x + y*y);
-  float a,b,an,bn,cn;
-  int e;
-  if (isfinite (x) && isfinite (y)){      /* Determine absolute values.  */
-  x = __gen_ocl_fabs (x);
-  y = __gen_ocl_fabs (y);
-  /* Find the bigger and the smaller one.  */
-  a = max(x,y);
-  b = min(x,y);
-  /* Now 0 <= b <= a.  */
-  /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
-  an = frexp (a, &e);
-  bn = ldexp (b, - e);
-  /* Through the normalization, no unneeded overflow or underflow will occur here.  */
-  cn = __gen_ocl_sqrt (an * an + bn * bn);
-  return ldexp (cn, e);
-  }else{
-    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */
-      return INFINITY;
-    else        /* x or y is NaN.  Return NaN.  */
-      return x + y;
-  }
+OVERLOADABLE half lgamma_r(half x, private int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
 }
 
 #define BODY \
-  if (isnan(x)) { \
-    *p = x; \
-    return x; \
-  } \
-  *p = __gen_ocl_internal_floor(x); \
-  if (isinf(x)) { \
-    return x > 0 ? +0. : -0.; \
-  } \
-  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-OVERLOADABLE float fract(float x, global float *p) { BODY; }
-OVERLOADABLE float fract(float x, local float *p) { BODY; }
-OVERLOADABLE float fract(float x, private float *p) { BODY; }
-#undef BODY
-
-#define BODY \
   float Zero[2]; \
   int n,hx,hy,hz,ix,iy,sx,i,sy; \
   uint q,sxy; \
@@ -3256,523 +535,41 @@ OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
 OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
 #undef BODY
 
-OVERLOADABLE float powr(float x, float y) {
-  unsigned int hx, sx, hy, sy;
-
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_pow(x,y);
-  else {
-    if (isnan(x) || isnan(y)) return NAN;
-    GEN_OCL_GET_FLOAT_WORD(hx,x);
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    sx = (hx & 0x80000000) >> 31;
-    sy = (hy & 0x80000000) >> 31;
-
-    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
-      x = 0.0f;/* Gen does not support subnormal number now */
-      hx = hx &0x80000000;
-    }
-    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
-      y = 0.0;/* Gen does not support subnormal number now */
-      hy = hy &0x80000000;
-    }
-
-    // (x < 0) ** y = NAN (y!=0)
-    if ((sx && (hx & 0x7fffffff))) return NAN;
-
-    // +/-0 ** +/-0 = NAN
-    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
-
-    // +inf ** +/-0 = NAN
-    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
-
-    // others except nan/inf/0 ** 0 = 1.0
-    if (!(hy&0x7fffffff)) return 1.0f;
-
-    // +1 ** inf = NAN; +1 ** finite = 1;
-    if (hx == 0x3f800000) {
-      return isinf(y) ? NAN : 1.0f;
-    }
-
-    if ( !(hx & 0x7fffffff)) {
-        // +/-0 ** y<0 = +inf
-        // +/-0 ** y>0 = +0
-      return sy ? INFINITY : 0.0f;
-    }
-
-    return __gen_ocl_internal_pow(x,y);
-  }
-}
-
-OVERLOADABLE float pown(float x, int n) {
-  if (__ocl_math_fastpath_flag) {
-    if (x == 0.f && n == 0)
-      return 1.f;
-    if (x < 0.f && (n&1) )
-      return -powr(-x, n);
-    return powr(x, n);
-  } else {
-    int ix;
-    GEN_OCL_GET_FLOAT_WORD(ix, x);
-    float sign = ix < 0 ? -1.0f : 1.0f;
-    if (x == 0.0f) x = sign * 0.0f;
-
-    return __gen_ocl_internal_pown(x, n);
-  }
-}
-
-OVERLOADABLE float pow(float x, float y) {
-  if (!__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_pow(x,y);
-  else {
-    int n;
-    if (x == 0.f && y == 0.f)
-      return 1.f;
-    if (x >= 0.f)
-      return powr(x, y);
-    n = y;
-    if ((float)n == y)//is exact integer
-      return pown(x, n);
-    return NAN;
-  }
-}
-
-OVERLOADABLE float rootn(float x, int n) {
-  float ax,re;
-  int sign = 0;
-  int hx;
-  if( n == 0 )return NAN;
-
-  GEN_OCL_GET_FLOAT_WORD(hx, x);
-  // Gen does not support denorm, flush to zero
-  if ((hx & 0x7fffffff) < 0x00800000) {
-    x = hx < 0 ? -0.0f : 0.0f;
-  }
-
-  //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-  if( x < 0 && 0 == (n&1) )
-    return NAN;
-  if( x == 0.0 ){
-    switch( n & 0x80000001 ){
-      //rootn ( +-0,  n ) is +0 for even n > 0.
-      case 0:
-        return 0.0f;
-      //rootn ( +-0,  n ) is +-0 for odd n > 0.
-      case 1:
-        return x;
-      //rootn ( +-0,  n ) is +inf for even n < 0.
-      case 0x80000000:
-        return INFINITY;
-
-      //rootn ( +-0,  n ) is +-inf for odd n < 0.
-      case 0x80000001:
-        return __gen_ocl_internal_copysign(INFINITY, x);
-    }
-  }
-  ax = __gen_ocl_fabs(x);
-  if(x <0.0f && (n&1))
-    sign = 1;
-  if (__ocl_math_fastpath_flag)
-    re = __gen_ocl_pow(ax, 1.f/n);
-  else
-    re = __gen_ocl_internal_pow(ax,1.f/n);
-  if(sign)
-    re = -re;
-  return re;
-}
-
-OVERLOADABLE float fabs(float x) {
-  return __gen_ocl_internal_fabs(x);
-}
-
-OVERLOADABLE float trunc(float x) {
-  return  __gen_ocl_internal_trunc(x);
-}
-
-OVERLOADABLE float round(float x) {
-  return __gen_ocl_internal_round(x);
-}
-
-OVERLOADABLE float floor(float x) {
-  return __gen_ocl_internal_floor(x);
-}
-
-OVERLOADABLE float ceil(float x) {
-  return __gen_ocl_internal_ceil(x);
-}
-
-OVERLOADABLE float log(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log(x);
-
-  return  __gen_ocl_internal_log(x);
-}
-
-OVERLOADABLE float log2(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log2(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log2(x);
-
-  return  __gen_ocl_internal_log2(x);
-}
-
-OVERLOADABLE float log10(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log10(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log10(x);
-
-  return  __gen_ocl_internal_log10(x);
-}
+#define BODY \
+  *cosval = cos(x); \
+  return sin(x);
 
-OVERLOADABLE float exp(float x) {
+OVERLOADABLE float sincos(float x, global float *cosval) {
   if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_exp(x);
-
-  /* Use native instruction when it has enough precision */
-  if (x > -0x1.6p1 && x < 0x1.6p1)
-    return __gen_ocl_internal_fastpath_exp(x);
-
-  return  __gen_ocl_internal_exp(x);
-}
-
-OVERLOADABLE float exp2(float x) {
-  /* Use native instruction when it has enough precision, exp2 always */
-  return native_exp2(x);
+    return __gen_ocl_internal_fastpath_sincos(x, cosval);
+  BODY;
 }
-
-OVERLOADABLE float exp10(float x) {
+OVERLOADABLE float sincos(float x, local float *cosval) {
   if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_exp10(x);
-
-  return  __gen_ocl_internal_exp10(x);
+    return __gen_ocl_internal_fastpath_sincos(x, cosval);
+  BODY;
 }
-
-OVERLOADABLE float expm1(float x) {
+OVERLOADABLE float sincos(float x, private float *cosval) {
   if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_expm1(x);
-
-  return  __gen_ocl_internal_expm1(x);
-}
-
-OVERLOADABLE float fmin(float a, float b) {
-  return __gen_ocl_internal_fmin(a, b);
-}
-
-OVERLOADABLE float fmax(float a, float b) {
-  return __gen_ocl_internal_fmax(a, b);
-}
-
-OVERLOADABLE float fma(float a, float b, float c) {
-  return mad(a, b, c);
-}
-
-OVERLOADABLE float fdim(float x, float y) {
-  return __gen_ocl_internal_fdim(x, y);
-}
-
-OVERLOADABLE float maxmag(float x, float y) {
-  return __gen_ocl_internal_maxmag(x, y);
-}
-
-OVERLOADABLE float minmag(float x, float y) {
-  return __gen_ocl_internal_minmag(x, y);
+    return __gen_ocl_internal_fastpath_sincos(x, cosval);
+  BODY;
 }
+#undef BODY
 
-
-/* So far, the HW do not support half float math function.
-   We just do the conversion and call the float version here. */
-OVERLOADABLE half cospi(half x) {
-  float _x = (float)x;
-  return (half)cospi(_x);
-}
-OVERLOADABLE half cosh(half x) {
-  float _x = (float)x;
-  return (half)cosh(_x);
-}
-OVERLOADABLE half acos(half x) {
-  float _x = (float)x;
-  return (half)acos(_x);
-}
-OVERLOADABLE float half_cos(float x) {
-  return (float)cos(x);
-}
-OVERLOADABLE float half_divide(float x, float y) {
-  return (float)native_divide(x, y);
-}
-OVERLOADABLE float half_exp(float x) {
-  return (float)native_exp(x);
-}
-OVERLOADABLE float half_exp2(float x){
-  return (float)native_exp2(x);
-}
-OVERLOADABLE float half_exp10(float x){
-  return (float)native_exp10(x);
-}
-OVERLOADABLE float half_log(float x){
-  return (float)native_log(x);
-}
-OVERLOADABLE float half_log2(float x){
-  return (float)native_log2(x);
-}
-OVERLOADABLE float half_log10(float x){
-  return (float)native_log10(x);
-}
-OVERLOADABLE float half_powr(float x, float y){
-  return (float)powr(x, y);
-}
-OVERLOADABLE float half_recip(float x){
-  return (float)native_recip(x);
-}
-OVERLOADABLE float half_rsqrt(float x){
-  return (float)native_rsqrt(x);
-}
-OVERLOADABLE float half_sin(float x){
-  return (float)sin(x);
-}
-OVERLOADABLE float half_sqrt(float x){
-  return (float)native_sqrt(x);
-}
-OVERLOADABLE float half_tan(float x){
-  return (float)tan(x);
-}
-OVERLOADABLE half acospi(half x) {
-  float _x = (float)x;
-  return (half)acospi(_x);
-}
-OVERLOADABLE half acosh(half x) {
-  float _x = (float)x;
-  return (half)acosh(_x);
-}
-OVERLOADABLE half sinpi(half x) {
-  float _x = (float)x;
-  return (half)sinpi(_x);
-}
-OVERLOADABLE half sinh(half x) {
-  float _x = (float)x;
-  return (half)sinh(_x);
-}
-OVERLOADABLE half asin(half x) {
-  float _x = (float)x;
-  return (half)asin(_x);
-}
-OVERLOADABLE half asinpi(half x) {
-  float _x = (float)x;
-  return (half)asinpi(_x);
-}
-OVERLOADABLE half asinh(half x) {
-  float _x = (float)x;
-  return (half)asinh(_x);
-}
-OVERLOADABLE half tanpi(half x) {
-  float _x = (float)x;
-  return (half)tanpi(_x);
-}
-OVERLOADABLE half tanh(half x) {
-  float _x = (float)x;
-  return (half)tanh(_x);
-}
-OVERLOADABLE half atan(half x) {
-  float _x = (float)x;
-  return (half)atan(_x);
-}
-OVERLOADABLE half atan2(half y, half x) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)atan2(_x, _y);
-}
-OVERLOADABLE half atan2pi(half y, half x) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)atan2pi(_x, _y);
-}
-OVERLOADABLE half atanpi(half x) {
-  float _x = (float)x;
-  return (half)atanpi(_x);
-}
-OVERLOADABLE half atanh(half x) {
-  float _x = (float)x;
-  return (half)atanh(_x);
-}
-OVERLOADABLE half cbrt(half x) {
-  float _x = (float)x;
-  return (half)cbrt(_x);
-}
-OVERLOADABLE half rint(half x) {
-  float _x = (float)x;
-  return (half)rint(_x);
-}
-OVERLOADABLE half copysign(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)copysign(_x, _y);
-}
-OVERLOADABLE half erf(half x) {
-  float _x = (float)x;
-  return (half)erf(_x);
-}
-OVERLOADABLE half erfc(half x) {
-  float _x = (float)x;
-  return (half)erfc(_x);
-}
-OVERLOADABLE half fmod(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)fmod(_x, _y);
-}
-OVERLOADABLE half remainder(half x, half p) {
-  float _x = (float)x;
-  float _p = (float)p;
-  return (half)remainder(_x, _p);
-}
-OVERLOADABLE half ldexp(half x, int n) {
-  float _x = (float)x;
-  return (half)ldexp(_x, n);
-}
-OVERLOADABLE half powr(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)powr(_x, _y);
-}
-OVERLOADABLE half pow(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)pow(_x, _y);
-}
-//no pow, we use powr instead
-OVERLOADABLE half fabs(half x) {
-  float _x = (float)x;
-  return (half)fabs(_x);
-}
-OVERLOADABLE half trunc(half x) {
-  float _x = (float)x;
-  return (half)trunc(_x);
-}
-OVERLOADABLE half round(half x) {
-  float _x = (float)x;
-  return (half)round(_x);
-}
-OVERLOADABLE half floor(half x) {
-  float _x = (float)x;
-  return (half)floor(_x);
-}
-OVERLOADABLE half ceil(half x) {
-  float _x = (float)x;
-  return (half)ceil(_x);
-}
-OVERLOADABLE half log(half x) {
-  float _x = (float)x;
-  return (half)log(_x);
-}
-OVERLOADABLE half log2(half x) {
-  float _x = (float)x;
-  return (half)log2(_x);
-}
-OVERLOADABLE half log10(half x) {
-  float _x = (float)x;
-  return (half)log10(_x);
-}
-OVERLOADABLE half exp(half x) {
-  float _x = (float)x;
-  return (half)exp(_x);
-}
-OVERLOADABLE half exp10(half x) {
-  float _x = (float)x;
-  return (half)exp10(_x);
-}
-OVERLOADABLE half expm1(half x) {
-  float _x = (float)x;
-  return (half)expm1(_x);
-}
-OVERLOADABLE half fmin(half a, half b) {
-  return __gen_ocl_internal_fmin(a, b);
-}
-OVERLOADABLE half fmax(half a, half b) {
-  return __gen_ocl_internal_fmax(a, b);
-}
-OVERLOADABLE half fma(half a, half b, half c) {
-  float _a = (float)a;
-  float _b = (float)b;
-  float _c = (float)c;
-  return (half)fma(_a, _b, _c);
-}
-OVERLOADABLE half fdim(half x, half y) {
+OVERLOADABLE half remquo(half x, half y, global int *quo) {
   float _x = (float)x;
   float _y = (float)y;
-  return (half)fdim(_x, _y);
+  return (half)remquo(_x, _y, quo);
 }
-OVERLOADABLE half maxmag(half x, half y) {
+OVERLOADABLE half remquo(half x, half y, local int *quo) {
   float _x = (float)x;
   float _y = (float)y;
-  return (half)maxmag(_x, _y);
+  return (half)remquo(_x, _y, quo);
 }
-OVERLOADABLE half minmag(half x, half y) {
+OVERLOADABLE half remquo(half x, half y, private int *quo) {
   float _x = (float)x;
   float _y = (float)y;
-  return (half)minmag(_x, _y);
-}
-OVERLOADABLE half exp2(half x) {
-  float _x = (float)x;
-  return (half)exp2(_x);
-}
-OVERLOADABLE half mad(half a, half b, half c) {
-  return __gen_ocl_mad(a,b,c);
-}
-OVERLOADABLE half sin(half x) {
-  float _x = (float)x;
-  return (half)sin(_x);
-}
-OVERLOADABLE half cos(half x) {
-  float _x = (float)x;
-  return (half)cos(_x);
-}
-OVERLOADABLE half tan(half x) {
-  float _x = (float)x;
-  return (half)tan(_x);
-}
-OVERLOADABLE half tgamma(half x) {
-  float _x = (float)x;
-  return (half)tgamma(_x);
-}
-OVERLOADABLE half lgamma(half x) {
-  float _x = (float)x;
-  return (half)lgamma(_x);
-}
-OVERLOADABLE half lgamma_r(half x, global int *signgamp) {
-  float _x = (float)x;
-  return (half)lgamma_r(_x, signgamp);
-}
-OVERLOADABLE half lgamma_r(half x, local int *signgamp) {
-  float _x = (float)x;
-  return (half)lgamma_r(_x, signgamp);
-}
-OVERLOADABLE half lgamma_r(half x, private int *signgamp) {
-  float _x = (float)x;
-  return (half)lgamma_r(_x, signgamp);
-}
-OVERLOADABLE half log1p(half x) {
-  float _x = (float)x;
-  return (half)log1p(_x);
-}
-OVERLOADABLE half logb(half x) {
-  float _x = (float)x;
-  return (half)logb(_x);
-}
-OVERLOADABLE int ilogb(half x) {
-  float _x = (float)x;
-  return ilogb(_x);
-}
-OVERLOADABLE half nan(ushort code) {
-  return (half)NAN;
+  return (half)remquo(_x, _y, quo);
 }
 
 OVERLOADABLE half sincos(half x, global half *cosval) {
@@ -3797,109 +594,6 @@ OVERLOADABLE half sincos(half x, private half *cosval) {
   return ret;
 }
 
-OVERLOADABLE half sqrt(half x) {
-  float _x = (float)x;
-  return (half)sqrt(_x);
-}
-OVERLOADABLE half rsqrt(half x) {
-  float _x = (float)x;
-  return (half)rsqrt(_x);
-}
-OVERLOADABLE half frexp(half x, global int *exp) {
-  float _x = (float)x;
-  return (half)frexp(_x, exp);
-}
-OVERLOADABLE half frexp(half x, local int *exp) {
-  float _x = (float)x;
-  return (half)frexp(_x, exp);
-}
-OVERLOADABLE half frexp(half x, private int *exp) {
-  float _x = (float)x;
-  return (half)frexp(_x, exp);
-}
-OVERLOADABLE half nextafter(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)nextafter(_x, _y);
-}
-
-OVERLOADABLE half modf(half x, global half *i) {
-  float _x = (float)x;
-  float _i;
-  half ret = (half)modf(_x, &_i);
-  *i = (half)_i;
-  return ret;
-}
-OVERLOADABLE half modf(half x, local half *i) {
-  float _x = (float)x;
-  float _i;
-  half ret = (half)modf(_x, &_i);
-  *i = (half)_i;
-  return ret;
-}
-OVERLOADABLE half modf(half x, private half *i) {
-  float _x = (float)x;
-  float _i;
-  half ret = (half)modf(_x, &_i);
-  *i = (half)_i;
-  return ret;
-}
-
-OVERLOADABLE half hypot(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)hypot(_x, _y);
-}
-
-OVERLOADABLE half fract(half x, global half *p) {
-  float _x = (float)x;
-  float _p;
-  half ret = (half)fract(_x, &_p);
-  *p = (half)_p;
-  return ret;
-}
-OVERLOADABLE half fract(half x, local half *p) {
-  float _x = (float)x;
-  float _p;
-  half ret = (half)fract(_x, &_p);
-  *p = (half)_p;
-  return ret;
-}
-OVERLOADABLE half fract(half x, private half *p) {
-  float _x = (float)x;
-  float _p;
-  half ret = (half)fract(_x, &_p);
-  *p = (half)_p;
-  return ret;
-}
-
-OVERLOADABLE half remquo(half x, half y, global int *quo) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)remquo(_x, _y, quo);
-}
-OVERLOADABLE half remquo(half x, half y, local int *quo) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)remquo(_x, _y, quo);
-}
-OVERLOADABLE half remquo(half x, half y, private int *quo) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)remquo(_x, _y, quo);
-}
-
-OVERLOADABLE half pown(half x, int n) {
-  float _x = (float)x;
-  return (half)pown(_x, n);
-}
-OVERLOADABLE half rootn(half x, int n) {
-  float _x = (float)x;
-  return (half)rootn(_x, n);
-}
-
-
-
 //-----------------double -----------------------
 INLINE int  __HI(double x){
     long x64 = as_long(x);
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
index ca11b25d..cafb7678 100644
--- a/backend/src/libocl/tmpl/ocl_math.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -21,217 +21,47 @@
 #include "ocl_types.h"
 #include "ocl_math_common.h"
 
-OVERLOADABLE float cospi(float x);
-OVERLOADABLE float cosh(float x);
-OVERLOADABLE float acos(float x);
-OVERLOADABLE float acospi(float x);
-OVERLOADABLE float acosh(float x);
-OVERLOADABLE float sinpi(float x);
-OVERLOADABLE float sinh(float x);
-OVERLOADABLE float asin(float x);
-OVERLOADABLE float asinpi(float x);
-OVERLOADABLE float asinh(float x);
-OVERLOADABLE float tanpi(float x);
-OVERLOADABLE float tanh(float x);
-OVERLOADABLE float atan(float x);
-OVERLOADABLE float atan2(float y, float x);
-OVERLOADABLE float atan2pi(float y, float x);
-OVERLOADABLE float atanpi(float x);
-OVERLOADABLE float atanh(float x);
-OVERLOADABLE float cbrt(float x);
-OVERLOADABLE float rint(float x);
-OVERLOADABLE float copysign(float x, float y);
-OVERLOADABLE float erf(float x);
-OVERLOADABLE float erfc(float x);
-OVERLOADABLE float fmod (float x, float y);
-OVERLOADABLE float remainder(float x, float p);
-OVERLOADABLE float ldexp(float x, int n);
-OVERLOADABLE float powr(float x, float y);
-OVERLOADABLE float pow(float x, float y);
-//no pow, we use powr instead
-OVERLOADABLE float fabs(float x);
-OVERLOADABLE float trunc(float x);
-OVERLOADABLE float round(float x);
-OVERLOADABLE float floor(float x);
-OVERLOADABLE float ceil(float x);
-OVERLOADABLE float log(float x);
-OVERLOADABLE float log2(float x);
-OVERLOADABLE float log10(float x);
-OVERLOADABLE float exp(float x);
-OVERLOADABLE float exp10(float x);
-OVERLOADABLE float expm1(float x);
-OVERLOADABLE float fmin(float a, float b);
-OVERLOADABLE float fmax(float a, float b);
-OVERLOADABLE float fma(float a, float b, float c);
-OVERLOADABLE float fdim(float x, float y);
-OVERLOADABLE float maxmag(float x, float y);
-OVERLOADABLE float minmag(float x, float y);
-OVERLOADABLE float exp2(float x);
-OVERLOADABLE float mad(float a, float b, float c);
-OVERLOADABLE float sin(float x);
-OVERLOADABLE float cos(float x);
-OVERLOADABLE float tan(float x);
-OVERLOADABLE float tgamma(float x);
-OVERLOADABLE float lgamma(float x);
+
 OVERLOADABLE float lgamma_r(float x, global int *signgamp);
 OVERLOADABLE float lgamma_r(float x, local int *signgamp);
 OVERLOADABLE float lgamma_r(float x, private int *signgamp);
-OVERLOADABLE float log1p(float x);
-OVERLOADABLE float logb(float x);
-OVERLOADABLE int ilogb(float x);
-OVERLOADABLE float nan(uint code);
 OVERLOADABLE float sincos(float x, global float *cosval);
 OVERLOADABLE float sincos(float x, local float *cosval);
 OVERLOADABLE float sincos(float x, private float *cosval);
-OVERLOADABLE float sqrt(float x);
-OVERLOADABLE float rsqrt(float x);
 OVERLOADABLE float frexp(float x, global int *exp);
 OVERLOADABLE float frexp(float x, local int *exp);
 OVERLOADABLE float frexp(float x, private int *exp);
-OVERLOADABLE float nextafter(float x, float y);
 OVERLOADABLE float modf(float x, global float *i);
 OVERLOADABLE float modf(float x, local float *i);
 OVERLOADABLE float modf(float x, private float *i);
-OVERLOADABLE float hypot(float x, float y);
 OVERLOADABLE float fract(float x, global float *p);
 OVERLOADABLE float fract(float x, local float *p);
 OVERLOADABLE float fract(float x, private float *p);
 OVERLOADABLE float remquo(float x, float y, global int *quo);
 OVERLOADABLE float remquo(float x, float y, local int *quo);
 OVERLOADABLE float remquo(float x, float y, private int *quo);
-OVERLOADABLE float pown(float x, int n);
-OVERLOADABLE float rootn(float x, int n);
-
-// native
-OVERLOADABLE float native_cos(float x);
-OVERLOADABLE float native_divide(float x, float y);
-OVERLOADABLE float native_exp(float x);
-OVERLOADABLE float native_exp2(float x);
-OVERLOADABLE float native_exp10(float x);
-OVERLOADABLE float native_log(float x);
-OVERLOADABLE float native_log2(float x);
-OVERLOADABLE float native_log10(float x);
-OVERLOADABLE float native_powr(float x, float y);
-OVERLOADABLE float native_recip(float x);
-OVERLOADABLE float native_rsqrt(float x);
-OVERLOADABLE float native_sin(float x);
-OVERLOADABLE float native_sqrt(float x);
-OVERLOADABLE float native_tan(float x);
 
 
-// Half float version.
-OVERLOADABLE half cospi(half x);
-OVERLOADABLE half cosh(half x);
-OVERLOADABLE half acos(half x);
-OVERLOADABLE half acospi(half x);
-OVERLOADABLE half acosh(half x);
-OVERLOADABLE half sinpi(half x);
-OVERLOADABLE half sinh(half x);
-OVERLOADABLE half asin(half x);
-OVERLOADABLE half asinpi(half x);
-OVERLOADABLE half asinh(half x);
-OVERLOADABLE half tanpi(half x);
-OVERLOADABLE half tanh(half x);
-OVERLOADABLE half atan(half x);
-OVERLOADABLE half atan2(half y, half x);
-OVERLOADABLE half atan2pi(half y, half x);
-OVERLOADABLE half atanpi(half x);
-OVERLOADABLE half atanh(half x);
-OVERLOADABLE half cbrt(half x);
-OVERLOADABLE half rint(half x);
-OVERLOADABLE half copysign(half x, half y);
-OVERLOADABLE half erf(half x);
-OVERLOADABLE half erfc(half x);
-OVERLOADABLE half fmod (half x, half y);
-OVERLOADABLE half remainder(half x, half p);
-OVERLOADABLE half ldexp(half x, int n);
-OVERLOADABLE half powr(half x, half y);
-OVERLOADABLE half pow(half x, half y);
-//no pow, we use powr instead
-OVERLOADABLE half fabs(half x);
-OVERLOADABLE half trunc(half x);
-OVERLOADABLE half round(half x);
-OVERLOADABLE half floor(half x);
-OVERLOADABLE half ceil(half x);
-OVERLOADABLE half log(half x);
-OVERLOADABLE half log2(half x);
-OVERLOADABLE half log10(half x);
-OVERLOADABLE half exp(half x);
-OVERLOADABLE half exp10(half x);
-OVERLOADABLE half expm1(half x);
-OVERLOADABLE half fmin(half a, half b);
-OVERLOADABLE half fmax(half a, half b);
-OVERLOADABLE half fma(half a, half b, half c);
-OVERLOADABLE half fdim(half x, half y);
-OVERLOADABLE half maxmag(half x, half y);
-OVERLOADABLE half minmag(half x, half y);
-OVERLOADABLE half exp2(half x);
-OVERLOADABLE half mad(half a, half b, half c);
-OVERLOADABLE half sin(half x);
-OVERLOADABLE half cos(half x);
-OVERLOADABLE half tan(half x);
-OVERLOADABLE half tgamma(half x);
-OVERLOADABLE half lgamma(half x);
 OVERLOADABLE half lgamma_r(half x, global int *signgamp);
 OVERLOADABLE half lgamma_r(half x, local int *signgamp);
 OVERLOADABLE half lgamma_r(half x, private int *signgamp);
-OVERLOADABLE half log1p(half x);
-OVERLOADABLE half logb(half x);
-OVERLOADABLE int ilogb(half x);
-OVERLOADABLE half nan(ushort code);
 OVERLOADABLE half sincos(half x, global half *cosval);
 OVERLOADABLE half sincos(half x, local half *cosval);
 OVERLOADABLE half sincos(half x, private half *cosval);
-OVERLOADABLE half sqrt(half x);
-OVERLOADABLE half rsqrt(half x);
 OVERLOADABLE half frexp(half x, global int *exp);
 OVERLOADABLE half frexp(half x, local int *exp);
 OVERLOADABLE half frexp(half x, private int *exp);
-OVERLOADABLE half nextafter(half x, half y);
 OVERLOADABLE half modf(half x, global half *i);
 OVERLOADABLE half modf(half x, local half *i);
 OVERLOADABLE half modf(half x, private half *i);
-OVERLOADABLE half hypot(half x, half y);
 OVERLOADABLE half fract(half x, global half *p);
 OVERLOADABLE half fract(half x, local half *p);
 OVERLOADABLE half fract(half x, private half *p);
 OVERLOADABLE half remquo(half x, half y, global int *quo);
 OVERLOADABLE half remquo(half x, half y, local int *quo);
 OVERLOADABLE half remquo(half x, half y, private int *quo);
-OVERLOADABLE half pown(half x, int n);
-OVERLOADABLE half rootn(half x, int n);
 
-// native half
-OVERLOADABLE half native_cos(half x);
-OVERLOADABLE half native_divide(half x, half y);
-OVERLOADABLE half native_exp(half x);
-OVERLOADABLE half native_exp2(half x);
-OVERLOADABLE half native_exp10(half x);
-OVERLOADABLE half native_log(half x);
-OVERLOADABLE half native_log2(half x);
-OVERLOADABLE half native_log10(half x);
-OVERLOADABLE half native_powr(half x, half y);
-OVERLOADABLE half native_recip(half x);
-OVERLOADABLE half native_rsqrt(half x);
-OVERLOADABLE half native_sin(half x);
-OVERLOADABLE half native_sqrt(half x);
-OVERLOADABLE half native_tan(half x);
 
-// half accuracy
-OVERLOADABLE float half_cos(float x);
-OVERLOADABLE float half_divide(float x, float y);
-OVERLOADABLE float half_exp(float x);
-OVERLOADABLE float half_exp2(float x);
-OVERLOADABLE float half_exp10(float x);
-OVERLOADABLE float half_log(float x);
-OVERLOADABLE float half_log2(float x);
-OVERLOADABLE float half_log10(float x);
-OVERLOADABLE float half_powr(float x, float y);
-OVERLOADABLE float half_recip(float x);
-OVERLOADABLE float half_rsqrt(float x);
-OVERLOADABLE float half_sin(float x);
-OVERLOADABLE float half_sqrt(float x);
-OVERLOADABLE float half_tan(float x);
 
 //------- double -----------
 OVERLOADABLE double fract(double x, global double *p);
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
index ba35ddd5..c6b52a31 100644
--- a/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.cl
@@ -37,414 +37,200 @@ CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
 CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
 CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
 
+#define BODY \
+  if (isnan(x)) { \
+    *p = x; \
+    return x; \
+  } \
+  *p = __gen_ocl_rndd(x); \
+  if (isinf(x)) { \
+    return x > 0 ? +0. : -0.; \
+  } \
+  return min(x - *p, 0x1.FFFFFep-1F);
+OVERLOADABLE float fract(float x, float *p) { BODY; }
+#undef BODY
 
-/* native functions */
-OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
-OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
-OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
-OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
-OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
-OVERLOADABLE float native_log(float x) {
-  return native_log2(x) * 0.6931472002f;
-}
-OVERLOADABLE float native_log10(float x) {
-  return native_log2(x) * 0.3010299956f;
-}
-OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
-OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
-OVERLOADABLE float native_tan(float x) {
-  return native_sin(x) / native_cos(x);
-}
-OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
-OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
-OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
-OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+#define BODY \
+  float Zero[2]; \
+  int n,hx,hy,hz,ix,iy,sx,i,sy; \
+  uint q,sxy; \
+  Zero[0] = 0.0;Zero[1] = -0.0; \
+  if (x == 0.0f) { x = 0.0f; }; \
+  if (y == 0.0f) { y = 0.0f; }\
+  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+  hx ^=sx; hy &= 0x7fffffff; \
+  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
+  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
+    *quo = 0;return NAN; \
+  } \
+  if( hy == 0x7F800000 || hx == 0 ) { \
+    *quo = 0;return x; \
+  } \
+  if( hx == hy ) { \
+    *quo = (x == y) ? 1 : -1; \
+    return sx ? -0.0 : 0.0; \
+  } \
+  if(hx<hy) { \
+    q = 0; \
+    goto fixup; \
+  } else if(hx==hy) { \
+    *quo = (sxy ? -1 : 1); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  ix = (hx>>23)-127; \
+  iy = (hy>>23)-127; \
+  hx = 0x00800000|(0x007fffff&hx); \
+  hy = 0x00800000|(0x007fffff&hy); \
+  n = ix - iy; \
+  q = 0; \
+  while(n--) { \
+    hz=hx-hy; \
+    if(hz<0) hx = hx << 1; \
+    else {hx = hz << 1; q++;} \
+    q <<= 1; \
+  } \
+  hz=hx-hy; \
+  if(hz>=0) {hx=hz;q++;} \
+  if(hx==0) { \
+    q &= 0x0000007f; \
+    *quo = (sxy ? -q : q); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  while(hx<0x00800000) { \
+    hx <<= 1;iy -= 1; \
+  } \
+  if(iy>= -126) { \
+    hx = ((hx-0x00800000)|((iy+127)<<23)); \
+  } else {\
+    n = -126 - iy; \
+    hx >>= n; \
+  } \
+fixup: \
+  GEN_OCL_SET_FLOAT_WORD(x,hx); \
+  if(hx<0x00800000){ \
+    GEN_OCL_GET_FLOAT_WORD(hy,y); \
+    hy &= 0x7fffffff; \
+    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+    x = 0; \
+  }else{ \
+    y = __gen_ocl_fabs(y); \
+    if (y < 0x1p-125f) { \
+      if (x+x>y || (x+x==y && (q & 1))) { \
+        q++;x-=y; \
+      } \
+    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
+      q++;x-=y; \
+    } \
+    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
+  } \
+  int sign = sx==sy?0:1; \
+  q &= 0x0000007f; \
+  *quo = (sign ? -q : q); \
+  return x;
 
-/* Fast path */
-OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
-    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
-    return native_log(x + native_sqrt(x * x + 1));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
-    return 0.5f * native_log((1 + x) / (1 - x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
-    return __gen_ocl_pow(x, 0.3333333333f);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
-    return native_cos(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
-    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
-    return __gen_ocl_cos(x * M_PI_F);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
-    return native_exp(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
-    return native_exp10(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
-    return __gen_ocl_pow(M_E_F, x) - 1;
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
-    return x-y*__gen_ocl_rndz(x/y);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
-    return __gen_ocl_sqrt(x*x + y*y);
-}
-OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
-    return __gen_ocl_rndd(native_log2(x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
-    return __gen_ocl_pow(2, n) * x;
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
-    return native_log(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
-    return native_log2(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
-    return native_log10(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
-    return native_log(x + 1);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
-    return __gen_ocl_rndd(native_log2(x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
-    return x-y*__gen_ocl_rnde(x/y);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
-    return __gen_ocl_pow(x, 1.f / n);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
-    return native_sin(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, float *cosval) {
-    *cosval = native_cos(x);
-    return native_sin(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
-    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
-    return __gen_ocl_sin(x * M_PI_F);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
-    return native_tan(x);
-}
-OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
-    float y = native_exp(-2 * x);
-    return (1 - y) / (1 + y);
+OVERLOADABLE float remquo(float x, float y, int *quo) {
+	BODY;
 }
+#undef BODY
 
+#define BODY \
+  if (isnan(x) || isinf(x)) { \
+    *exp = 0; \
+    return x; \
+  } \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    *exp = 0; \
+    return x; \
+  } \
+  if (a >= 0x800000) { \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+OVERLOADABLE float frexp(float x, int *exp) { BODY; }
+#undef BODY
 
-/* Internal implement, high accuracy. */
-OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
-  union { unsigned u; float f; } ux, uy;
-  ux.f = x;
-  uy.f = y;
-  ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
-  return ux.f;
+OVERLOADABLE half fract(half x, half *p) {
+  float _x = (float)x;
+  float _p;
+  half ret = (half)fract(_x, &_p);
+  *p = (half)_p;
+  return ret;
 }
 
-OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  union { unsigned int i; float f; } u;
-  const float
-  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =  3.355443200e+07, /* 0x4c000000 */
-  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lg3 = 2.8571429849e-01, /* 3E924925 */
-  Lg4 = 2.2222198546e-01; /* 3E638E29 */
-
-  const float zero   =  0.0;
-  float fsq, f, s, z, R, w, t1, t2, partial;
-  int k, ix, i, j;
-
-  u.f = x;  ix = u.i;
-  k = 0;
-
-  k += (ix>>23) - 127;
-  ix &= 0x007fffff;
-  i = (ix + (0x95f64<<3)) & 0x800000;
-  u.i = ix | (i^0x3f800000); x = u.f;
-  k += (i>>23);
-  f = x - 1.0f;
-  fsq = f * f;
-
-  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
-      R = fsq * (0.5f - 0.33333333333333333f * f);
-      return k * ln2_hi + k * ln2_lo + f - R;
-  }
-
-  s = f / (2.0f + f);
-  z = s * s;
-  i = ix - (0x6147a << 3);
-  w = z * z;
-  j = (0x6b851 << 3) - ix;
-  t1= w * mad(w, Lg4, Lg2);
-  t2= z * mad(w, Lg3, Lg1);
-  i |= j;
-  R = t2 + t1;
-  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
-
-  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
+OVERLOADABLE half remquo(half x, half y, int *quo) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)remquo(_x, _y, quo);
 }
 
-OVERLOADABLE float __gen_ocl_internal_log(float x)
-{
-  union { unsigned int i; float f; } u;
-  u.f = x;
-  int ix = u.i;
-
-  if (ix < 0 )
-	return NAN;  /* log(-#) = NaN */
-  if (ix >= 0x7f800000)
-    return NAN;
-
-  return __gen_ocl_internal_log_valid(x);
+OVERLOADABLE half modf(half x, half *i) {
+  float _x = (float)x;
+  float _i;
+  half ret = (half)modf(_x, &_i);
+  *i = (half)_i;
+  return ret;
 }
 
-OVERLOADABLE float __gen_ocl_internal_log10(float x)
-{
-  union { float f; unsigned i; } u;
-  const float
-  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
-  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
-  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
-
-  float y, z;
-  int i, k, hx;
-
-  u.f = x; hx = u.i;
-
-  if (hx<0)
-    return NAN; /* log(-#) = NaN */
-  if (hx >= 0x7f800000)
-    return NAN;
-
-  k = (hx >> 23) - 127;
-  i  = ((unsigned)k & 0x80000000) >> 31;
-  hx = (hx&0x007fffff) | ((0x7f-i) << 23);
-  y  = (float)(k + i);
-  u.i = hx; x = u.f;
-
-  return  y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
+OVERLOADABLE half frexp(half x, int *exp) {
+  float _x = (float)x;
+  return (half)frexp(_x, exp);
 }
 
-
-OVERLOADABLE float __gen_ocl_internal_log2(float x)
-{
-  const float zero   =  0.0,
-  invln2 = 0x1.715476p+0f;
-  int ix;
-
-  union { float f; int i; } u;
-  u.f = x; ix = u.i;
-
-  if (ix < 0)
-	return NAN;    /** log(-#) = NaN */
-  if (ix >= 0x7f800000)
-	return NAN;
-
-  return invln2 * __gen_ocl_internal_log_valid(x);
+OVERLOADABLE half sincos(half x, half *cosval) {
+  float _x = (float)x;
+  float _cosval;
+  half ret = (half)sincos(_x, &_cosval);
+  *cosval = (half)_cosval;
+  return ret;
 }
 
-
-float __gen_ocl_scalbnf (float x, int n){
-  /* copy from fdlibm */
-  float two25 = 3.355443200e+07,	/* 0x4c000000 */
-  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
-  huge = 1.0e+30,
-  tiny = 1.0e-30;
-  int k,ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  k = (ix&0x7f800000)>>23; /* extract exponent */
-  if (k==0) {	/* 0 or subnormal x */
-    if ((ix&0x7fffffff)==0) return x; /* +-0 */
-    x *= two25;
-    GEN_OCL_GET_FLOAT_WORD(ix,x);
-    k = ((ix&0x7f800000)>>23) - 25;
-  }
-  if (k==0xff) return x+x;	/* NaN or Inf */
-  if (n< -50000)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  if (n> 50000 || k+n > 0xfe)
-    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
-  /* Now k and n are bounded we know that k = k+n does not overflow. */
-  k = k+n;
-  if (k > 0) { /* normal result */
-    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-    return x;
-  }
-  if (k <= -25)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  k += 25;				/* subnormal result */
-  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-  return x*twom25;
+OVERLOADABLE half lgamma_r(half x, int *signgamp) {
+  float _x = (float)x;
+  return (half)lgamma_r(_x, signgamp);
 }
 
-const __constant unsigned int two_over_pi[] = {
-0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
-0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
-0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
-};
-
-// The main idea is from "Radian Reduction for Trigonometric Functions"
-// written by Mary H. Payne and Robert N. Hanek. Also another reference
-// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
-// written by Roger Alan Smith, who gave the worst case in this paper.
-// for single float, worst x = 0x1.47d0fep34, and there are 29 bit
-// leading zeros in the fraction part of x*(2.0/pi). so we need at least
-// 29 (leading zero)+ 24 (fraction )+12 (integer) + guard bits. that is,
-// 65 + guard bits, as we calculate in 12*7 = 84bits, which means we have
-// about 19 guard bits. If we need further precision, we may need more
-// guard bits
-// Note we place two 0 in two_over_pi, which is used to handle input less
-// than 0x1.0p23
-
-int payne_hanek(float x, float *y) {
-  union { float f; unsigned u;} ieee;
-  ieee.f = x;
-  unsigned u = ieee.u;
-  int k = ((u & 0x7f800000) >> 23)-127;
-  int ma = (u & 0x7fffff) | 0x800000;
-  unsigned  high, low;
-  high = (ma & 0xfff000) >> 12;
-  low = ma & 0xfff;
-
-  // Two tune below macro, you need to fully understand the algorithm
-#define CALC_BLOCKS 7
-#define ZERO_BITS 2
-
-  unsigned result[CALC_BLOCKS];
-
-  // round down, note we need 2 bits integer precision
-  int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
+#define BODY \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
+  if (ix > 0x7F800000) { \
+    *i = nan(0u); \
+    return nan(0u); \
+  } \
+  if (ix == 0x7F800000) { \
+    *i = x; \
+    return as_float(hx & 0x80000000u); \
+  } \
+  *i = __gen_ocl_rndz(x); \
+  return x - *i;
+OVERLOADABLE float modf(float x, float *i) { BODY; }
+#undef BODY
 
-  for (int i = 0; i < CALC_BLOCKS; i++) {
-    result[i] =  low * two_over_pi[index+i+ZERO_BITS] ;
-    result[i] +=  high * two_over_pi[index+i+1+ZERO_BITS];
-  }
+#define BODY \
+  *cosval = cos(x); \
+  return sin(x);
 
-  for (int i = CALC_BLOCKS-1; i > 0; i--) {
-    int temp = result[i] >> 12;
-    result[i]  -= temp << 12;
-    result[i-1] += temp;
+OVERLOADABLE float sincos(float x, float *cosval) {
+  if (__ocl_math_fastpath_flag)
+  {
+    *cosval = native_cos(x);
+    return native_sin(x);
   }
-#undef CALC_BLOCKS
-#undef ZERO_BITS
-
-  // get number of integer digits in result[0], note we only consider 12 valid bits
-  // and also it means the fraction digits in result[0] is (12-intDigit)
-
-  int intDigit = index*(-12) + (k-23);
-
-  // As the integer bits may be all included in result[0], and also maybe
-  // some bits in result[0], and some in result[1]. So we merge succesive bits,
-  // which makes easy coding.
-
-  unsigned b0 = (result[0] << 12) | result[1];
-  unsigned b1 = (result[2] << 12) | result[3];
-  unsigned b2 = (result[4] << 12) | result[5];
-  unsigned b3 = (result[6] << 12);
-
-  unsigned intPart = b0 >> (24-intDigit);
-
-  unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
-  unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
-  unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
-
-  // larger than 0.5? which mean larger than pi/4, we need
-  // transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
-  int largerPiBy4 = ((fract1 & 0x800000) != 0);
-  int sign = largerPiBy4 ? 1 : 0;
-  intPart = largerPiBy4 ? (intPart+1) : intPart;
-
-  fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
-  fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
-  fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
-
-  int leadingZero = (fract1 == 0);
-
-  // +1 is for the hidden bit 1 in floating-point format
-  int exponent = leadingZero ? -(24+1) : -(0+1);
-
-  fract1 = leadingZero ? fract2 : fract1;
-  fract2 = leadingZero ? fract3 : fract2;
-
-  // fract1 may have leading zeros, add it
-  int shift = clz(fract1)-8;
-  exponent += -shift;
-
-  float pio2 = 0x1.921fb6p+0;
-  unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
-
-  // we know that denormal number will not appear here
-  ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
-  *y = ieee.f * pio2;
-  return intPart;
-}
 
-int argumentReduceSmall(float x, float * remainder) {
-  union {
-    float f;
-    unsigned u;
-  } ieee;
-
-  float twoByPi = 2.0f/3.14159265f;
-  float piBy2_1h = (float) 0xc90/0x1.0p11,
-        piBy2_1l = (float) 0xfda/0x1.0p23,
-        piBy2_2h = (float) 0xa22/0x1.0p35,
-        piBy2_2l = (float) 0x168/0x1.0p47,
-        piBy2_3h = (float) 0xc23/0x1.0p59,
-        piBy2_3l = (float) 0x4c4/0x1.0p71;
-
-  float y = (float)(int)(twoByPi * x + 0.5f);
-  ieee.f = y;
-  ieee.u = ieee.u & 0xfffff000;
-
-  float yh = ieee.f;
-  float yl = y - yh;
-  float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
-  rem = rem - yh*piBy2_2h - yh*piBy2_2l + yl*piBy2_2h + yl*piBy2_2l;
-  rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
-
-  *remainder = rem;
-  return (int)y;
-}
-
-
-int __ieee754_rem_pio2f(float x, float *y) {
-  if (x < 4000.0f) {
-    return argumentReduceSmall(x, y);
-  } else {
-    return payne_hanek(x, y);
-  }
+  BODY;
 }
+#undef BODY
 
-OVERLOADABLE float __kernel_sinf(float x)
+OVERLOADABLE float __kernel_sinf_20(float x)
 {
   /* copied from fdlibm */
   const float
@@ -460,7 +246,7 @@ OVERLOADABLE float __kernel_sinf(float x)
   return mad(v, r, x);
 }
 
-float __kernel_cosf(float x, float y)
+float __kernel_cosf_20(float x, float y)
 {
   /* copied from fdlibm */
   const float
@@ -485,414 +271,38 @@ float __kernel_cosf(float x, float y)
   }
 }
 
-OVERLOADABLE float sin(float x)
-{
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sin(x);
-
-  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
-  float y,z=0.0;
-  int n, ix;
-
-  float negative = x < 0.0f? -1.0f : 1.0f;
-  x = fabs(x);
-
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;
-
-    /* sin(Inf or NaN) is NaN */
-  if (ix >= 0x7f800000) return x-x;
-
-  if(x <= pio4)
-	  return negative * __kernel_sinf(x);
-  /* argument reduction needed */
-  else {
-      n = __ieee754_rem_pio2f(x,&y);
-      float s = __kernel_sinf(y);
-      float c = __kernel_cosf(y,0.0f);
-      float ret = (n&1) ? negative*c : negative*s;
-      return (n&3)> 1? -1.0f*ret : ret;
-  }
-}
-
-OVERLOADABLE float cos(float x)
-{
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cos(x);
-
-  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
-  float y,z=0.0;
-  int n, ix;
-  x = __gen_ocl_fabs(x);
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-
-  ix &= 0x7fffffff;
-
-    /* cos(Inf or NaN) is NaN */
-  if (ix >= 0x7f800000) return x-x;
-
-  if(x <= pio4)
-	  return __kernel_cosf(x, 0.f);
-  /* argument reduction needed */
-  else {
-      n = __ieee754_rem_pio2f(x,&y);
-      n &= 3;
-      float c = __kernel_cosf(y, 0.0f);
-      float s = __kernel_sinf(y);
-      float v = (n&1) ? s : c;
-      /* n&3   return
-          0    cos(y)
-          1   -sin(y)
-          2   -cos(y)
-          3    sin(y)
-      */
-      int mask = (n>>1) ^ n;
-      float sign = (mask&1) ? -1.0f : 1.0f;
-      return sign * v;
-  }
-}
-
-float __kernel_tanf(float x, float y, int iy)
-{
-  /* copied from fdlibm */
-        float z,r,v,w,s;
-        int ix,hx;
-        const float
-        one   =  1.0000000000e+00, /* 0x3f800000 */
-        pio4  =  7.8539812565e-01, /* 0x3f490fda */
-        pio4lo=  3.7748947079e-08; /* 0x33222168 */
-        float T[13];// =  {
-         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
-         T[1] = 1.3333334029e-01; /* 0x3e088889 */
-         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
-         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
-         T[4] = 8.8632395491e-03; /* 0x3c11371f */
-         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
-         T[6] = 1.4562094584e-03; /* 0x3abede48 */
-         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
-
-        GEN_OCL_GET_FLOAT_WORD(hx,x);
-        ix = hx&0x7fffffff;     /* high word of |x| */
-        if(ix<0x31800000)                       /* x < 2**-28 */
-            {if((int)x==0) {                    /* generate inexact */
-                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
-                else return (iy==1)? x: -one/x;
-            }
-            }
-        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
-            if(hx<0) {x = -x; y = -y;}
-            z = pio4-x;
-            w = pio4lo-y;
-            x = z+w; y = 0.0;
-        }
-        z       =  x*x;
-        w       =  z*z;
-		/* Break x^5*(T[1]+x^2*T[2]+...) into
-		 *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
-		 *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
-		 */
-
-        r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
-        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
-
-        s = z*x;
-        r = mad(z, mad(s, r + v, y), y);
-        r += T[0]*s;
-        w = x+r;
-        if(ix>=0x3f2ca140) {
-            v = (float)iy;
-            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
-        }
-        if(iy==1) return w;
-        else
-        	return -1.0/(x+r);
-}
-
-OVERLOADABLE float tan(float x)
-{
-    if (__ocl_math_fastpath_flag)
-      return __gen_ocl_internal_fastpath_tan(x);
-
-    float y,z=0.0;
-    int n, ix;
-    float negative = x < 0.0f? -1.0f : 1.0f;
-    x = negative * x;
-
-    GEN_OCL_GET_FLOAT_WORD(ix,x);
-
-    ix &= 0x7fffffff;
-
-    /* tan(Inf or NaN) is NaN */
-    if (ix>=0x7f800000) return x-x;            /* NaN */
-
-    /* argument reduction needed */
-    else {
-      n = __ieee754_rem_pio2f(x,&y);
-      return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /*   1 -- n even
-                                                              -1 -- n odd */
-    }
-}
-
-OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
-  int ix;
-  if(isinf(x) || isnan(x)) { return NAN; }
-  if(x < 0.0f) { x = -x; }
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(x> 0x1.0p24) return 1.0f;
-  float m = __gen_ocl_internal_floor(x);
-  ix = (int)m;
-  m = x-m;
-  if((ix&0x1) != 0) m+=1.0f;
-    ix = __gen_ocl_internal_floor(m*4.0f);
-
-  switch(ix) {
-   case 0:
-    return __kernel_cosf(m*M_PI_F, 0.0f);
-   case 1:
-   case 2:
-    return __kernel_sinf((0.5f-m)*M_PI_F);
-   case 3:
-   case 4:
-    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
-   case 5:
-   case 6:
-    return __kernel_sinf((m-1.5f)*M_PI_F);
-   default:
-    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
-   }
-}
-
-OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+OVERLOADABLE float __gen_ocl_internal_floor_20(float x) { return __gen_ocl_rndd(x); }
+OVERLOADABLE float __gen_ocl_internal_sinpi_20(float x) {
   float sign = 1.0f;
   int ix;
   if(isinf(x)) return NAN;
   if(x < 0.0f) { x = -x; sign = -1.0f; }
   GEN_OCL_GET_FLOAT_WORD(ix, x);
   if(x> 0x1.0p24) return 0.0f;
-  float m = __gen_ocl_internal_floor(x);
+  float m = __gen_ocl_internal_floor_20(x);
   ix = (int)m;
   m = x-m;
   if((ix&0x1) != 0) m+=1.0f;
-    ix = __gen_ocl_internal_floor(m*4.0f);
+    ix = __gen_ocl_internal_floor_20(m*4.0f);
 
   switch(ix) {
    case 0:
-    return sign*__kernel_sinf(m*M_PI_F);
+    return sign*__kernel_sinf_20(m*M_PI_F);
    case 1:
    case 2:
-    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+    return sign*__kernel_cosf_20((m-0.5f)*M_PI_F, 0.0f);
    case 3:
    case 4:
-    return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
+    return -sign*__kernel_sinf_20((m-1.0f)*M_PI_F);
    case 5:
    case 6:
-    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+    return -sign*__kernel_cosf_20((m-1.5f)*M_PI_F, 0.0f);
    default:
-    return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
+    return -sign*__kernel_sinf_20((2.0f-m)*M_PI_F);
    }
 
 }
 
-OVERLOADABLE float lgamma(float x) {
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-    const float
-        zero=  0.,
-        one =  1.0000000000e+00,
-        pi  =  3.1415927410e+00,
-        a0  =  7.7215664089e-02,
-        a1  =  3.2246702909e-01,
-        a2  =  6.7352302372e-02,
-        a3  =  2.0580807701e-02,
-        a4  =  7.3855509982e-03,
-        a5  =  2.8905137442e-03,
-        a6  =  1.1927076848e-03,
-        a7  =  5.1006977446e-04,
-        a8  =  2.2086278477e-04,
-        a9  =  1.0801156895e-04,
-        a10 =  2.5214456400e-05,
-        a11 =  4.4864096708e-05,
-        tc  =  1.4616321325e+00,
-        tf  = -1.2148628384e-01,
-        tt  =  6.6971006518e-09,
-        t0  =  4.8383611441e-01,
-        t1  = -1.4758771658e-01,
-        t2  =  6.4624942839e-02,
-        t3  = -3.2788541168e-02,
-        t4  =  1.7970675603e-02,
-        t5  = -1.0314224288e-02,
-        t6  =  6.1005386524e-03,
-        t7  = -3.6845202558e-03,
-        t8  =  2.2596477065e-03,
-        t9  = -1.4034647029e-03,
-        t10 =  8.8108185446e-04,
-        t11 = -5.3859531181e-04,
-        t12 =  3.1563205994e-04,
-        t13 = -3.1275415677e-04,
-        t14 =  3.3552918467e-04,
-        u0  = -7.7215664089e-02,
-        u1  =  6.3282704353e-01,
-        u2  =  1.4549225569e+00,
-        u3  =  9.7771751881e-01,
-        u4  =  2.2896373272e-01,
-        u5  =  1.3381091878e-02,
-        v1  =  2.4559779167e+00,
-        v2  =  2.1284897327e+00,
-        v3  =  7.6928514242e-01,
-        v4  =  1.0422264785e-01,
-        v5  =  3.2170924824e-03,
-        s0  = -7.7215664089e-02,
-        s1  =  2.1498242021e-01,
-        s2  =  3.2577878237e-01,
-        s3  =  1.4635047317e-01,
-        s4  =  2.6642270386e-02,
-        s5  =  1.8402845599e-03,
-        s6  =  3.1947532989e-05,
-        r1  =  1.3920053244e+00,
-        r2  =  7.2193557024e-01,
-        r3  =  1.7193385959e-01,
-        r4  =  1.8645919859e-02,
-        r5  =  7.7794247773e-04,
-        r6  =  7.3266842264e-06,
-        w0  =  4.1893854737e-01,
-        w1  =  8.3333335817e-02,
-        w2  = -2.7777778450e-03,
-        w3  =  7.9365057172e-04,
-        w4  = -5.9518753551e-04,
-        w5  =  8.3633989561e-04,
-        w6  = -1.6309292987e-03;
-	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
-	int i, hx, ix;
-	nadj = 0;
-	hx = *(int *)&x;
-	ix = hx & 0x7fffffff;
-	if (ix >= 0x7f800000)
-		return x * x;
-	if (ix == 0)
-		return ((x + one) / zero);
-	if (ix < 0x1c800000) {
-		if (hx < 0) {
-			return -native_log(-x);
-		} else
-			return -native_log(x);
-	}
-	if (hx < 0) {
-		if (ix >= 0x4b000000)
-			return ((-x) / zero);
-		t = __gen_ocl_internal_sinpi(x);
-		if (t == zero)
-			return ((-x) / zero);
-		nadj = native_log(pi / __gen_ocl_fabs(t * x));
-		x = -x;
-	}
-	if (ix == 0x3f800000 || ix == 0x40000000)
-		r = 0;
-	else if (ix < 0x40000000) {
-		if (ix <= 0x3f666666) {
-			r = -native_log(x);
-			if (ix >= 0x3f3b4a20) {
-				y = one - x;
-				i = 0;
-			} else if (ix >= 0x3e6d3308) {
-				y = x - (tc - one);
-				i = 1;
-			} else {
-				y = x;
-				i = 2;
-			}
-		} else {
-			r = zero;
-			if (ix >= 0x3fdda618) {
-				y = (float) 2.0 - x;
-				i = 0;
-			}
-			else if (ix >= 0x3F9da620) {
-				y = x - tc;
-				i = 1;
-			}
-			else {
-				y = x - one;
-				i = 2;
-			}
-		}
-		switch (i) {
-		case 0:
-			z = y * y;
-			p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
-			p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
-			p = mad(y, p1, p2);
-			r += (p - (float) 0.5 * y);
-			break;
-		case 1:
-			z = y * y;
-			w = z * y;
-			p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
-			p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
-			p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
-			p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
-			r += (tf + p);
-			break;
-		case 2:
-			p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
-			p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
-			r += (-(float) 0.5 * y + p1 / p2);
-		}
-	} else if (ix < 0x41000000) {
-		i = (int) x;
-		t = zero;
-		y = x - (float) i;
-
-		p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
-		q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
-		r = .5f * y + p / q;
-		z = one;
-
-		switch (i) {
-		case 7:
-			z *= (y + 6.0f);
-		case 6:
-			z *= (y + 5.0f);
-		case 5:
-			z *= (y + 4.0f);
-		case 4:
-			z *= (y + 3.0f);
-		case 3:
-			z *= (y + 2.0f);
-			r += native_log(z);
-			break;
-		}
-
-	} else if (ix < 0x5c800000) {
-		t = native_log(x);
-		z = one / x;
-		y = z * z;
-		w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
-		r = (x - .5f) * (t - one) + w;
-	} else
-		r = x * (native_log(x) - one);
-	if (hx < 0)
-		r = nadj - r;
-	return r;
-}
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
 #define BODY \
     const float  \
         zero=  0.,  \
@@ -979,7 +389,7 @@ OVERLOADABLE float lgamma(float x) {
 	if (hx < 0) {  \
 		if (ix >= 0x4b000000)  \
 			return ((-x) / zero);  \
-		t = __gen_ocl_internal_sinpi(x);  \
+		t = __gen_ocl_internal_sinpi_20(x);  \
 		if (t == zero)  \
 			return ((-x) / zero);  \
 		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
@@ -1076,2730 +486,6 @@ OVERLOADABLE float lgamma(float x) {
 OVERLOADABLE float lgamma_r(float x, int *signgamp) { BODY; }
 #undef BODY
 
-OVERLOADABLE float log1p(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log1p(x);
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =    3.355443200e+07, /* 0x4c000000 */
-  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lp3 = 2.8571429849e-01, /* 3E924925 */
-  Lp4 = 2.2222198546e-01; /* 3E638E29 */
-  const float zero = 0.0;
-  float hfsq,f,c,s,z,R,u;
-  int k,hx,hu,ax;
-  union {float f; unsigned i;} un;
-  un.f = x;  hx = un.i;
-  ax = hx&0x7fffffff;
-
-  k = 1;
-  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
-      if(ax>=0x3f800000) {    /* x <= -1.0 */
-    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
-    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
-      }
-      if(ax<0x31000000) {     /* |x| < 2**-29 */
-    if(two25+x>zero     /* raise inexact */
-              &&ax<0x24800000)    /* |x| < 2**-54 */
-        return x;
-    else
-        return x - x*x*(float)0.5;
-      }
-      if(hx>0||hx<=((int)0xbe95f61f)) {
-    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
-  }
-  if (hx >= 0x7f800000) return x+x;
-  if(k!=0) {
-      if(hx<0x5a000000) {
-    u  = (float)1.0+x;
-
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    /* correction term */
-          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
-    c /= u;
-      } else {
-    u  = x;
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    c  = 0;
-      }
-      hu &= 0x007fffff;
-      if(hu<0x3504f7) {
-          un.i = hu|0x3f800000; u = un.f;/* normalize u */
-      } else {
-          k += 1;
-          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
-          hu = (0x00800000-hu)>>2;
-      }
-      f = u-(float)1.0;
-  }
-  hfsq=(float)0.5*f*f;
-  if(hu==0)
-  { /* |f| < 2**-20 */
-      if(f==zero)
-      {
-    	  if(k==0) return zero;
-    	  else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
-      }
-      R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
-      if(k==0) return f-R; else
-    	  return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
-  }
-  s = f/((float)2.0+f);
-  z = s*s;
-  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
-  if(k==0)
-	  return f + mad(hfsq + R, s, -hfsq);
-  else
-	  return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
-}
-
-OVERLOADABLE float logb(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_logb(x);
-
-  union {float f; unsigned i;} u;
-  u.f = x;
-  int e =  ((u.i & 0x7f800000) >> 23);
-  float r1 = e-127;
-  float r2 = -INFINITY;
-  float r3 = x*x;
-    /* sub normal or +/-0 */
-  float r = e == 0 ? r2 : r1;
-    /* inf & nan */
-  return e == 0xff ? r3 : r;
-}
-
-OVERLOADABLE int ilogb(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_ilogb(x);
-
-  union { int i; float f; } u;
-  if (isnan(x))
-    return FP_ILOGBNAN;
-  if (isinf(x))
-    return 0x7FFFFFFF;
-  u.f = x;
-  u.i &= 0x7fffffff;
-  if (u.i == 0)
-    return FP_ILOGB0;
-  if (u.i >= 0x800000)
-    return (u.i >> 23) - 127;
-  int r = -126;
-  int a = u.i & 0x7FFFFF;
-  while(a < 0x800000) {
-    a <<= 1;
-    r --;
-  }
-  return r;
-}
-OVERLOADABLE float nan(uint code) {
-  return NAN;
-}
-OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  float sign = 1.0f;
-  int ix;
-  if(isinf(x)) return NAN;
-  if(x < 0.0f) { x = -x; sign = -1.0f; }
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(x> 0x1.0p24) return 0.0f;
-  float m = __gen_ocl_internal_floor(x);
-  ix = (int)m;
-  m = x-m;
-  int n = __gen_ocl_internal_floor(m*4.0f);
-  if(m == 0.5f) {
-    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
-  }
-  if(m == 0.0f) {
-    return (ix&0x1) == 0 ? 0.0f : -0.0f;
-  }
-
-  switch(n) {
-    case 0:
-      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
-    case 1:
-      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
-    case 2:
-      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
-    default:
-      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
-  }
-}
-OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
-  /* copied from fdlibm */
-  const unsigned
-  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
-  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
-
-  const float
-  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
-  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
-  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
-  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
-
-  float r,s,t, w;
-  int hx;
-  uint sign;
-  uint high;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  sign=hx&0x80000000;     /* sign= sign(x) */
-  hx  ^=sign;
-  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
-  if(hx==0)
-      return(x);    /* cbrt(0) is itself */
-
-  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
-    /* rough cbrt to 5 bits */
-  if(hx<0x00800000)     /* subnormal number */
-    {
-    //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
-     //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
-      t = (sign = 0) ? 0.0f : -0.0f;
-      return t;
-    }
-  else
-    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
-
-
-    /* new cbrt to 23 bits */
-  r=t*t/x;
-  s=mad(r, t, C);
-  t*=G+F/(s+E+D/s);
-    /* one step newton iteration to 53 bits with error less than 0.667 ulps */
-  s=t*t;    /* t*t is exact */
-  r=x/s;
-  w=t+t;
-  r=(r-t)/(w+r);  /* r-s is exact */
-  t=mad(t, r, t);
-
-    /* retore the sign bit */
-  GEN_OCL_GET_FLOAT_WORD(high,t);
-  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
-  return(t);
-}
-
-#define BODY \
-  *cosval = cos(x); \
-  return sin(x);
-
-OVERLOADABLE float sincos(float x, float *cosval) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sincos(x, cosval);
-  BODY;
-}
-#undef BODY
-
-INLINE float __gen_ocl_asin_util(float x) {
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  float
-  pS0 =  1.66666666666666657415e-01,
-  pS1 = -3.25565818622400915405e-01,
-  pS2 =  2.01212532134862925881e-01,
-  pS3 = -4.00555345006794114027e-02,
-  pS4 =  7.91534994289814532176e-04,
-  qS1 = -2.40339491173441421878e+00,
-  qS2 =  2.02094576023350569471e+00,
-  qS3 = -6.88283971605453293030e-01,
-  qS4 =  7.70381505559019352791e-02;
-
-  float t = x*x;
-  float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
-  float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
-  float w = p / q;
-  return mad(x, w, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_asin(float x) {
-  uint ix;
-  union { uint i; float f; } u;
-  u.f = x;
-  ix = u.i & 0x7fffffff;
-  if(ix == 0x3f800000) {
-    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
-  }
-  if(ix > 0x3f800000) {            /* |x|>= 1 */
-    return  NAN;          /* asin(|x|>1) is NaN */
-  }
-
-  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
-    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
-  }
-
-  if(x < -0.5) {
-    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
-  } else if(x > 0.5) {
-    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
-  } else {
-    return __gen_ocl_asin_util(x);
-  }
-}
-OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
-  return __gen_ocl_internal_asin(x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_acos(float x) {
-  if(x > 0.5)
-    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
-  else
-    return M_PI_2_F - __gen_ocl_internal_asin(x);
-}
-OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
-  return __gen_ocl_internal_acos(x) / M_PI_F;
-}
-__constant float atanhi[4] = {
-  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
-  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
-  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
-  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
-};
-__constant float atanlo[4] = {
-  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
-  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
-  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
-  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
-};
-
-OVERLOADABLE float __gen_ocl_internal_atan(float x) {
-  /* copied from fdlibm */
-  float aT[11];
-  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
-  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
-  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
-  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
-  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
-  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
-  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
-  const float one = 1.0, huge = 1.0e30;
-
-  float w,s1,s2,z;
-  int ix,hx,id;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
-      if(ix>0x7f800000)
-    return x+x;   /* NaN */
-      if(hx>0) return  atanhi[3]+atanlo[3];
-      else     return -atanhi[3]-atanlo[3];
-  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
-      if (ix < 0x31000000) {  /* |x| < 2^-29 */
-    if(huge+x>one) return x;  /* raise inexact */
-      }
-      id = -1;
-  } else {
-  x = __gen_ocl_fabs(x);
-  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
-      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
-    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
-      } else {      /* 11/16<=|x|< 19/16 */
-    id = 1; x  = (x-one)/(x+one);
-      }
-  } else {
-      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
-    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
-      } else {      /* 2.4375 <= |x| < 2^66 */
-    id = 3; x  = -(float)1.0/x;
-      }
-  }}
-    /* end of argument reduction */
-  z = x*x;
-  w = z*z;
-    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
-  s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
-  s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
-  if (id<0) return x - x*(s1+s2);
-  else {
-      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
-      return (hx<0)? -z:z;
-  }
-
-}
-OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
-  return __gen_ocl_internal_atan(x) / M_PI_F;
-}
-
-// XXX work-around PTX profile
-OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
-OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
-OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
-  /* copied from fdlibm */
-  float z;
-  int k,m,hx,hy,ix,iy;
-  const float
-  tiny  = 1.0e-30,
-  zero  = 0.0,
-  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
-  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
-  pi      = 3.1415927410e+00, /* 0x40490fdb */
-  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  iy = hy&0x7fffffff;
-
-  if((ix>0x7f800000)||
-     (iy>0x7f800000)) /* x or y is NaN */
-     return x+y;
-  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
-  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
-
-    /* when y = 0 */
-  if(iy==0) {
-      switch(m) {
-    case 0:
-    case 1: return y;   /* atan(+-0,+anything)=+-0 */
-    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-      }
-  }
-    /* when x = 0 */
-  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
-
-  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
-  if(ix <= 0x7fffff && iy <= 0x7fffff) {
-    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
-    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
-  }
-
-    /* when x is INF */
-  if(ix==0x7f800000) {
-      if(iy==0x7f800000) {
-    switch(m) {
-        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-    }
-      } else {
-    switch(m) {
-        case 0: return  zero  ; /* atan(+...,+INF) */
-        case 1: return -zero  ; /* atan(-...,+INF) */
-        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
-        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
-    }
-      }
-  }
-    /* when y is INF */
-  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* compute y/x */
-  k = (iy-ix)>>23;
-  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
-  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
-  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
-  switch (m) {
-      case 0: return       z  ; /* atan(+,+) */
-      case 1: {
-              uint zh;
-          GEN_OCL_GET_FLOAT_WORD(zh,z);
-          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
-        }
-        return       z  ; /* atan(-,+) */
-      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-      default: /* case 3 */
-            return  (z-pi_lo)-pi;/* atan(-,-) */
-  }
-}
-
-OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
-  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
-}
-OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
-OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
-OVERLOADABLE float __gen_ocl_internal_round(float x) {
-  float y = __gen_ocl_rndz(x);
-  if (__gen_ocl_fabs(x - y) >= 0.5f)
-    y += __gen_ocl_internal_copysign(1.f, x);
-  return y;
-}
-OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
-OVERLOADABLE float __gen_ocl_internal_rint(float x) {
-  return __gen_ocl_rnde(x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp(float x) {
-  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
-  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
-  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one = 1.0,
-  huge = 1.0e+30,
-  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
-  P2 = -2.7777778450e-03; /* 0xbb360b61 */
-  float y,hi=0.0,lo=0.0,c,t;
-  int k=0,xsb;
-  unsigned hx;
-  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
-  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
-  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
-  float half_0 = 0.5;
-  float half_1 =	-0.5;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = (hx>>31)&1;		/* sign bit of x */
-  hx &= 0x7fffffff;		/* high word of |x| */
-
-  /* filter out non-finite argument */
-  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    if(hx>0x7f800000)
-      return x+x;			/* NaN */
-    if(hx==0x7f800000)
-      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
-    if(x > o_threshold) return huge*huge; /* overflow */
-    if(x < u_threshold) return twom100*twom100; /* underflow */
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
-    } else {
-      float tmp = xsb == 1 ? half_1 : half_0;
-      k  = ivln2*x+tmp;
-      t  = k;
-      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
-      lo = t*ln2LO_0;
-    }
-    x  = hi - lo;
-  }
-  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
-    if(huge+x>one) return one+x;/* trigger inexact */
-  }
-  else k = 0;
-
-  /* x is now in primary range */
-  t  = x*x;
-  c  = x - t*(P1+t*P2);
-  if(k==0)
-    return one-((x*c)/(c-(float)2.0)-x);
-  else
-    y = one-((lo-(x*c)/((float)2.0-c))-hi);
-  if(k >= -125) {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
-    return y;
-  } else {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
-    return y*twom100;
-  }
-}
-
-/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
-/*...*/
-const float
-tiny = 1.0e-30,
-half_val=  5.0000000000e-01, /* 0x3F000000 */
-one =  1.0000000000e+00, /* 0x3F800000 */
-two =  2.0000000000e+00, /* 0x40000000 */
-	/* c = (subfloat)0.84506291151 */
-erx =  8.4506291151e-01, /* 0x3f58560b */
-/*
- * Coefficients for approximation to  erf on [0,0.84375]
- */
-efx =  1.2837916613e-01, /* 0x3e0375d4 */
-efx8=  1.0270333290e+00, /* 0x3f8375d4 */
-pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
-pp1  = -3.2504209876e-01, /* 0xbea66beb */
-pp2  = -2.8481749818e-02, /* 0xbce9528f */
-pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
-pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
-qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
-qq2  =  6.5022252500e-02, /* 0x3d852a63 */
-qq3  =  5.0813062117e-03, /* 0x3ba68116 */
-qq4  =  1.3249473704e-04, /* 0x390aee49 */
-qq5  = -3.9602282413e-06, /* 0xb684e21a */
-/*
- * Coefficients for approximation to  erf  in [0.84375,1.25]
- */
-pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
-pa1  =  4.1485610604e-01, /* 0x3ed46805 */
-pa2  = -3.7220788002e-01, /* 0xbebe9208 */
-pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
-pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
-pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
-pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
-qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
-qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
-qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
-qa4  =  1.2617121637e-01, /* 0x3e013307 */
-qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
-qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
- /*
- * Coefficients for approximation to  erfc in [1.25,1/0.35]
- */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
-ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
-ra2  = -1.0558626175e+01, /* 0xc128f022 */
-ra3  = -6.2375331879e+01, /* 0xc2798057 */
-ra4  = -1.6239666748e+02, /* 0xc322658c */
-ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
-ra6  = -8.1287437439e+01, /* 0xc2a2932b */
-ra7  = -9.8143291473e+00, /* 0xc11d077e */
-sa1  =  1.9651271820e+01, /* 0x419d35ce */
-sa2  =  1.3765776062e+02, /* 0x4309a863 */
-sa3  =  4.3456588745e+02, /* 0x43d9486f */
-sa4  =  6.4538726807e+02, /* 0x442158c9 */
-sa5  =  4.2900814819e+02, /* 0x43d6810b */
-sa6  =  1.0863500214e+02, /* 0x42d9451f */
-sa7  =  6.5702495575e+00, /* 0x40d23f7c */
-sa8  = -6.0424413532e-02, /* 0xbd777f97 */
-/*
- * Coefficients for approximation to  erfc in [1/.35,28]
- */
-rb0  = -9.8649431020e-03, /* 0xbc21a092 */
-rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
-rb2  = -1.7757955551e+01, /* 0xc18e104b */
-rb3  = -1.6063638306e+02, /* 0xc320a2ea */
-rb4  = -6.3756646729e+02, /* 0xc41f6441 */
-rb5  = -1.0250950928e+03, /* 0xc480230b */
-rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
-sb1  =  3.0338060379e+01, /* 0x41f2b459 */
-sb2  =  3.2579251099e+02, /* 0x43a2e571 */
-sb3  =  1.5367296143e+03, /* 0x44c01759 */
-sb4  =  3.1998581543e+03, /* 0x4547fdbb */
-sb5  =  2.5530502930e+03, /* 0x451f90ce */
-sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
-sb7  = -2.2440952301e+01; /* 0xc1b38712 */
-
-	int hx,ix,i;
-	float R,S,P,Q,s,y,z,r;
-	GEN_OCL_GET_FLOAT_WORD(hx,x);
-	ix = hx&0x7fffffff;
-	if(ix>=0x7f800000) {		/* erf(nan)=nan */
-	    i = ((unsigned int)hx>>31)<<1;
-	    return (float)(1-i)+one/x;	/* erf(+-inf)=+-1 */
-	}
-
-	if(ix < 0x3f580000) {		/* |x|<0.84375 */
-	    if(ix < 0x31800000) { 	/* |x|<2**-28 */
-	        if (ix < 0x04000000)
-		    /*avoid underflow */
-		    return (float)0.125*((float)8.0*x+efx8*x);
-		return x + efx*x;
-	    }
-	    z = x*x;
-	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
-	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
-	    y = r / s;
-	    return mad(x, y, x);
-	}
-	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
-	    s = __gen_ocl_internal_fabs(x)-one;
-	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
-	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
-	    if(hx>=0) return erx + P/Q; else return -erx - P/Q;
-	}
-	if (ix >= 0x40c00000) {		/* inf>|x|>=6 */
-	    if(hx>=0) return one-tiny; else return tiny-one;
-	}
-	x = __gen_ocl_internal_fabs(x);
-    s = one/(x*x);
-	if(ix< 0x4036DB6E) {	/* |x| < 1/0.35 */
-	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
-	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
-	} else {	/* |x| >= 1/0.35 */
-	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
-	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-	    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
-	}
-	GEN_OCL_GET_FLOAT_WORD(ix,x);
-	GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
-	r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
-	if(hx>=0) return one-r/x; else return  r/x-one;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
-/*...*/
-const float
-tiny = 1.0e-30,
-half_val=  5.0000000000e-01, /* 0x3F000000 */
-one =  1.0000000000e+00, /* 0x3F800000 */
-two =  2.0000000000e+00, /* 0x40000000 */
-	/* c = (subfloat)0.84506291151 */
-erx =  8.4506291151e-01, /* 0x3f58560b */
-/*
- * Coefficients for approximation to  erf on [0,0.84375]
- */
-efx =  1.2837916613e-01, /* 0x3e0375d4 */
-efx8=  1.0270333290e+00, /* 0x3f8375d4 */
-pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
-pp1  = -3.2504209876e-01, /* 0xbea66beb */
-pp2  = -2.8481749818e-02, /* 0xbce9528f */
-pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
-pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
-qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
-qq2  =  6.5022252500e-02, /* 0x3d852a63 */
-qq3  =  5.0813062117e-03, /* 0x3ba68116 */
-qq4  =  1.3249473704e-04, /* 0x390aee49 */
-qq5  = -3.9602282413e-06, /* 0xb684e21a */
-/*
- * Coefficients for approximation to  erf  in [0.84375,1.25]
- */
-pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
-pa1  =  4.1485610604e-01, /* 0x3ed46805 */
-pa2  = -3.7220788002e-01, /* 0xbebe9208 */
-pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
-pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
-pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
-pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
-qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
-qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
-qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
-qa4  =  1.2617121637e-01, /* 0x3e013307 */
-qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
-qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
- /*
- * Coefficients for approximation to  erfc in [1.25,1/0.35]
- */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
-ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
-ra2  = -1.0558626175e+01, /* 0xc128f022 */
-ra3  = -6.2375331879e+01, /* 0xc2798057 */
-ra4  = -1.6239666748e+02, /* 0xc322658c */
-ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
-ra6  = -8.1287437439e+01, /* 0xc2a2932b */
-ra7  = -9.8143291473e+00, /* 0xc11d077e */
-sa1  =  1.9651271820e+01, /* 0x419d35ce */
-sa2  =  1.3765776062e+02, /* 0x4309a863 */
-sa3  =  4.3456588745e+02, /* 0x43d9486f */
-sa4  =  6.4538726807e+02, /* 0x442158c9 */
-sa5  =  4.2900814819e+02, /* 0x43d6810b */
-sa6  =  1.0863500214e+02, /* 0x42d9451f */
-sa7  =  6.5702495575e+00, /* 0x40d23f7c */
-sa8  = -6.0424413532e-02, /* 0xbd777f97 */
-/*
- * Coefficients for approximation to  erfc in [1/.35,28]
- */
-rb0  = -9.8649431020e-03, /* 0xbc21a092 */
-rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
-rb2  = -1.7757955551e+01, /* 0xc18e104b */
-rb3  = -1.6063638306e+02, /* 0xc320a2ea */
-rb4  = -6.3756646729e+02, /* 0xc41f6441 */
-rb5  = -1.0250950928e+03, /* 0xc480230b */
-rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
-sb1  =  3.0338060379e+01, /* 0x41f2b459 */
-sb2  =  3.2579251099e+02, /* 0x43a2e571 */
-sb3  =  1.5367296143e+03, /* 0x44c01759 */
-sb4  =  3.1998581543e+03, /* 0x4547fdbb */
-sb5  =  2.5530502930e+03, /* 0x451f90ce */
-sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
-sb7  = -2.2440952301e+01; /* 0xc1b38712 */
-	int hx,ix;
-	float R,S,P,Q,s,y,z,r;
-	GEN_OCL_GET_FLOAT_WORD(hx,x);
-	ix = hx&0x7fffffff;
-	if(ix>=0x7f800000) {			/* erfc(nan)=nan */
-						/* erfc(+-inf)=0,2 */
-	    return (float)(((unsigned int)hx>>31)<<1)+one/x;
-	}
-
-	if(ix < 0x3f580000) {		/* |x|<0.84375 */
-	    if(ix < 0x23800000)  	/* |x|<2**-56 */
-		return one-x;
-	    z = x*x;
-	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
-	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
-	    y = r/s;
-	    if(hx < 0x3e800000) {  	/* x<1/4 */
-		return one-(x+x*y);
-	    } else {
-		r = x*y;
-		r += (x-half_val);
-	        return half_val - r ;
-	    }
-	}
-	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
-	    s = __gen_ocl_internal_fabs(x)-one;
-	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
-	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
-	    if(hx>=0) {
-	        z  = one-erx; return z - P/Q;
-	    } else {
-		z = erx+P/Q; return one+z;
-	    }
-	}
-	if (ix < 0x41e00000) {		/* |x|<28 */
-	    x = __gen_ocl_internal_fabs(x);
-        s = one/(x*x);
-	    if(ix< 0x4036DB6D) {	/* |x| < 1/.35 ~ 2.857143*/
-		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
-		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
-	    } else {			/* |x| >= 1/.35 ~ 2.857143 */
-		if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
-		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
-		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
-		    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
-	    }
-	    GEN_OCL_GET_FLOAT_WORD(ix,x);
-	    GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
-	    r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*
-			__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
-	    if(hx>0) {
-		float ret = r/x;
-		return ret;
-	    } else
-		return two-r/x;
-	} else {
-	    if(hx>0) {
-		return tiny*tiny;
-	    } else
-		return two-tiny;
-	}
-}
-
-OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
-  //return x-y*__gen_ocl_rndz(x/y);
-  float one = 1.0;
-  float Zero[2];
-  int n,hx,hy,hz,ix,iy,sx,i;
-  Zero[0] = 0.0;
-  Zero[1] = -0.0;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  sx = hx&0x80000000;		/* sign of x */
-  hx ^=sx;		/* |x| */
-  hy &= 0x7fffffff;	/* |y| */
-  /* purge off exception values */
-  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
-  (hy>0x7f800000))			/* or y is NaN */
-    return (x*y)/(x*y);
-  if(hx<hy) return x;			/* |x|<|y| return x */
-  if(hx==hy)
-    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
-
-  /* determine ix = ilogb(x) */
-  if(hx<0x00800000) {	/* subnormal x */
-    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
-  } else ix = (hx>>23)-127;
-
-  /* determine iy = ilogb(y) */
-  if(hy<0x00800000) {	/* subnormal y */
-    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
-  } else iy = (hy>>23)-127;
-
-  /* set up {hx,lx}, {hy,ly} and align y to x */
-  if(ix >= -126)
-    hx = 0x00800000|(0x007fffff&hx);
-  else {		/* subnormal x, shift x to normal */
-    n = -126-ix;
-    hx = hx<<n;
-  }
-  if(iy >= -126)
-    hy = 0x00800000|(0x007fffff&hy);
-  else {		/* subnormal y, shift y to normal */
-    n = -126-iy;
-    hy = hy<<n;
-  }
-  /* fix point fmod */
-  n = ix - iy;
-  while(n--) {
-    hz=hx-hy;
-    if(hz<0){hx = hx+hx;}
-    else {
-      if(hz==0)		/* return sign(x)*0 */
-        return Zero[(unsigned)sx>>31];
-      hx = hz+hz;
-    }
-  }
-  hz=hx-hy;
-  if(hz>=0) {hx=hz;}
-
-    /* convert back to floating value and restore the sign */
-  if(hx==0)			/* return sign(x)*0 */
-    return Zero[(unsigned)sx>>31];
-  while(hx<0x00800000) {		/* normalize x */
-    hx = hx+hx;
-    iy -= 1;
-  }
-  if(iy>= -126) {		/* normalize output */
-    hx = ((hx-0x00800000)|((iy+127)<<23));
-	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-   } else {		/* subnormal output */
-     n = -126 - iy;
-     hx >>= n;
-     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-     x *= one;		/* create necessary signal */
-  }
-  return x;		/* exact output */
-}
-
-OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
-  //return __gen_ocl_pow(M_E_F, x) - 1;
-  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
-  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
-  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
-  huge = 1.0e30,
-  tiny = 1.0e-30,
-  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one	=  1.0,
-  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
-  float y,hi,lo,c,t,e,hxs,hfx,r1;
-  int k,xsb;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = hx&0x80000000;
-  /* sign bit of x */
-  //if(xsb==0)
-  //y=x;
-  //else
-  //y= -x; /* y = |x| */
-  y = __gen_ocl_internal_fabs(x);
-  hx &= 0x7fffffff;		/* high word of |x| */
-  /* filter out huge and non-finite argument */
-  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
-    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
-      if(hx>0x7f800000)
-        return x+x; 	 /* NaN */
-      if(hx==0x7f800000)
-        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
-      if(x > o_threshold)
-        return huge*huge; /* overflow */
-    }
-    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
-      if(x+tiny<(float)0.0)	/* raise inexact */
-        return tiny-one;	/* return -1 */
-    }
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
-      if(xsb==0){
-        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
-      }	else {
-        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
-      }
-    } else {
-      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
-      t  = k;
-      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
-      lo = t*ln2_lo;
-    }
-    x  = hi - lo;
-    c  = (hi-x)-lo;
-  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
-    //t = huge+x; /* return x with inexact flags when x!=0 */
-    //return x - (t-(huge+x));
-    return x;
-  } else k = 0;
-  /* x is now in primary range */
-  hfx = (float)0.5*x;
-  hxs = x*hfx;
-  r1 = one+hxs*(Q1+hxs*Q2);
-  t = (float)3.0-r1*hfx;
-  e = hxs*((r1-t)/((float)6.0 - x*t));
-  if(k==0)
-    return x - (x*e-hxs);		/* c is 0 */
-  else{
-    e = (x*(e-c)-c);
-    e -= hxs;
-    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
-    if(k==1){
-      if(x < (float)-0.25)
-        return -(float)2.0*(e-(x+(float)0.5));
-      else
-        return  (one+(float)2.0*(x-e));
-    }
-    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
-      int i;
-      y = one-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-      return y-one;
-    }
-    t = one;
-    if(k<23) {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
-      y = t-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    } else {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
-      y = x-(e+t);
-      y += one;
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    }
-  }
-  return y;
-}
-
-OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
-  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-  float one	= 1.0,
-  ln2	= 6.9314718246e-01;/* 0x3f317218 */
-  float t;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if(hx<0x3f800000) {	/* x < 1 */
-    return (x-x)/(x-x);
-  } else if(hx >=0x4d800000) {	/* x > 2**28 */
-    if(hx >=0x7f800000) {/* x is inf of NaN */
-      return x+x;
-    } else
-      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
-  } else if (hx==0x3f800000) {
-    return 0.0;			/* acosh(1) = 0 */
-  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
-    t=x*x;
-    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
-  } else {			/* 1<x<2 */
-    t = x-one;
-    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
-  }
-}
-
-OVERLOADABLE float __gen_ocl_internal_asinh(float x){
-  //return native_log(x + native_sqrt(x * x + 1));
-  float one =  1.0000000000e+00, /* 0x3F800000 */
-  ln2 =  6.9314718246e-01, /* 0x3f317218 */
-  huge=  1.0000000000e+30;
-  float w;
-  int hx,ix;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix< 0x38000000) {	/* |x|<2**-14 */
-    if(huge+x>one) return x;	/* return x inexact except 0 */
-  }
-  if(ix>0x47000000) {/* |x| > 2**14 */
-    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
-    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
-  } else {
-    float xa = __gen_ocl_internal_fabs(x);
-    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
-      w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
-    } else {		/* 2.0 > |x| > 2**-14 */
-      float t = xa*xa;
-      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
-    }
-  }
-  return __gen_ocl_internal_copysign(w, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_sinh(float x){
-  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-  float one = 1.0,
-  shuge = 1.0e37;
-  float t,w,h;
-  int ix,jx;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x+x;
-  h = 0.5;
-  if (jx<0) h = -h;
-  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
-  if (ix < 0x41b00000) {		/* |x|<22 */
-    if (ix<0x31800000)	/* |x|<2**-28 */
-      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
-    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
-    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
-      return h*(t+t/(t+one));
-  }
-  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
-  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
-  /* |x| in [log(maxdouble), overflowthresold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
-    t = h*w;
-    return t*w;
-  }
-  /* |x| > overflowthresold, sinh(x) overflow */
-  return x*shuge;
-}
-
-OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
-  //float y = native_exp(-2 * x);
-  //return (1 - y) / (1 + y);
-  float one=1.0, two=2.0, tiny = 1.0e-30;
-  float t,z;
-  int jx,ix;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) {
-    if (jx>=0)
-      return one/x+one; /* tanh(+-inf)=+-1 */
-    else
-      return one/x-one; /* tanh(NaN) = NaN */
-  }
-
-  if (ix < 0x41b00000) { /* |x|<22 */
-    if (ix == 0)
-      return x;		/* x == +-0 */
-    if (ix<0x24000000) 	/* |x|<2**-55 */
-      return x*(one+x);    	/* tanh(small) = small */
-    if (ix>=0x3f800000) {	/* |x|>=1  */
-      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
-      z = one - two/(t+two);
-    } else {
-      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
-      z= -t/(t+two);
-    }
-  } else { /* |x| > 22, return +-1 */
-    z = one - tiny;		/* raised inexact flag */
-  }
-  return (jx>=0)? z: -z;
-}
-
-OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
-  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-  float halF = 0.5,
-  huge = 1.0e+30,
-  tiny = 1.0e-30,
-  one = 1.0;
-  float t,w;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;
-  /* |x| in [0,22] */
-  if (ix < 0x41b00000) {
-    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
-    if(ix<0x3eb17218) {
-      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
-      w = one+t;
-      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
-      return one+(t*t)/(w+w);
-    }
-    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
-    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
-    return halF*t+halF/t;
-  }
-  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
-  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
-  /* |x| in [log(maxdouble), overflowthresold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
-    t = halF*w;
-    return t*w;
-  }
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x*x;
-  /* |x| > overflowthresold, cosh(x) overflow */
-  return huge*huge;
-}
-
-OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
-  //return x-y*__gen_ocl_rnde(x/y);
-  float zero = 0.0;
-  int hx,hp;
-  unsigned sx;
-  float p_half;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hp,p);
-  sx = hx&0x80000000;
-  hp &= 0x7fffffff;
-  hx &= 0x7fffffff;
-  /* purge off exception values */
-  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
-  if((hx>=0x7f800000)||               /* x not finite */
-    ((hp>0x7f800000)))	               /* p is NaN */
-    return (x*p)/(x*p);
-  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
-  if ((hx-hp)==0) return zero*x;
-  x = __gen_ocl_fabs(x);
-  p = __gen_ocl_fabs(p);
-  if (hp<0x01000000) {
-    if(x+x>p) {
-      x-=p;
-      if(x+x>=p) x -= p;
-    }
-  } else {
-    p_half = (float)0.5*p;
-    if(x>p_half) {
-      x-=p;
-      if(x>=p_half) x -= p;
-    }
-  }
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
-  return x;
-}
-
-OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  x = __gen_ocl_scalbnf(x,n);
-  return x;
-}
-
-OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
-  //return 0.5f * native_sqrt((1 + x) / (1 - x));
-  float xa = __gen_ocl_fabs (x);
-  float t;
-  if (isless (xa, 0.5f)){
-    if (xa < 0x1.0p-28f) return x;
-    t = xa + xa;
-    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
-  } else if (isless (xa, 1.0f)){
-    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
-  } else{
-    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
-    return x / 0.0f;
-  }
-  return __gen_ocl_internal_copysign(t, x);
-}
-
-OVERLOADABLE float __gen_ocl_internal_exp10(float x){
-  float px, qx,ans;
-  short n;
-  int i;
-  float*p;
-  float MAXL10 = 38.230809449325611792;
-  float LOG210 = 3.32192809488736234787e0;
-  float LG102A = 3.00781250000000000000E-1;
-  float LG102B = 2.48745663981195213739E-4;
-  float P[6];
-  P[0] = 2.063216740311022E-001;
-  P[1] = 5.420251702225484E-001;
-  P[2] = 1.171292686296281E+000;
-  P[3] = 2.034649854009453E+000;
-  P[4] = 2.650948748208892E+000;
-  P[5] = 2.302585167056758E+000;
-
-  if( x < -MAXL10 ) return 0.0;
-
-  if( isinf(x))  return INFINITY;
-  /* The following is necessary because range reduction blows up: */
-  if( x == 0 )return 1.0;
-
-  /* Express 10**x = 10**g 2**n
-    *	 = 10**g 10**( n log10(2) )
-    *	 = 10**( g + n log10(2) )
-    */
-  px = x * LOG210;
-  qx = __gen_ocl_internal_floor( px + 0.5 );
-  n = qx;
-  x -= qx * LG102A;
-  x -= qx * LG102B;
-
-  /* rational approximation for exponential
-    * of the fractional part:
-    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
-    */
-  p = P;
-  ans = *p++;
-  i = 5;
-  do{
-    ans = ans * x  +  *p++;
-  }
-  while( --i );
-  px = 1.0 + x * ans;
-
-  /* multiply by power of 2 */
-  x = __gen_ocl_internal_ldexp( px, n );
-  return x;
-}
-
-OVERLOADABLE float cospi(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cospi(x);
-
-  return __gen_ocl_internal_cospi(x);
-}
-
-OVERLOADABLE float cosh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cosh(x);
-
-  return  __gen_ocl_internal_cosh(x);
-}
-
-OVERLOADABLE float acos(float x) {
-  return __gen_ocl_internal_acos(x);
-}
-
-OVERLOADABLE float acospi(float x) {
-  return __gen_ocl_internal_acospi(x);
-}
-
-OVERLOADABLE float acosh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_acosh(x);
-
-  return __gen_ocl_internal_acosh(x);
-}
-
-OVERLOADABLE float sinpi(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sinpi(x);
-
-  return __gen_ocl_internal_sinpi(x);
-}
-
-OVERLOADABLE float sinh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_sinh(x);
-
-  return __gen_ocl_internal_sinh(x);
-}
-
-OVERLOADABLE float asin(float x) {
-  return __gen_ocl_internal_asin(x);
-}
-
-OVERLOADABLE float asinpi(float x) {
-  return __gen_ocl_internal_asinpi(x);
-}
-
-OVERLOADABLE float asinh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_asinh(x);
-
-  return __gen_ocl_internal_asinh(x);
-}
-
-OVERLOADABLE float tanpi(float x) {
-  return __gen_ocl_internal_tanpi(x);
-}
-
-OVERLOADABLE float tanh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_tanh(x);
-
-  return __gen_ocl_internal_tanh(x);
-}
-
-OVERLOADABLE float atan(float x) {
-  return __gen_ocl_internal_atan(x);
-}
-
-OVERLOADABLE float atan2(float y, float x) {
-  return __gen_ocl_internal_atan2(y, x);
-}
-
-OVERLOADABLE float atan2pi(float y, float x) {
-  return __gen_ocl_internal_atan2pi(y, x);
-}
-
-OVERLOADABLE float atanpi(float x) {
-  return __gen_ocl_internal_atanpi(x);
-}
-
-OVERLOADABLE float atanh(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_atanh(x);
-
-  return __gen_ocl_internal_atanh(x);
-}
-
-OVERLOADABLE float cbrt(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_cbrt(x);
-
-  return __gen_ocl_internal_cbrt(x);
-}
-
-OVERLOADABLE float rint(float x) {
-  return __gen_ocl_internal_rint(x);
-}
-
-OVERLOADABLE float copysign(float x, float y) {
-  return __gen_ocl_internal_copysign(x, y);
-}
-
-OVERLOADABLE float erf(float x) {
-  return __gen_ocl_internal_erf(x);
-}
-
-OVERLOADABLE float erfc(float x) {
-  return __gen_ocl_internal_erfc(x);
-}
-
-OVERLOADABLE float fmod (float x, float y) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_fmod(x, y);
-
-  return __gen_ocl_internal_fmod(x, y);
-}
-
-OVERLOADABLE float remainder(float x, float p) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_remainder(x, p);
-
-  return __gen_ocl_internal_remainder(x, p);
-}
-
-OVERLOADABLE float ldexp(float x, int n) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_ldexp(x, n);
-
-  if (x == (float)0.0f) x = 0.0f;
-  return __gen_ocl_internal_ldexp(x, n);
-}
-
-CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
-CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
-PURE CONST float __gen_ocl_fmax(float a, float b);
-PURE CONST float __gen_ocl_fmin(float a, float b);
-
-OVERLOADABLE float mad(float a, float b, float c) {
-  return __gen_ocl_mad(a, b, c);
-}
-
-
-#define BODY \
-  if (isnan(x) || isinf(x)) { \
-    *exp = 0; \
-    return x; \
-  } \
-  uint u = as_uint(x); \
-  uint a = u & 0x7FFFFFFFu; \
-  if (a == 0) { \
-    *exp = 0; \
-    return x; \
-  } \
-  if (a >= 0x800000) { \
-    *exp = (a >> 23) - 126; \
-    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
-  } \
-  int e = -126; \
-  while (a < 0x400000) { \
-    e --; \
-    a <<= 1; \
-  } \
-  a <<= 1; \
-  *exp = e; \
-  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
-OVERLOADABLE float frexp(float x, int *exp) { BODY; }
-#undef BODY
-
-OVERLOADABLE float nextafter(float x, float y) {
-  int hx, hy, ix, iy;
-  hx = as_int(x);
-  hy = as_int(y);
-  ix = hx & 0x7fffffff;
-  iy = hy & 0x7fffffff;
-  if(ix == 0)
-    ix = hx & 0x7fffff;
-  if(iy == 0)
-    iy = hy & 0x7fffff;
-  if(ix>0x7f800000 || iy>0x7f800000)
-    return x+y;
-  if(hx == hy)
-    return y;
-  if(ix == 0) {
-    if(iy == 0)
-      return y;
-    else
-      return as_float((hy&0x80000000) | 1);
-  }
-  if(hx >= 0) {
-    if(hx > hy) {
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  } else {
-    if(hy >= 0 || hx > hy){
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  }
-  return as_float(hx);
-}
-
-#define BODY \
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
-  if (ix > 0x7F800000) { \
-    *i = nan(0u); \
-    return nan(0u); \
-  } \
-  if (ix == 0x7F800000) { \
-    *i = x; \
-    return as_float(hx & 0x80000000u); \
-  } \
-  *i = __gen_ocl_rndz(x); \
-  return x - *i;
-OVERLOADABLE float modf(float x, float *i) { BODY; }
-#undef BODY
-
-OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
-OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
-OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a > b ? x : b > a ? y : max(x, y);
-}
-OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
-  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
-  return a < b ? x : b < a ? y : min(x, y);
-}
-OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
-  if(isnan(x))
-    return x;
-  if(isnan(y))
-    return y;
-  return x > y ? (x - y) : +0.f;
-}
-/*
- * the pow/pown high precision implementation are copied from msun library.
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- */
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
-  float z,ax,z_h,z_l,p_h,p_l;
-  float y1,t1,t2,r,s,sn,t,u,v,w;
-  int i,j,k,yisint,n;
-  int hx,hy,ix,iy,is;
-  float bp[2],dp_h[2],dp_l[2],
-  zero    =  0.0,
-  one	=  1.0,
-  two	=  2.0,
-  two24	=  16777216.0,	/* 0x4b800000 */
-  huge	=  1.0e30,
-  tiny    =  1.0e-30,
-  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
-  L1  =  6.0000002384e-01, /* 0x3f19999a */
-  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
-  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
-  P2   = -2.7777778450e-03, /* 0xbb360b61 */
-  lg2  =  6.9314718246e-01, /* 0x3f317218 */
-  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
-  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
-  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
-  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
-  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
-  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
-  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
-  bp[0] = 1.0,bp[1] = 1.5,
-  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
-  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
-  if (ix < 0x00800000) {	   /* x < 2**-126  */
-    ix = 0;/* Gen does not support subnormal number now */
-  }
-  if (iy < 0x00800000) {	  /* y < 2**-126  */
-    iy = 0;/* Gen does not support subnormal number now */
-  }
-   /* y==zero: x**0 = 1 */
-  if(iy==0) return one;
-  /* pow(+1, y) returns 1 for any y, even a NAN */
-  if(hx==0x3f800000) return one;
-  /* +-NaN return x+y */
-  if(ix > 0x7f800000 || iy > 0x7f800000)
-    return (x+0.0f)+y+(0.0f);
-  /* determine if y is an odd int when x < 0
-     * yisint = 0	... y is not an integer
-     * yisint = 1	... y is an odd int
-     * yisint = 2	... y is an even int
-     */
-  yisint  = 0;
-  if(hx<0) {
-    if(iy>=0x4b800000) yisint = 2; /* even integer y */
-    else if(iy>=0x3f800000) {
-      k = (iy>>23)-0x7f;	   /* exponent */
-      j = iy>>(23-k);
-      if((j<<(23-k))==iy) yisint = 2-(j&1);
-    }
-  }
-  /* special value of y */
-  if (iy==0x7f800000) {	/* y is +-inf */
-    if (ix==0x3f800000)
-      //return  y - y;	/* inf**+-1 is NaN */
-      return one;
-    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
-      return (hy>=0)? y: zero;
-    else			/* (|x|<1)**-,+inf = inf,0 */
-      return (hy<0)?-y: zero;
-  }
-  if(iy==0x3f800000) {	/* y is  +-1 */
-    if(hy<0) return one/x; else return x;
-  }
-  if(hy==0x40000000) return x*x; /* y is  2 */
-  if(hy==0x3f000000) {	/* y is  0.5 */
-    if(hx>=0)return __gen_ocl_sqrt(x);
-  }
-
-  ax   = __gen_ocl_fabs(x);
-    /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-    z = ax;			/*x is +-0,+-inf,+-1*/
-    if(hy<0) z = one/z;	/* z = (1/|x|) */
-    if(hx<0) {
-      if(((ix-0x3f800000)|yisint)==0) {
-        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
-      } else if(yisint==1)
-        z = -z;		/* (x<0)**odd = -(|x|**odd) */
-    }
-    return z;
-  }
-  n = ((uint)hx>>31)-1;
-
-  /* (x<0)**(non-int) is NaN */
-  if((n|yisint)==0) return (x-x)/(x-x);
-
-  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
-
-  /* |y| is huge */
-  if(iy>0x4d000000) { /* if |y| > 2**27 */
-    /* over/underflow if x is not close to one */
-    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
-    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
-    /* now |1-x| is tiny <= 2**-20, suffice to compute
-          log(x) by x-x^2/2+x^3/3-x^4/4 */
-    t = ax-1;		/* t has 20 trailing zeros */
-    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
-    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-    v = t*ivln2_l-w*ivln2;
-    t1 = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-    t2 = v-(t1-u);
-  } else {
-    float s2,s_h,s_l,t_h,t_l;
-    n = 0;
-	/* take care subnormal number */
-    //if(ix<0x00800000)
-      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
-    n  += ((ix)>>23)-0x7f;
-    j  = ix&0x007fffff;
-	/* determine interval */
-    ix = j|0x3f800000;		/* normalize ix */
-    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
-    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
-    else {k=0;n+=1;ix -= 0x00800000;}
-    GEN_OCL_SET_FLOAT_WORD(ax,ix);
-
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
-    v = one/(ax+bp[k]);
-    s = u*v;
-    s_h = s;
-    GEN_OCL_GET_FLOAT_WORD(is,s_h);
-    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-    /* t_h=ax+bp[k] High */
-    is = ((ix>>1)&0xfffff000)|0x20000000;
-    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
-    t_l = ax - (t_h-bp[k]);
-    s_l = v*((u-s_h*t_h)-s_h*t_l);
-
-    /* compute log(ax) */
-    s2 = s*s;
-    r = s2*s2*(L1+s2*L2);
-    r += s_l*(s_h+s);
-    s2  = s_h*s_h;
-    t_h = 3.0f+s2+r;
-    GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-    t_l = r-((t_h-3.0f)-s2);
-    /* u+v = s*(1+...) */
-    u = s_h*t_h;
-    v = s_l*t_h+t_l*s;
-    /* 2/(3log2)*(s+...) */
-    p_h = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-    p_l = v-(p_h-u);
-    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
-    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-    t = (float)n;
-    t1 = (((z_h+z_l)+dp_h[k])+t);
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
-  }
-
-  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-  GEN_OCL_GET_FLOAT_WORD(is,y);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
-  p_l = (y-y1)*t1+y*t2;
-  p_h = y1*t1;
-  z = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if (j>0x43000000)				/* if z > 128 */
-    return sn*huge*huge;			/* overflow */
-  else if (j==0x43000000) {			/* if z == 128 */
-    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
-  }
-  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
-    return sn*tiny*tiny;			/* underflow */
-  else if (j==0xc3160000){			/* z == -150 */
-    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
-  }
-
-  /*
-    * compute 2**(p_h+p_l)
-    */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
-    n = j+(0x00800000>>(k+1));
-    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-    n = ((n&0x007fffff)|0x00800000)>>(23-k);
-    if(j<0) n = -n;
-    p_h -= t;
-  }
-  t = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
-  u = t*lg2_h;
-  v = (p_l-(t-p_h))*lg2+t*lg2_l;
-  z = u+v;
-  w = v-(z-u);
-  t  = z*z;
-  t1  = z - t*(P1+t*P2);
-  r  = (z*t1)/(t1-two)-(w+z*w);
-  z  = one-(r-z);
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
-  else GEN_OCL_SET_FLOAT_WORD(z,j);
-  return sn*z;
-}
-
-OVERLOADABLE float tgamma (float x)
-{
-  /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper@cygnus.com> */
-
-  unsigned int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if (hx == 0xff800000)
-    {
-      /* x == -Inf.  According to ISO this is NaN.  */
-      return NAN;
-    }
-  if ((hx & 0x7f800000) == 0x7f800000)
-    {
-      /* Positive infinity (return positive infinity) or NaN (return
-	 NaN).  */
-      return x;
-    }
-  if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
-    {
-      /* integer x < 0 */
-      return NAN;
-    }
-
-  if (x >= 36.0f)
-    {
-      /* Overflow.  */
-      return INFINITY;
-    }
-  else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
-    {
-      return 1.0f / x;
-    }
-  else
-    {
-      float sinpix = __gen_ocl_internal_sinpi(x);
-      if (x <= -42.0f)
-	/* Underflow.  */
-	{return 0.0f * sinpix /*for sign*/;}
-      int exp2_adj = 0;
-      float x_abs = __gen_ocl_fabs(x);
-      float gam0;
-
-      if (x_abs < 4.0f) {
-        /* gamma = exp(lgamma) is only accurate for small lgamma */
-        float prod,x_adj;
-        if (x_abs < 0.5f) {
-          prod = 1.0f / x_abs;
-          x_adj = x_abs + 1.0f;
-        } else if (x_abs <= 1.5f) {
-          prod = 1.0f;
-          x_adj = x_abs;
-        } else if (x_abs < 2.5f) {
-          x_adj = x_abs - 1.0f;
-          prod = x_adj;
-        } else {
-          x_adj = x_abs - 2.0f;
-          prod = x_adj * (x_abs - 1.0f);
-        }
-        gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
-      }
-      else {
-        /* Compute gamma (X) using Stirling's approximation,
-  	 starting by computing pow (X, X) with a power of 2
-  	 factored out to avoid intermediate overflow.  */
-        float x_int = __gen_ocl_internal_round (x_abs);
-        float x_frac = x_abs - x_int;
-        int x_log2;
-        float x_mant = frexp (x_abs, &x_log2);
-        if (x_mant < M_SQRT1_2_F)
-          {
-          x_log2--;
-          x_mant *= 2.0f;
-          }
-        exp2_adj = x_log2 * (int) x_int;
-        float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
-  		   * exp2 (x_log2 * x_frac)
-  		   * __gen_ocl_internal_exp (-x_abs)
-  		   * sqrt (2.0f * M_PI_F / x_abs) );
-
-        float x2 = x_abs * x_abs;
-        float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
-        gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
-      }
-      if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
-      float gam1 = M_PI_F / (-x * sinpix * gam0);
-      return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
-    }
-}
-
-float __gen_ocl_internal_pown(float x, int y) {
-  const float
-  bp[] = {1.0, 1.5,},
-  dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
-  dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
-  zero    =  0.0,
-  one =  1.0,
-  two =  2.0,
-  two24 =  16777216.0,  /* 0x4b800000 */
-  huge  =  1.0e30,
-  tiny    =  1.0e-30,
-    /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
-  L1  =  6.0000002384e-01, /* 0x3f19999a */
-  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
-  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
-  P2   = -2.7777778450e-03, /* 0xbb360b61 */
-  lg2  =  6.9314718246e-01, /* 0x3f317218 */
-  lg2_h  =  0x1.62ep-1,
-  lg2_l  =  0x1.0bfbe8p-15,
-  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
-  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
-  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
-  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
-  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
-
-  float z,ax,z_h,z_l,p_h,p_l;
-  float y1,t1,t2,r,s,t,u,v,w;
-  int i,j,k,yisint,n;
-  int hx,ix,iy,is;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
-    /* y==zero: x**0 = 1 */
-  if(y==0) return one;
-
-    /* +-NaN return NAN */
-  if(ix > 0x7f800000)
-    return NAN;
-
-    /* determine if y is an odd int
-     * yisint = 1 ... y is an odd int
-     * yisint = 2 ... y is an even int
-     */
-    yisint = y&1 ? 1 : 2;
-
-  if (y == 1) return x;
-  if (y == -1) return one/x;
-  if (y == 2) return x*x;
-
-  ax   = __gen_ocl_fabs(x);
-
-   /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-      z = ax;     /*x is +-0,+-inf,+-1*/
-      if(y<0) z = one/z; /* z = (1/|x|) */
-      if(hx<0) {
-      if(yisint==1)
-        z = -z;   /* (x<0)**odd = -(|x|**odd) */
-      }
-      return z;
-  }
-
-  float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
-      sn = -one; /* (-ve)**(odd int) */
-
-    /* |y| is huge */
-  if(iy>0x08000000) { /* if |y| > 2**27 */
-    /* over/underflow if x is not close to one */
-      if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
-      if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
-    /* now |1-x| is tiny <= 2**-20, suffice to compute
-     log(x) by x-x^2/2+x^3/3-x^4/4 */
-      t = ax-1;   /* t has 20 trailing zeros */
-      w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
-      u = ivln2_h*t;  /* ivln2_h has 16 sig. bits */
-      v = t*ivln2_l-w*ivln2;
-      t1 = u+v;
-      GEN_OCL_GET_FLOAT_WORD(is,t1);
-      GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-      t2 = v-(t1-u);
-  } else {
-    float s2,s_h,s_l,t_h,t_l;
-    n = 0;
-    /* take care subnormal number */
-//      if(ix<0x00800000)
-//    {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
-    n  += ((ix)>>23)-0x7f;
-    j  = ix&0x007fffff;
-    /* determine interval */
-    ix = j|0x3f800000;    /* normalize ix */
-    if(j<=0x1cc471) k=0;  /* |x|<sqrt(3/2) */
-    else if(j<0x5db3d7) k=1;  /* |x|<sqrt(3)   */
-    else {k=0;n+=1;ix -= 0x00800000;}
-    GEN_OCL_SET_FLOAT_WORD(ax,ix);
-
-    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-    u = ax-bp[k];   /* bp[0]=1.0, bp[1]=1.5 */
-    v = one/(ax+bp[k]);
-    s = u*v;
-    s_h = s;
-    GEN_OCL_GET_FLOAT_WORD(is,s_h);
-    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-
-    /* t_h=ax+bp[k] High */
-    GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
-    t_l = ax - (t_h-bp[k]);
-    s_l = v*((u-s_h*t_h)-s_h*t_l);
-
-
-    /* compute log(ax) */
-    s2 = s*s;
-    r = s2*s2*(L1+s2*L2);
-    r += s_l*(s_h+s);
-    s2  = s_h*s_h;
-    t_h = (float)3.0+s2+r;
-    GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
-    t_l = r-((t_h-(float)3.0)-s2);
-    /* u+v = s*(1+...) */
-    u = s_h*t_h;
-    v = s_l*t_h+t_l*s;
-    /* 2/(3log2)*(s+...) */
-    p_h = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
-    p_l = v-(p_h-u);
-    z_h = cp_h*p_h;   /* cp_h+cp_l = 2/(3*log2) */
-    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-    t = (float)n;
-    t1 = (((z_h+z_l)+dp_h[k])+t);
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
-    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
-  }
-
-  /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
-
-  float fy = (float)y;
-  float y3 = (float)(y-(int)fy);
-  GEN_OCL_GET_FLOAT_WORD(is,fy);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
-
-  p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
-  p_h = y1*t1;
-  z = p_l+p_h;
-
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if (j>0x43000000)       /* if z > 128 */
-      return sn*huge*huge;       /* overflow */
-  else if (j==0x43000000) {     /* if z == 128 */
-      if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
-  }
-  else if ((j&0x7fffffff)>0x43160000)   /* z <= -150 */
-      return sn*tiny*tiny;       /* underflow */
-  else if (j==0xc3160000){      /* z == -150 */
-      if(p_l<=z-p_h) return sn*tiny*tiny;    /* underflow */
-  }
-    /*
-     * compute 2**(p_h+p_l)
-     */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {    /* if |z| > 0.5, set n = [z+0.5] */
-      n = j+(0x00800000>>(k+1));
-      k = ((n&0x7fffffff)>>23)-0x7f;  /* new k for n */
-      GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-      n = ((n&0x007fffff)|0x00800000)>>(23-k);
-      if(j<0) n = -n;
-      p_h -= t;
-
-      z -= n;
-  }
-
-  t = z;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
-  u = t*lg2_h;
-  v = (p_l-(t-p_h))*lg2+t*lg2_l;
-  z = u+v;
-  w = v-(z-u);
-  t  = z*z;
-  t1  = z - t*(P1+t*P2);
-  r  = (z*t1)/(t1-two)-(w+z*w);
-  z  = one-(r-z);
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  j += (n<<23);
-  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);  /* subnormal output */
-  else GEN_OCL_SET_FLOAT_WORD(z,j);
-  return sn*z;
-}
-
-OVERLOADABLE float hypot(float x, float y) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_hypot(x, y);
-
-  //return __gen_ocl_sqrt(x*x + y*y);
-  float a,b,an,bn,cn;
-  int e;
-  if (isfinite (x) && isfinite (y)){      /* Determine absolute values.  */
-  x = __gen_ocl_fabs (x);
-  y = __gen_ocl_fabs (y);
-  /* Find the bigger and the smaller one.  */
-  a = max(x,y);
-  b = min(x,y);
-  /* Now 0 <= b <= a.  */
-  /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
-  an = frexp (a, &e);
-  bn = ldexp (b, - e);
-  /* Through the normalization, no unneeded overflow or underflow will occur here.  */
-  cn = __gen_ocl_sqrt (an * an + bn * bn);
-  return ldexp (cn, e);
-  }else{
-    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */
-      return INFINITY;
-    else        /* x or y is NaN.  Return NaN.  */
-      return x + y;
-  }
-}
-
-#define BODY \
-  if (isnan(x)) { \
-    *p = x; \
-    return x; \
-  } \
-  *p = __gen_ocl_internal_floor(x); \
-  if (isinf(x)) { \
-    return x > 0 ? +0. : -0.; \
-  } \
-  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-OVERLOADABLE float fract(float x, float *p) { BODY; }
-#undef BODY
-
-#define BODY \
-  float Zero[2]; \
-  int n,hx,hy,hz,ix,iy,sx,i,sy; \
-  uint q,sxy; \
-  Zero[0] = 0.0;Zero[1] = -0.0; \
-  if (x == 0.0f) { x = 0.0f; }; \
-  if (y == 0.0f) { y = 0.0f; }\
-  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
-  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
-  hx ^=sx; hy &= 0x7fffffff; \
-  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
-  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
-    *quo = 0;return NAN; \
-  } \
-  if( hy == 0x7F800000 || hx == 0 ) { \
-    *quo = 0;return x; \
-  } \
-  if( hx == hy ) { \
-    *quo = (x == y) ? 1 : -1; \
-    return sx ? -0.0 : 0.0; \
-  } \
-  if(hx<hy) { \
-    q = 0; \
-    goto fixup; \
-  } else if(hx==hy) { \
-    *quo = (sxy ? -1 : 1); \
-    return Zero[(uint)sx>>31]; \
-  } \
-  ix = (hx>>23)-127; \
-  iy = (hy>>23)-127; \
-  hx = 0x00800000|(0x007fffff&hx); \
-  hy = 0x00800000|(0x007fffff&hy); \
-  n = ix - iy; \
-  q = 0; \
-  while(n--) { \
-    hz=hx-hy; \
-    if(hz<0) hx = hx << 1; \
-    else {hx = hz << 1; q++;} \
-    q <<= 1; \
-  } \
-  hz=hx-hy; \
-  if(hz>=0) {hx=hz;q++;} \
-  if(hx==0) { \
-    q &= 0x0000007f; \
-    *quo = (sxy ? -q : q); \
-    return Zero[(uint)sx>>31]; \
-  } \
-  while(hx<0x00800000) { \
-    hx <<= 1;iy -= 1; \
-  } \
-  if(iy>= -126) { \
-    hx = ((hx-0x00800000)|((iy+127)<<23)); \
-  } else {\
-    n = -126 - iy; \
-    hx >>= n; \
-  } \
-fixup: \
-  GEN_OCL_SET_FLOAT_WORD(x,hx); \
-  if(hx<0x00800000){ \
-    GEN_OCL_GET_FLOAT_WORD(hy,y); \
-    hy &= 0x7fffffff; \
-    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
-    x = 0; \
-  }else{ \
-    y = __gen_ocl_fabs(y); \
-    if (y < 0x1p-125f) { \
-      if (x+x>y || (x+x==y && (q & 1))) { \
-        q++;x-=y; \
-      } \
-    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
-      q++;x-=y; \
-    } \
-    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
-  } \
-  int sign = sx==sy?0:1; \
-  q &= 0x0000007f; \
-  *quo = (sign ? -q : q); \
-  return x;
-
-OVERLOADABLE float remquo(float x, float y, int *quo) {
-	BODY;
-}
-#undef BODY
-
-OVERLOADABLE float powr(float x, float y) {
-  unsigned int hx, sx, hy, sy;
-
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_pow(x,y);
-  else {
-    if (isnan(x) || isnan(y)) return NAN;
-    GEN_OCL_GET_FLOAT_WORD(hx,x);
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    sx = (hx & 0x80000000) >> 31;
-    sy = (hy & 0x80000000) >> 31;
-
-    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
-      x = 0.0f;/* Gen does not support subnormal number now */
-      hx = hx &0x80000000;
-    }
-    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
-      y = 0.0;/* Gen does not support subnormal number now */
-      hy = hy &0x80000000;
-    }
-
-    // (x < 0) ** y = NAN (y!=0)
-    if ((sx && (hx & 0x7fffffff))) return NAN;
-
-    // +/-0 ** +/-0 = NAN
-    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
-
-    // +inf ** +/-0 = NAN
-    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
-
-    // others except nan/inf/0 ** 0 = 1.0
-    if (!(hy&0x7fffffff)) return 1.0f;
-
-    // +1 ** inf = NAN; +1 ** finite = 1;
-    if (hx == 0x3f800000) {
-      return isinf(y) ? NAN : 1.0f;
-    }
-
-    if ( !(hx & 0x7fffffff)) {
-        // +/-0 ** y<0 = +inf
-        // +/-0 ** y>0 = +0
-      return sy ? INFINITY : 0.0f;
-    }
-
-    return __gen_ocl_internal_pow(x,y);
-  }
-}
-
-OVERLOADABLE float pown(float x, int n) {
-  if (__ocl_math_fastpath_flag) {
-    if (x == 0.f && n == 0)
-      return 1.f;
-    if (x < 0.f && (n&1) )
-      return -powr(-x, n);
-    return powr(x, n);
-  } else {
-    int ix;
-    GEN_OCL_GET_FLOAT_WORD(ix, x);
-    float sign = ix < 0 ? -1.0f : 1.0f;
-    if (x == 0.0f) x = sign * 0.0f;
-
-    return __gen_ocl_internal_pown(x, n);
-  }
-}
-
-OVERLOADABLE float pow(float x, float y) {
-  if (!__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_pow(x,y);
-  else {
-    int n;
-    if (x == 0.f && y == 0.f)
-      return 1.f;
-    if (x >= 0.f)
-      return powr(x, y);
-    n = y;
-    if ((float)n == y)//is exact integer
-      return pown(x, n);
-    return NAN;
-  }
-}
-
-OVERLOADABLE float rootn(float x, int n) {
-  float ax,re;
-  int sign = 0;
-  int hx;
-  if( n == 0 )return NAN;
-
-  GEN_OCL_GET_FLOAT_WORD(hx, x);
-  // Gen does not support denorm, flush to zero
-  if ((hx & 0x7fffffff) < 0x00800000) {
-    x = hx < 0 ? -0.0f : 0.0f;
-  }
-
-  //rootn ( x, n )  returns a NaN for x < 0 and n is even.
-  if( x < 0 && 0 == (n&1) )
-    return NAN;
-  if( x == 0.0 ){
-    switch( n & 0x80000001 ){
-      //rootn ( +-0,  n ) is +0 for even n > 0.
-      case 0:
-        return 0.0f;
-      //rootn ( +-0,  n ) is +-0 for odd n > 0.
-      case 1:
-        return x;
-      //rootn ( +-0,  n ) is +inf for even n < 0.
-      case 0x80000000:
-        return INFINITY;
-
-      //rootn ( +-0,  n ) is +-inf for odd n < 0.
-      case 0x80000001:
-        return __gen_ocl_internal_copysign(INFINITY, x);
-    }
-  }
-  ax = __gen_ocl_fabs(x);
-  if(x <0.0f && (n&1))
-    sign = 1;
-  if (__ocl_math_fastpath_flag)
-    re = __gen_ocl_pow(ax, 1.f/n);
-  else
-    re = __gen_ocl_internal_pow(ax,1.f/n);
-  if(sign)
-    re = -re;
-  return re;
-}
-
-OVERLOADABLE float fabs(float x) {
-  return __gen_ocl_internal_fabs(x);
-}
-
-OVERLOADABLE float trunc(float x) {
-  return  __gen_ocl_internal_trunc(x);
-}
-
-OVERLOADABLE float round(float x) {
-  return __gen_ocl_internal_round(x);
-}
-
-OVERLOADABLE float floor(float x) {
-  return __gen_ocl_internal_floor(x);
-}
-
-OVERLOADABLE float ceil(float x) {
-  return __gen_ocl_internal_ceil(x);
-}
-
-OVERLOADABLE float log(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log(x);
-
-  return  __gen_ocl_internal_log(x);
-}
-
-OVERLOADABLE float log2(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log2(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log2(x);
-
-  return  __gen_ocl_internal_log2(x);
-}
-
-OVERLOADABLE float log10(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_log10(x);
-
-  /* Use native instruction when it has enough precision */
-  if((x > 0x1.1p0) || (x <= 0))
-    return __gen_ocl_internal_fastpath_log10(x);
-
-  return  __gen_ocl_internal_log10(x);
-}
-
-OVERLOADABLE float exp(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_exp(x);
-
-  /* Use native instruction when it has enough precision */
-  if (x > -0x1.6p1 && x < 0x1.6p1)
-    return __gen_ocl_internal_fastpath_exp(x);
-
-  return  __gen_ocl_internal_exp(x);
-}
-
-OVERLOADABLE float exp2(float x) {
-  /* Use native instruction when it has enough precision, exp2 always */
-  return native_exp2(x);
-}
-
-OVERLOADABLE float exp10(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_exp10(x);
-
-  return  __gen_ocl_internal_exp10(x);
-}
-
-OVERLOADABLE float expm1(float x) {
-  if (__ocl_math_fastpath_flag)
-    return __gen_ocl_internal_fastpath_expm1(x);
-
-  return  __gen_ocl_internal_expm1(x);
-}
-
-OVERLOADABLE float fmin(float a, float b) {
-  return __gen_ocl_internal_fmin(a, b);
-}
-
-OVERLOADABLE float fmax(float a, float b) {
-  return __gen_ocl_internal_fmax(a, b);
-}
-
-OVERLOADABLE float fma(float a, float b, float c) {
-  return mad(a, b, c);
-}
-
-OVERLOADABLE float fdim(float x, float y) {
-  return __gen_ocl_internal_fdim(x, y);
-}
-
-OVERLOADABLE float maxmag(float x, float y) {
-  return __gen_ocl_internal_maxmag(x, y);
-}
-
-OVERLOADABLE float minmag(float x, float y) {
-  return __gen_ocl_internal_minmag(x, y);
-}
-
-
-/* So far, the HW do not support half float math function.
-   We just do the conversion and call the float version here. */
-OVERLOADABLE half cospi(half x) {
-  float _x = (float)x;
-  return (half)cospi(_x);
-}
-OVERLOADABLE half cosh(half x) {
-  float _x = (float)x;
-  return (half)cosh(_x);
-}
-OVERLOADABLE half acos(half x) {
-  float _x = (float)x;
-  return (half)acos(_x);
-}
-OVERLOADABLE float half_cos(float x) {
-  return (float)cos(x);
-}
-OVERLOADABLE float half_divide(float x, float y) {
-  return (float)native_divide(x, y);
-}
-OVERLOADABLE float half_exp(float x) {
-  return (float)native_exp(x);
-}
-OVERLOADABLE float half_exp2(float x){
-  return (float)native_exp2(x);
-}
-OVERLOADABLE float half_exp10(float x){
-  return (float)native_exp10(x);
-}
-OVERLOADABLE float half_log(float x){
-  return (float)native_log(x);
-}
-OVERLOADABLE float half_log2(float x){
-  return (float)native_log2(x);
-}
-OVERLOADABLE float half_log10(float x){
-  return (float)native_log10(x);
-}
-OVERLOADABLE float half_powr(float x, float y){
-  return (float)powr(x, y);
-}
-OVERLOADABLE float half_recip(float x){
-  return (float)native_recip(x);
-}
-OVERLOADABLE float half_rsqrt(float x){
-  return (float)native_rsqrt(x);
-}
-OVERLOADABLE float half_sin(float x){
-  return (float)sin(x);
-}
-OVERLOADABLE float half_sqrt(float x){
-  return (float)native_sqrt(x);
-}
-OVERLOADABLE float half_tan(float x){
-  return (float)tan(x);
-}
-OVERLOADABLE half acospi(half x) {
-  float _x = (float)x;
-  return (half)acospi(_x);
-}
-OVERLOADABLE half acosh(half x) {
-  float _x = (float)x;
-  return (half)acosh(_x);
-}
-OVERLOADABLE half sinpi(half x) {
-  float _x = (float)x;
-  return (half)sinpi(_x);
-}
-OVERLOADABLE half sinh(half x) {
-  float _x = (float)x;
-  return (half)sinh(_x);
-}
-OVERLOADABLE half asin(half x) {
-  float _x = (float)x;
-  return (half)asin(_x);
-}
-OVERLOADABLE half asinpi(half x) {
-  float _x = (float)x;
-  return (half)asinpi(_x);
-}
-OVERLOADABLE half asinh(half x) {
-  float _x = (float)x;
-  return (half)asinh(_x);
-}
-OVERLOADABLE half tanpi(half x) {
-  float _x = (float)x;
-  return (half)tanpi(_x);
-}
-OVERLOADABLE half tanh(half x) {
-  float _x = (float)x;
-  return (half)tanh(_x);
-}
-OVERLOADABLE half atan(half x) {
-  float _x = (float)x;
-  return (half)atan(_x);
-}
-OVERLOADABLE half atan2(half y, half x) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)atan2(_x, _y);
-}
-OVERLOADABLE half atan2pi(half y, half x) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)atan2pi(_x, _y);
-}
-OVERLOADABLE half atanpi(half x) {
-  float _x = (float)x;
-  return (half)atanpi(_x);
-}
-OVERLOADABLE half atanh(half x) {
-  float _x = (float)x;
-  return (half)atanh(_x);
-}
-OVERLOADABLE half cbrt(half x) {
-  float _x = (float)x;
-  return (half)cbrt(_x);
-}
-OVERLOADABLE half rint(half x) {
-  float _x = (float)x;
-  return (half)rint(_x);
-}
-OVERLOADABLE half copysign(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)copysign(_x, _y);
-}
-OVERLOADABLE half erf(half x) {
-  float _x = (float)x;
-  return (half)erf(_x);
-}
-OVERLOADABLE half erfc(half x) {
-  float _x = (float)x;
-  return (half)erfc(_x);
-}
-OVERLOADABLE half fmod(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)fmod(_x, _y);
-}
-OVERLOADABLE half remainder(half x, half p) {
-  float _x = (float)x;
-  float _p = (float)p;
-  return (half)remainder(_x, _p);
-}
-OVERLOADABLE half ldexp(half x, int n) {
-  float _x = (float)x;
-  return (half)ldexp(_x, n);
-}
-OVERLOADABLE half powr(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)powr(_x, _y);
-}
-OVERLOADABLE half pow(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)pow(_x, _y);
-}
-//no pow, we use powr instead
-OVERLOADABLE half fabs(half x) {
-  float _x = (float)x;
-  return (half)fabs(_x);
-}
-OVERLOADABLE half trunc(half x) {
-  float _x = (float)x;
-  return (half)trunc(_x);
-}
-OVERLOADABLE half round(half x) {
-  float _x = (float)x;
-  return (half)round(_x);
-}
-OVERLOADABLE half floor(half x) {
-  float _x = (float)x;
-  return (half)floor(_x);
-}
-OVERLOADABLE half ceil(half x) {
-  float _x = (float)x;
-  return (half)ceil(_x);
-}
-OVERLOADABLE half log(half x) {
-  float _x = (float)x;
-  return (half)log(_x);
-}
-OVERLOADABLE half log2(half x) {
-  float _x = (float)x;
-  return (half)log2(_x);
-}
-OVERLOADABLE half log10(half x) {
-  float _x = (float)x;
-  return (half)log10(_x);
-}
-OVERLOADABLE half exp(half x) {
-  float _x = (float)x;
-  return (half)exp(_x);
-}
-OVERLOADABLE half exp10(half x) {
-  float _x = (float)x;
-  return (half)exp10(_x);
-}
-OVERLOADABLE half expm1(half x) {
-  float _x = (float)x;
-  return (half)expm1(_x);
-}
-OVERLOADABLE half fmin(half a, half b) {
-  return __gen_ocl_internal_fmin(a, b);
-}
-OVERLOADABLE half fmax(half a, half b) {
-  return __gen_ocl_internal_fmax(a, b);
-}
-OVERLOADABLE half fma(half a, half b, half c) {
-  float _a = (float)a;
-  float _b = (float)b;
-  float _c = (float)c;
-  return (half)fma(_a, _b, _c);
-}
-OVERLOADABLE half fdim(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)fdim(_x, _y);
-}
-OVERLOADABLE half maxmag(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)maxmag(_x, _y);
-}
-OVERLOADABLE half minmag(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)minmag(_x, _y);
-}
-OVERLOADABLE half exp2(half x) {
-  float _x = (float)x;
-  return (half)exp2(_x);
-}
-OVERLOADABLE half mad(half a, half b, half c) {
-  return __gen_ocl_mad(a,b,c);
-}
-OVERLOADABLE half sin(half x) {
-  float _x = (float)x;
-  return (half)sin(_x);
-}
-OVERLOADABLE half cos(half x) {
-  float _x = (float)x;
-  return (half)cos(_x);
-}
-OVERLOADABLE half tan(half x) {
-  float _x = (float)x;
-  return (half)tan(_x);
-}
-OVERLOADABLE half tgamma(half x) {
-  float _x = (float)x;
-  return (half)tgamma(_x);
-}
-OVERLOADABLE half lgamma(half x) {
-  float _x = (float)x;
-  return (half)lgamma(_x);
-}
-OVERLOADABLE half lgamma_r(half x, int *signgamp) {
-  float _x = (float)x;
-  return (half)lgamma_r(_x, signgamp);
-}
-OVERLOADABLE half log1p(half x) {
-  float _x = (float)x;
-  return (half)log1p(_x);
-}
-OVERLOADABLE half logb(half x) {
-  float _x = (float)x;
-  return (half)logb(_x);
-}
-OVERLOADABLE int ilogb(half x) {
-  float _x = (float)x;
-  return ilogb(_x);
-}
-OVERLOADABLE half nan(ushort code) {
-  return (half)NAN;
-}
-
-OVERLOADABLE half sincos(half x, half *cosval) {
-  float _x = (float)x;
-  float _cosval;
-  half ret = (half)sincos(_x, &_cosval);
-  *cosval = (half)_cosval;
-  return ret;
-}
-
-OVERLOADABLE half sqrt(half x) {
-  float _x = (float)x;
-  return (half)sqrt(_x);
-}
-OVERLOADABLE half rsqrt(half x) {
-  float _x = (float)x;
-  return (half)rsqrt(_x);
-}
-OVERLOADABLE half frexp(half x, int *exp) {
-  float _x = (float)x;
-  return (half)frexp(_x, exp);
-}
-OVERLOADABLE half nextafter(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)nextafter(_x, _y);
-}
-
-OVERLOADABLE half modf(half x, half *i) {
-  float _x = (float)x;
-  float _i;
-  half ret = (half)modf(_x, &_i);
-  *i = (half)_i;
-  return ret;
-}
-
-OVERLOADABLE half hypot(half x, half y) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)hypot(_x, _y);
-}
-
-OVERLOADABLE half fract(half x, half *p) {
-  float _x = (float)x;
-  float _p;
-  half ret = (half)fract(_x, &_p);
-  *p = (half)_p;
-  return ret;
-}
-
-OVERLOADABLE half remquo(half x, half y, int *quo) {
-  float _x = (float)x;
-  float _y = (float)y;
-  return (half)remquo(_x, _y, quo);
-}
-
-OVERLOADABLE half pown(half x, int n) {
-  float _x = (float)x;
-  return (half)pown(_x, n);
-}
-OVERLOADABLE half rootn(half x, int n) {
-  float _x = (float)x;
-  return (half)rootn(_x, n);
-}
-
 
 //-----------------double -----------------------
 INLINE int  __HI(double x){
diff --git a/backend/src/libocl/tmpl/ocl_math_20.tmpl.h b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
index c384b51b..ad68de42 100644
--- a/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math_20.tmpl.h
@@ -21,194 +21,19 @@
 #include "ocl_types.h"
 #include "ocl_math_common.h"
 
-OVERLOADABLE float cospi(float x);
-OVERLOADABLE float cosh(float x);
-OVERLOADABLE float acos(float x);
-OVERLOADABLE float acospi(float x);
-OVERLOADABLE float acosh(float x);
-OVERLOADABLE float sinpi(float x);
-OVERLOADABLE float sinh(float x);
-OVERLOADABLE float asin(float x);
-OVERLOADABLE float asinpi(float x);
-OVERLOADABLE float asinh(float x);
-OVERLOADABLE float tanpi(float x);
-OVERLOADABLE float tanh(float x);
-OVERLOADABLE float atan(float x);
-OVERLOADABLE float atan2(float y, float x);
-OVERLOADABLE float atan2pi(float y, float x);
-OVERLOADABLE float atanpi(float x);
-OVERLOADABLE float atanh(float x);
-OVERLOADABLE float cbrt(float x);
-OVERLOADABLE float rint(float x);
-OVERLOADABLE float copysign(float x, float y);
-OVERLOADABLE float erf(float x);
-OVERLOADABLE float erfc(float x);
-OVERLOADABLE float fmod (float x, float y);
-OVERLOADABLE float remainder(float x, float p);
-OVERLOADABLE float ldexp(float x, int n);
-OVERLOADABLE float powr(float x, float y);
-OVERLOADABLE float pow(float x, float y);
-//no pow, we use powr instead
-OVERLOADABLE float fabs(float x);
-OVERLOADABLE float trunc(float x);
-OVERLOADABLE float round(float x);
-OVERLOADABLE float floor(float x);
-OVERLOADABLE float ceil(float x);
-OVERLOADABLE float log(float x);
-OVERLOADABLE float log2(float x);
-OVERLOADABLE float log10(float x);
-OVERLOADABLE float exp(float x);
-OVERLOADABLE float exp10(float x);
-OVERLOADABLE float expm1(float x);
-OVERLOADABLE float fmin(float a, float b);
-OVERLOADABLE float fmax(float a, float b);
-OVERLOADABLE float fma(float a, float b, float c);
-OVERLOADABLE float fdim(float x, float y);
-OVERLOADABLE float maxmag(float x, float y);
-OVERLOADABLE float minmag(float x, float y);
-OVERLOADABLE float exp2(float x);
-OVERLOADABLE float mad(float a, float b, float c);
-OVERLOADABLE float sin(float x);
-OVERLOADABLE float cos(float x);
-OVERLOADABLE float tan(float x);
-OVERLOADABLE float tgamma(float x);
-OVERLOADABLE float lgamma(float x);
 OVERLOADABLE float lgamma_r(float x, int *signgamp);
-OVERLOADABLE float log1p(float x);
-OVERLOADABLE float logb(float x);
-OVERLOADABLE int ilogb(float x);
-OVERLOADABLE float nan(uint code);
 OVERLOADABLE float sincos(float x, float *cosval);
-OVERLOADABLE float sqrt(float x);
-OVERLOADABLE float rsqrt(float x);
-OVERLOADABLE float frexp(float x, int *exp);
-OVERLOADABLE float nextafter(float x, float y);
 OVERLOADABLE float modf(float x, float *i);
-OVERLOADABLE float hypot(float x, float y);
+OVERLOADABLE float frexp(float x, int *exp);
 OVERLOADABLE float fract(float x, float *p);
 OVERLOADABLE float remquo(float x, float y, int *quo);
-OVERLOADABLE float pown(float x, int n);
-OVERLOADABLE float rootn(float x, int n);
-
-// native
-OVERLOADABLE float native_cos(float x);
-OVERLOADABLE float native_divide(float x, float y);
-OVERLOADABLE float native_exp(float x);
-OVERLOADABLE float native_exp2(float x);
-OVERLOADABLE float native_exp10(float x);
-OVERLOADABLE float native_log(float x);
-OVERLOADABLE float native_log2(float x);
-OVERLOADABLE float native_log10(float x);
-OVERLOADABLE float native_powr(float x, float y);
-OVERLOADABLE float native_recip(float x);
-OVERLOADABLE float native_rsqrt(float x);
-OVERLOADABLE float native_sin(float x);
-OVERLOADABLE float native_sqrt(float x);
-OVERLOADABLE float native_tan(float x);
-
 
-// Half float version.
-OVERLOADABLE half cospi(half x);
-OVERLOADABLE half cosh(half x);
-OVERLOADABLE half acos(half x);
-OVERLOADABLE half acospi(half x);
-OVERLOADABLE half acosh(half x);
-OVERLOADABLE half sinpi(half x);
-OVERLOADABLE half sinh(half x);
-OVERLOADABLE half asin(half x);
-OVERLOADABLE half asinpi(half x);
-OVERLOADABLE half asinh(half x);
-OVERLOADABLE half tanpi(half x);
-OVERLOADABLE half tanh(half x);
-OVERLOADABLE half atan(half x);
-OVERLOADABLE half atan2(half y, half x);
-OVERLOADABLE half atan2pi(half y, half x);
-OVERLOADABLE half atanpi(half x);
-OVERLOADABLE half atanh(half x);
-OVERLOADABLE half cbrt(half x);
-OVERLOADABLE half rint(half x);
-OVERLOADABLE half copysign(half x, half y);
-OVERLOADABLE half erf(half x);
-OVERLOADABLE half erfc(half x);
-OVERLOADABLE half fmod (half x, half y);
-OVERLOADABLE half remainder(half x, half p);
-OVERLOADABLE half ldexp(half x, int n);
-OVERLOADABLE half powr(half x, half y);
-OVERLOADABLE half pow(half x, half y);
-//no pow, we use powr instead
-OVERLOADABLE half fabs(half x);
-OVERLOADABLE half trunc(half x);
-OVERLOADABLE half round(half x);
-OVERLOADABLE half floor(half x);
-OVERLOADABLE half ceil(half x);
-OVERLOADABLE half log(half x);
-OVERLOADABLE half log2(half x);
-OVERLOADABLE half log10(half x);
-OVERLOADABLE half exp(half x);
-OVERLOADABLE half exp10(half x);
-OVERLOADABLE half expm1(half x);
-OVERLOADABLE half fmin(half a, half b);
-OVERLOADABLE half fmax(half a, half b);
-OVERLOADABLE half fma(half a, half b, half c);
-OVERLOADABLE half fdim(half x, half y);
-OVERLOADABLE half maxmag(half x, half y);
-OVERLOADABLE half minmag(half x, half y);
-OVERLOADABLE half exp2(half x);
-OVERLOADABLE half mad(half a, half b, half c);
-OVERLOADABLE half sin(half x);
-OVERLOADABLE half cos(half x);
-OVERLOADABLE half tan(half x);
-OVERLOADABLE half tgamma(half x);
-OVERLOADABLE half lgamma(half x);
 OVERLOADABLE half lgamma_r(half x, int *signgamp);
-OVERLOADABLE half log1p(half x);
-OVERLOADABLE half logb(half x);
-OVERLOADABLE int ilogb(half x);
-OVERLOADABLE half nan(ushort code);
 OVERLOADABLE half sincos(half x, half *cosval);
-OVERLOADABLE half sqrt(half x);
-OVERLOADABLE half rsqrt(half x);
 OVERLOADABLE half frexp(half x, int *exp);
-OVERLOADABLE half nextafter(half x, half y);
 OVERLOADABLE half modf(half x, half *i);
-OVERLOADABLE half hypot(half x, half y);
 OVERLOADABLE half fract(half x, half *p);
 OVERLOADABLE half remquo(half x, half y, int *quo);
-OVERLOADABLE half pown(half x, int n);
-OVERLOADABLE half rootn(half x, int n);
-
-// native half
-OVERLOADABLE half native_cos(half x);
-OVERLOADABLE half native_divide(half x, half y);
-OVERLOADABLE half native_exp(half x);
-OVERLOADABLE half native_exp2(half x);
-OVERLOADABLE half native_exp10(half x);
-OVERLOADABLE half native_log(half x);
-OVERLOADABLE half native_log2(half x);
-OVERLOADABLE half native_log10(half x);
-OVERLOADABLE half native_powr(half x, half y);
-OVERLOADABLE half native_recip(half x);
-OVERLOADABLE half native_rsqrt(half x);
-OVERLOADABLE half native_sin(half x);
-OVERLOADABLE half native_sqrt(half x);
-OVERLOADABLE half native_tan(half x);
-
-// half accuracy
-OVERLOADABLE float half_cos(float x);
-OVERLOADABLE float half_divide(float x, float y);
-OVERLOADABLE float half_exp(float x);
-OVERLOADABLE float half_exp2(float x);
-OVERLOADABLE float half_exp10(float x);
-OVERLOADABLE float half_log(float x);
-OVERLOADABLE float half_log2(float x);
-OVERLOADABLE float half_log10(float x);
-OVERLOADABLE float half_powr(float x, float y);
-OVERLOADABLE float half_recip(float x);
-OVERLOADABLE float half_rsqrt(float x);
-OVERLOADABLE float half_sin(float x);
-OVERLOADABLE float half_sqrt(float x);
-OVERLOADABLE float half_tan(float x);
-
 
 //------- double -----------
 OVERLOADABLE double fract(double x, double *p);
diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
index 63dd202d..e82b7f55 100644
--- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2012 - 2014 Intel Corporation
+ * Copyright © 2012 - 2017 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -32,6 +32,3440 @@
  * is preserved.
  * ====================================================
  */
+
+extern constant int __ocl_math_fastpath_flag;
+
+CONST float __gen_ocl_fabs(float x) __asm("llvm.fabs" ".f32");
+CONST float __gen_ocl_sin(float x) __asm("llvm.sin" ".f32");
+CONST float __gen_ocl_cos(float x) __asm("llvm.cos" ".f32");
+CONST float __gen_ocl_sqrt(float x) __asm("llvm.sqrt" ".f32");
+PURE CONST float __gen_ocl_rsqrt(float x);
+CONST float __gen_ocl_log(float x) __asm("llvm.log2" ".f32");
+CONST float __gen_ocl_exp(float x) __asm("llvm.exp2" ".f32");
+PURE CONST float __gen_ocl_pow(float x, float y) __asm("llvm.pow" ".f32");
+PURE CONST float __gen_ocl_rcp(float x);
+CONST float __gen_ocl_rndz(float x) __asm("llvm.trunc" ".f32");
+CONST float __gen_ocl_rnde(float x) __asm("llvm.rint" ".f32");
+CONST float __gen_ocl_rndu(float x) __asm("llvm.ceil" ".f32");
+CONST float __gen_ocl_rndd(float x) __asm("llvm.floor" ".f32");
+
+
+/* native functions */
+OVERLOADABLE float native_cos(float x) { return __gen_ocl_cos(x); }
+OVERLOADABLE float native_sin(float x) { return __gen_ocl_sin(x); }
+OVERLOADABLE float native_sqrt(float x) { return __gen_ocl_sqrt(x); }
+OVERLOADABLE float native_rsqrt(float x) { return __gen_ocl_rsqrt(x); }
+OVERLOADABLE float native_log2(float x) { return __gen_ocl_log(x); }
+OVERLOADABLE float native_log(float x) {
+  return native_log2(x) * 0.6931472002f;
+}
+OVERLOADABLE float native_log10(float x) {
+  return native_log2(x) * 0.3010299956f;
+}
+OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); }
+OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); }
+OVERLOADABLE float native_tan(float x) {
+  return native_sin(x) / native_cos(x);
+}
+OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); }
+OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); }
+OVERLOADABLE float native_exp10(float x) { return __gen_ocl_exp(M_LOG210_F*x); }
+OVERLOADABLE float native_divide(float x, float y) { return x/y; }
+
+/* Fast path */
+OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
+    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
+    return native_log(x + native_sqrt(x * x + 1));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
+    return 0.5f * native_log((1 + x) / (1 - x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
+    return __gen_ocl_pow(x, 0.3333333333f);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
+    return native_cos(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
+    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
+    return __gen_ocl_cos(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
+    return native_exp(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
+    return native_exp10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
+    return __gen_ocl_pow(M_E_F, x) - 1;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
+    return x-y*__gen_ocl_rndz(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
+    return __gen_ocl_sqrt(x*x + y*y);
+}
+OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
+    return __gen_ocl_rndd(native_log2(x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
+    return __gen_ocl_pow(2, n) * x;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
+    return native_log(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
+    return native_log2(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
+    return native_log10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
+    return native_log(x + 1);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
+    return __gen_ocl_rndd(native_log2(x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
+    return x-y*__gen_ocl_rnde(x/y);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
+    return __gen_ocl_pow(x, 1.f / n);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
+    return native_sin(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
+    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
+    return __gen_ocl_sin(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
+    return native_tan(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
+    float y = native_exp(-2 * x);
+    return (1 - y) / (1 + y);
+}
+
+
+/* Internal implement, high accuracy. */
+OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
+OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  union { unsigned u; float f; } ux, uy;
+  ux.f = x;
+  uy.f = y;
+  ux.u = (ux.u & 0x7fffffff) | (uy.u & 0x80000000u);
+  return ux.f;
+}
+
+OVERLOADABLE float inline __gen_ocl_internal_log_valid(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union { unsigned int i; float f; } u;
+  const float
+  ln2_hi = 6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =  3.355443200e+07, /* 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /* 3E924925 */
+  Lg4 = 2.2222198546e-01; /* 3E638E29 */
+
+  const float zero   =  0.0;
+  float fsq, f, s, z, R, w, t1, t2, partial;
+  int k, ix, i, j;
+
+  u.f = x;  ix = u.i;
+  k = 0;
+
+  k += (ix>>23) - 127;
+  ix &= 0x007fffff;
+  i = (ix + (0x95f64<<3)) & 0x800000;
+  u.i = ix | (i^0x3f800000); x = u.f;
+  k += (i>>23);
+  f = x - 1.0f;
+  fsq = f * f;
+
+  if((0x007fffff & (15 + ix)) < 16) { /* |f| < 2**-20 */
+      R = fsq * (0.5f - 0.33333333333333333f * f);
+      return k * ln2_hi + k * ln2_lo + f - R;
+  }
+
+  s = f / (2.0f + f);
+  z = s * s;
+  i = ix - (0x6147a << 3);
+  w = z * z;
+  j = (0x6b851 << 3) - ix;
+  t1= w * mad(w, Lg4, Lg2);
+  t2= z * mad(w, Lg3, Lg1);
+  i |= j;
+  R = t2 + t1;
+  partial = (i > 0) ? -mad(s, 0.5f * fsq, -0.5f * fsq) : (s * f);
+
+  return mad(s, R, f) - partial + k * ln2_hi + k * ln2_lo;;
+}
+
+OVERLOADABLE float __gen_ocl_internal_log(float x)
+{
+  union { unsigned int i; float f; } u;
+  u.f = x;
+  int ix = u.i;
+
+  if (ix < 0 )
+	return NAN;  /* log(-#) = NaN */
+  if (ix >= 0x7f800000)
+    return NAN;
+
+  return __gen_ocl_internal_log_valid(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_log10(float x)
+{
+  union { float f; unsigned i; } u;
+  const float
+  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
+  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
+  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
+
+  float y, z;
+  int i, k, hx;
+
+  u.f = x; hx = u.i;
+
+  if (hx<0)
+    return NAN; /* log(-#) = NaN */
+  if (hx >= 0x7f800000)
+    return NAN;
+
+  k = (hx >> 23) - 127;
+  i  = ((unsigned)k & 0x80000000) >> 31;
+  hx = (hx&0x007fffff) | ((0x7f-i) << 23);
+  y  = (float)(k + i);
+  u.i = hx; x = u.f;
+
+  return  y * log10_2lo + y * log10_2hi + ivln10 * __gen_ocl_internal_log_valid(x);
+}
+
+
+OVERLOADABLE float __gen_ocl_internal_log2(float x)
+{
+  const float zero   =  0.0,
+  invln2 = 0x1.715476p+0f;
+  int ix;
+
+  union { float f; int i; } u;
+  u.f = x; ix = u.i;
+
+  if (ix < 0)
+	return NAN;    /** log(-#) = NaN */
+  if (ix >= 0x7f800000)
+	return NAN;
+
+  return invln2 * __gen_ocl_internal_log_valid(x);
+}
+
+
+float __gen_ocl_scalbnf (float x, int n){
+  /* copy from fdlibm */
+  float two25 = 3.355443200e+07,	/* 0x4c000000 */
+  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
+  huge = 1.0e+30,
+  tiny = 1.0e-30;
+  int k,ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  k = (ix&0x7f800000)>>23; /* extract exponent */
+  if (k==0) {	/* 0 or subnormal x */
+    if ((ix&0x7fffffff)==0) return x; /* +-0 */
+    x *= two25;
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+    k = ((ix&0x7f800000)>>23) - 25;
+  }
+  if (k==0xff) return x+x;	/* NaN or Inf */
+  if (n< -50000)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  if (n> 50000 || k+n > 0xfe)
+    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
+  /* Now k and n are bounded we know that k = k+n does not overflow. */
+  k = k+n;
+  if (k > 0) { /* normal result */
+    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+    return x;
+  }
+  if (k <= -25)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  k += 25;				/* subnormal result */
+  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+  return x*twom25;
+}
+
+const __constant unsigned int two_over_pi[] = {
+0, 0, 0xA2F, 0x983, 0x6E4, 0xe44, 0x152, 0x9FC,
+0x275, 0x7D1, 0xF53, 0x4DD, 0xC0D, 0xB62,
+0x959, 0x93C, 0x439, 0x041, 0xFE5, 0x163,
+};
+
+// The main idea is from "Radian Reduction for Trigonometric Functions"
+// written by Mary H. Payne and Robert N. Hanek. Also another reference
+// is "A Continued-Fraction Analysis of Trigonometric Argument Reduction"
+// written by Roger Alan Smith, who gave the worst case in this paper.
+// for single float, worst x = 0x1.47d0fep34, and there are 29 bit
+// leading zeros in the fraction part of x*(2.0/pi). so we need at least
+// 29 (leading zero)+ 24 (fraction )+12 (integer) + guard bits. that is,
+// 65 + guard bits, as we calculate in 12*7 = 84bits, which means we have
+// about 19 guard bits. If we need further precision, we may need more
+// guard bits
+// Note we place two 0 in two_over_pi, which is used to handle input less
+// than 0x1.0p23
+
+int payne_hanek(float x, float *y) {
+  union { float f; unsigned u;} ieee;
+  ieee.f = x;
+  unsigned u = ieee.u;
+  int k = ((u & 0x7f800000) >> 23)-127;
+  int ma = (u & 0x7fffff) | 0x800000;
+  unsigned  high, low;
+  high = (ma & 0xfff000) >> 12;
+  low = ma & 0xfff;
+
+  // Two tune below macro, you need to fully understand the algorithm
+#define CALC_BLOCKS 7
+#define ZERO_BITS 2
+
+  unsigned result[CALC_BLOCKS];
+
+  // round down, note we need 2 bits integer precision
+  int index = (k-23-2) < 0 ? (k-23-2-11)/12 : (k-23-2)/12;
+
+  for (int i = 0; i < CALC_BLOCKS; i++) {
+    result[i] =  low * two_over_pi[index+i+ZERO_BITS] ;
+    result[i] +=  high * two_over_pi[index+i+1+ZERO_BITS];
+  }
+
+  for (int i = CALC_BLOCKS-1; i > 0; i--) {
+    int temp = result[i] >> 12;
+    result[i]  -= temp << 12;
+    result[i-1] += temp;
+  }
+#undef CALC_BLOCKS
+#undef ZERO_BITS
+
+  // get number of integer digits in result[0], note we only consider 12 valid bits
+  // and also it means the fraction digits in result[0] is (12-intDigit)
+
+  int intDigit = index*(-12) + (k-23);
+
+  // As the integer bits may be all included in result[0], and also maybe
+  // some bits in result[0], and some in result[1]. So we merge succesive bits,
+  // which makes easy coding.
+
+  unsigned b0 = (result[0] << 12) | result[1];
+  unsigned b1 = (result[2] << 12) | result[3];
+  unsigned b2 = (result[4] << 12) | result[5];
+  unsigned b3 = (result[6] << 12);
+
+  unsigned intPart = b0 >> (24-intDigit);
+
+  unsigned fract1 = ((b0 << intDigit) | (b1 >> (24-intDigit))) & 0xffffff;
+  unsigned fract2 = ((b1 << intDigit) | (b2 >> (24-intDigit))) & 0xffffff;
+  unsigned fract3 = ((b2 << intDigit) | (b3 >> (24-intDigit))) & 0xffffff;
+
+  // larger than 0.5? which mean larger than pi/4, we need
+  // transform from [0,pi/2] to [-pi/4, pi/4] through -(1.0-fract)
+  int largerPiBy4 = ((fract1 & 0x800000) != 0);
+  int sign = largerPiBy4 ? 1 : 0;
+  intPart = largerPiBy4 ? (intPart+1) : intPart;
+
+  fract1 = largerPiBy4 ? (fract1 ^ 0x00ffffff) : fract1;
+  fract2 = largerPiBy4 ? (fract2 ^ 0x00ffffff) : fract2;
+  fract3 = largerPiBy4 ? (fract3 ^ 0x00ffffff) : fract3;
+
+  int leadingZero = (fract1 == 0);
+
+  // +1 is for the hidden bit 1 in floating-point format
+  int exponent = leadingZero ? -(24+1) : -(0+1);
+
+  fract1 = leadingZero ? fract2 : fract1;
+  fract2 = leadingZero ? fract3 : fract2;
+
+  // fract1 may have leading zeros, add it
+  int shift = clz(fract1)-8;
+  exponent += -shift;
+
+  float pio2 = 0x1.921fb6p+0;
+  unsigned fdigit = ((fract1 << shift) | (fract2 >> (24-shift))) & 0xffffff;
+
+  // we know that denormal number will not appear here
+  ieee.u = (sign << 31) | ((exponent+127) << 23) | (fdigit & 0x7fffff);
+  *y = ieee.f * pio2;
+  return intPart;
+}
+
+int argumentReduceSmall(float x, float * remainder) {
+  union {
+    float f;
+    unsigned u;
+  } ieee;
+
+  float twoByPi = 2.0f/3.14159265f;
+  float piBy2_1h = (float) 0xc90/0x1.0p11,
+        piBy2_1l = (float) 0xfda/0x1.0p23,
+        piBy2_2h = (float) 0xa22/0x1.0p35,
+        piBy2_2l = (float) 0x168/0x1.0p47,
+        piBy2_3h = (float) 0xc23/0x1.0p59,
+        piBy2_3l = (float) 0x4c4/0x1.0p71;
+
+  float y = (float)(int)(twoByPi * x + 0.5f);
+  ieee.f = y;
+  ieee.u = ieee.u & 0xfffff000;
+
+  float yh = ieee.f;
+  float yl = y - yh;
+  float rem = x - yh*piBy2_1h - yh*piBy2_1l - yl*piBy2_1h - yl*piBy2_1l;
+  rem = rem - yh*piBy2_2h - yh*piBy2_2l + yl*piBy2_2h + yl*piBy2_2l;
+  rem = rem - yh*piBy2_3h - yh*piBy2_3l - yl*piBy2_3h - yl*piBy2_3l;
+
+  *remainder = rem;
+  return (int)y;
+}
+
+
+int __ieee754_rem_pio2f(float x, float *y) {
+  if (x < 4000.0f) {
+    return argumentReduceSmall(x, y);
+  } else {
+    return payne_hanek(x, y);
+  }
+}
+
+OVERLOADABLE float __kernel_sinf(float x)
+{
+  /* copied from fdlibm */
+  const float
+  S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+  S2  =  8.3333337680e-03, /* 0x3c088889 */
+  S3  = -1.9841270114e-04, /* 0xb9500d01 */
+  S4  =  2.7557314297e-06; /* 0x3638ef1b */
+  float z,r,v;
+  z =  x*x;
+  v =  z*x;
+  r = mad(z, mad(z, mad(z, S4, S3), S2), S1);
+
+  return mad(v, r, x);
+}
+
+float __kernel_cosf(float x, float y)
+{
+  /* copied from fdlibm */
+  const float
+  one =  1.0000000000e+00, /* 0x3f800000 */
+  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
+  C2  = -1.3888889225e-03, /* 0xbab60b61 */
+  C3  =  2.4801587642e-05; /* 0x37d00d01 */
+  float a,hz,z,r,qx;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* ix = |x|'s high word*/
+  z  = x*x;
+  r = z * mad(z, mad(z, C3, C2), C1);
+
+  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
+      return one - ((float)0.5*z - (z*r - x*y));
+  else {
+      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+      hz = (float)0.5*z-qx;
+      a  = one-qx;
+      return a - (hz - (z*r-x*y));
+  }
+}
+
+OVERLOADABLE float sin(float x)
+{
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sin(x);
+
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
+  float y,z=0.0;
+  int n, ix;
+
+  float negative = x < 0.0f? -1.0f : 1.0f;
+  x = fabs(x);
+
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+
+    /* sin(Inf or NaN) is NaN */
+  if (ix >= 0x7f800000) return x-x;
+
+  if(x <= pio4)
+	  return negative * __kernel_sinf(x);
+  /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,&y);
+      float s = __kernel_sinf(y);
+      float c = __kernel_cosf(y,0.0f);
+      float ret = (n&1) ? negative*c : negative*s;
+      return (n&3)> 1? -1.0f*ret : ret;
+  }
+}
+
+OVERLOADABLE float cos(float x)
+{
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cos(x);
+
+  const float pio4  =  7.8539812565e-01; /* 0x3f490fda */
+  float y,z=0.0;
+  int n, ix;
+  x = __gen_ocl_fabs(x);
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+  ix &= 0x7fffffff;
+
+    /* cos(Inf or NaN) is NaN */
+  if (ix >= 0x7f800000) return x-x;
+
+  if(x <= pio4)
+	  return __kernel_cosf(x, 0.f);
+  /* argument reduction needed */
+  else {
+      n = __ieee754_rem_pio2f(x,&y);
+      n &= 3;
+      float c = __kernel_cosf(y, 0.0f);
+      float s = __kernel_sinf(y);
+      float v = (n&1) ? s : c;
+      /* n&3   return
+          0    cos(y)
+          1   -sin(y)
+          2   -cos(y)
+          3    sin(y)
+      */
+      int mask = (n>>1) ^ n;
+      float sign = (mask&1) ? -1.0f : 1.0f;
+      return sign * v;
+  }
+}
+
+float __kernel_tanf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+        float z,r,v,w,s;
+        int ix,hx;
+        const float
+        one   =  1.0000000000e+00, /* 0x3f800000 */
+        pio4  =  7.8539812565e-01, /* 0x3f490fda */
+        pio4lo=  3.7748947079e-08; /* 0x33222168 */
+        float T[13];// =  {
+         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+         T[1] = 1.3333334029e-01; /* 0x3e088889 */
+         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+         T[4] = 8.8632395491e-03; /* 0x3c11371f */
+         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+         T[6] = 1.4562094584e-03; /* 0x3abede48 */
+         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+
+        GEN_OCL_GET_FLOAT_WORD(hx,x);
+        ix = hx&0x7fffffff;     /* high word of |x| */
+        if(ix<0x31800000)                       /* x < 2**-28 */
+            {if((int)x==0) {                    /* generate inexact */
+                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+                else return (iy==1)? x: -one/x;
+            }
+            }
+        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
+            if(hx<0) {x = -x; y = -y;}
+            z = pio4-x;
+            w = pio4lo-y;
+            x = z+w; y = 0.0;
+        }
+        z       =  x*x;
+        w       =  z*z;
+		/* Break x^5*(T[1]+x^2*T[2]+...) into
+		 *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+		 *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+		 */
+
+        r = mad(w, mad(w, mad(w, T[7], T[5]), T[3]), T[1]);
+        v = z* mad(w, mad(w, T[6], T[4]), T[2]);
+
+        s = z*x;
+        r = mad(z, mad(s, r + v, y), y);
+        r += T[0]*s;
+        w = x+r;
+        if(ix>=0x3f2ca140) {
+            v = (float)iy;
+            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+        }
+        if(iy==1) return w;
+        else
+        	return -1.0/(x+r);
+}
+
+OVERLOADABLE float tan(float x)
+{
+    if (__ocl_math_fastpath_flag)
+      return __gen_ocl_internal_fastpath_tan(x);
+
+    float y,z=0.0;
+    int n, ix;
+    float negative = x < 0.0f? -1.0f : 1.0f;
+    x = negative * x;
+
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    ix &= 0x7fffffff;
+
+    /* tan(Inf or NaN) is NaN */
+    if (ix>=0x7f800000) return x-x;            /* NaN */
+
+    /* argument reduction needed */
+    else {
+      n = __ieee754_rem_pio2f(x,&y);
+      return negative * __kernel_tanf(y,0.0f,1-((n&1)<<1)); /*   1 -- n even
+                                                              -1 -- n odd */
+    }
+}
+
+OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  int ix;
+  if(isinf(x) || isnan(x)) { return NAN; }
+  if(x < 0.0f) { x = -x; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 1.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return __kernel_cosf(m*M_PI_F, 0.0f);
+   case 1:
+   case 2:
+    return __kernel_sinf((0.5f-m)*M_PI_F);
+   case 3:
+   case 4:
+    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+   case 5:
+   case 6:
+    return __kernel_sinf((m-1.5f)*M_PI_F);
+   default:
+    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+   }
+}
+
+OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  if((ix&0x1) != 0) m+=1.0f;
+    ix = __gen_ocl_internal_floor(m*4.0f);
+
+  switch(ix) {
+   case 0:
+    return sign*__kernel_sinf(m*M_PI_F);
+   case 1:
+   case 2:
+    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+   case 3:
+   case 4:
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F);
+   case 5:
+   case 6:
+    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+   default:
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F);
+   }
+
+}
+
+OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+    const float
+        zero=  0.,
+        one =  1.0000000000e+00,
+        pi  =  3.1415927410e+00,
+        a0  =  7.7215664089e-02,
+        a1  =  3.2246702909e-01,
+        a2  =  6.7352302372e-02,
+        a3  =  2.0580807701e-02,
+        a4  =  7.3855509982e-03,
+        a5  =  2.8905137442e-03,
+        a6  =  1.1927076848e-03,
+        a7  =  5.1006977446e-04,
+        a8  =  2.2086278477e-04,
+        a9  =  1.0801156895e-04,
+        a10 =  2.5214456400e-05,
+        a11 =  4.4864096708e-05,
+        tc  =  1.4616321325e+00,
+        tf  = -1.2148628384e-01,
+        tt  =  6.6971006518e-09,
+        t0  =  4.8383611441e-01,
+        t1  = -1.4758771658e-01,
+        t2  =  6.4624942839e-02,
+        t3  = -3.2788541168e-02,
+        t4  =  1.7970675603e-02,
+        t5  = -1.0314224288e-02,
+        t6  =  6.1005386524e-03,
+        t7  = -3.6845202558e-03,
+        t8  =  2.2596477065e-03,
+        t9  = -1.4034647029e-03,
+        t10 =  8.8108185446e-04,
+        t11 = -5.3859531181e-04,
+        t12 =  3.1563205994e-04,
+        t13 = -3.1275415677e-04,
+        t14 =  3.3552918467e-04,
+        u0  = -7.7215664089e-02,
+        u1  =  6.3282704353e-01,
+        u2  =  1.4549225569e+00,
+        u3  =  9.7771751881e-01,
+        u4  =  2.2896373272e-01,
+        u5  =  1.3381091878e-02,
+        v1  =  2.4559779167e+00,
+        v2  =  2.1284897327e+00,
+        v3  =  7.6928514242e-01,
+        v4  =  1.0422264785e-01,
+        v5  =  3.2170924824e-03,
+        s0  = -7.7215664089e-02,
+        s1  =  2.1498242021e-01,
+        s2  =  3.2577878237e-01,
+        s3  =  1.4635047317e-01,
+        s4  =  2.6642270386e-02,
+        s5  =  1.8402845599e-03,
+        s6  =  3.1947532989e-05,
+        r1  =  1.3920053244e+00,
+        r2  =  7.2193557024e-01,
+        r3  =  1.7193385959e-01,
+        r4  =  1.8645919859e-02,
+        r5  =  7.7794247773e-04,
+        r6  =  7.3266842264e-06,
+        w0  =  4.1893854737e-01,
+        w1  =  8.3333335817e-02,
+        w2  = -2.7777778450e-03,
+        w3  =  7.9365057172e-04,
+        w4  = -5.9518753551e-04,
+        w5  =  8.3633989561e-04,
+        w6  = -1.6309292987e-03;
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+	int i, hx, ix;
+	nadj = 0;
+	hx = *(int *)&x;
+	ix = hx & 0x7fffffff;
+	if (ix >= 0x7f800000)
+		return x * x;
+	if (ix == 0)
+		return ((x + one) / zero);
+	if (ix < 0x1c800000) {
+		if (hx < 0) {
+			return -native_log(-x);
+		} else
+			return -native_log(x);
+	}
+	if (hx < 0) {
+		if (ix >= 0x4b000000)
+			return ((-x) / zero);
+		t = __gen_ocl_internal_sinpi(x);
+		if (t == zero)
+			return ((-x) / zero);
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));
+		x = -x;
+	}
+	if (ix == 0x3f800000 || ix == 0x40000000)
+		r = 0;
+	else if (ix < 0x40000000) {
+		if (ix <= 0x3f666666) {
+			r = -native_log(x);
+			if (ix >= 0x3f3b4a20) {
+				y = one - x;
+				i = 0;
+			} else if (ix >= 0x3e6d3308) {
+				y = x - (tc - one);
+				i = 1;
+			} else {
+				y = x;
+				i = 2;
+			}
+		} else {
+			r = zero;
+			if (ix >= 0x3fdda618) {
+				y = (float) 2.0 - x;
+				i = 0;
+			}
+			else if (ix >= 0x3F9da620) {
+				y = x - tc;
+				i = 1;
+			}
+			else {
+				y = x - one;
+				i = 2;
+			}
+		}
+		switch (i) {
+		case 0:
+			z = y * y;
+			p1 = mad(z, mad(z, mad(z, mad(z, mad(z, a10, a8), a6), a4), a2), a0);
+			p2 = z * mad(z, mad(z, mad(z, mad(z, mad(z, a11, a9), a7), a5), a3), a1);
+			p = mad(y, p1, p2);
+			r += (p - (float) 0.5 * y);
+			break;
+		case 1:
+			z = y * y;
+			w = z * y;
+			p1 = mad(w, mad(w, mad(w, mad(w, t12, t9), t6), t3), t0);
+			p2 = mad(w, mad(w, mad(w, mad(w, t13, t10), t7), t4), t1);
+			p3 = mad(w, mad(w, mad(w, mad(w, t14, t11), t8), t5), t2);
+			p = mad(p1, z, mad(w, mad(y, p3, p2), -tt));
+			r += (tf + p);
+			break;
+		case 2:
+			p1 = y * mad(y, mad(y, mad(y, mad(y, mad(y, u5, u4), u3), u2), u1), u0);
+			p2 = mad(y, mad(y, mad(y, mad(y, mad(y, v5, v4), v3), v2), v1), one);
+			r += (-(float) 0.5 * y + p1 / p2);
+		}
+	} else if (ix < 0x41000000) {
+		i = (int) x;
+		t = zero;
+		y = x - (float) i;
+
+		p =y * mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, s6, s5), s4), s3), s2), s1), s0);
+		q = mad(y, mad(y, mad(y, mad(y, mad(y, mad(y, r6, r5), r4), r3), r2), r1), one);
+		r = .5f * y + p / q;
+		z = one;
+
+		switch (i) {
+		case 7:
+			z *= (y + 6.0f);
+		case 6:
+			z *= (y + 5.0f);
+		case 5:
+			z *= (y + 4.0f);
+		case 4:
+			z *= (y + 3.0f);
+		case 3:
+			z *= (y + 2.0f);
+			r += native_log(z);
+			break;
+		}
+
+	} else if (ix < 0x5c800000) {
+		t = native_log(x);
+		z = one / x;
+		y = z * z;
+		w = mad(z, mad(y, mad(y, mad(y, mad(y, mad(y, w6, w5), w4), w3), w2), w1), w0);
+		r = (x - .5f) * (t - one) + w;
+	} else
+		r = x * (native_log(x) - one);
+	if (hx < 0)
+		r = nadj - r;
+	return r;
+}
+
+OVERLOADABLE float log1p(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log1p(x);
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lp3 = 2.8571429849e-01, /* 3E924925 */
+  Lp4 = 2.2222198546e-01; /* 3E638E29 */
+  const float zero = 0.0;
+  float hfsq,f,c,s,z,R,u;
+  int k,hx,hu,ax;
+  union {float f; unsigned i;} un;
+  un.f = x;  hx = un.i;
+  ax = hx&0x7fffffff;
+
+  k = 1;
+  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
+      if(ax>=0x3f800000) {    /* x <= -1.0 */
+    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=+inf */
+    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
+      }
+      if(ax<0x31000000) {     /* |x| < 2**-29 */
+    if(two25+x>zero     /* raise inexact */
+              &&ax<0x24800000)    /* |x| < 2**-54 */
+        return x;
+    else
+        return x - x*x*(float)0.5;
+      }
+      if(hx>0||hx<=((int)0xbe95f61f)) {
+    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
+  }
+  if (hx >= 0x7f800000) return x+x;
+  if(k!=0) {
+      if(hx<0x5a000000) {
+    u  = (float)1.0+x;
+
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    /* correction term */
+          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+    c /= u;
+      } else {
+    u  = x;
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    c  = 0;
+      }
+      hu &= 0x007fffff;
+      if(hu<0x3504f7) {
+          un.i = hu|0x3f800000; u = un.f;/* normalize u */
+      } else {
+          k += 1;
+          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
+          hu = (0x00800000-hu)>>2;
+      }
+      f = u-(float)1.0;
+  }
+  hfsq=(float)0.5*f*f;
+  if(hu==0)
+  { /* |f| < 2**-20 */
+      if(f==zero)
+      {
+    	  if(k==0) return zero;
+    	  else {c = mad(k , ln2_lo, c); return mad(k, ln2_hi, c);}
+      }
+      R = mad(hfsq, 1.0f, -0.66666666666666666f * f);
+      if(k==0) return f-R; else
+    	  return k * ln2_hi - (R - mad(k, ln2_lo, c) - f);
+  }
+  s = f/((float)2.0+f);
+  z = s*s;
+  R = z * mad(z, mad(z, mad(z, Lp4, Lp3), Lp2), Lp1);
+  if(k==0)
+	  return f + mad(hfsq + R, s, -hfsq);
+  else
+	  return k*ln2_hi-( (hfsq - mad(s, hfsq + R, mad(k, ln2_lo, c))) - f);
+}
+
+OVERLOADABLE float logb(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_logb(x);
+
+  union {float f; unsigned i;} u;
+  u.f = x;
+  int e =  ((u.i & 0x7f800000) >> 23);
+  float r1 = e-127;
+  float r2 = -INFINITY;
+  float r3 = x*x;
+    /* sub normal or +/-0 */
+  float r = e == 0 ? r2 : r1;
+    /* inf & nan */
+  return e == 0xff ? r3 : r;
+}
+
+OVERLOADABLE int ilogb(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_ilogb(x);
+
+  union { int i; float f; } u;
+  if (isnan(x))
+    return FP_ILOGBNAN;
+  if (isinf(x))
+    return 0x7FFFFFFF;
+  u.f = x;
+  u.i &= 0x7fffffff;
+  if (u.i == 0)
+    return FP_ILOGB0;
+  if (u.i >= 0x800000)
+    return (u.i >> 23) - 127;
+  int r = -126;
+  int a = u.i & 0x7FFFFF;
+  while(a < 0x800000) {
+    a <<= 1;
+    r --;
+  }
+  return r;
+}
+OVERLOADABLE float nan(uint code) {
+  return NAN;
+}
+OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  if(isinf(x)) return NAN;
+  if(x < 0.0f) { x = -x; sign = -1.0f; }
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(x> 0x1.0p24) return 0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;
+  int n = __gen_ocl_internal_floor(m*4.0f);
+  if(m == 0.5f) {
+    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
+  }
+  if(m == 0.0f) {
+    return (ix&0x1) == 0 ? 0.0f : -0.0f;
+  }
+
+  switch(n) {
+    case 0:
+      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
+    case 1:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    case 2:
+      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
+    default:
+      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
+  }
+}
+OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  /* copied from fdlibm */
+  const unsigned
+  B1 = 709958130, /* B1 = (84+2/3-0.03306235651)*2**23 */
+  B2 = 642849266; /* B2 = (76+2/3-0.03306235651)*2**23 */
+
+  const float
+  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
+  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
+  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
+  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+
+  float r,s,t, w;
+  int hx;
+  uint sign;
+  uint high;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  sign=hx&0x80000000;     /* sign= sign(x) */
+  hx  ^=sign;
+  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+  if(hx==0)
+      return(x);    /* cbrt(0) is itself */
+
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+    /* rough cbrt to 5 bits */
+  if(hx<0x00800000)     /* subnormal number */
+    {
+    //SET_FLOAT_WORD(t,0x4b800000); /* set t= 2**24 */
+     //t*=x; GET_FLOAT_WORD(high,t); SET_FLOAT_WORD(t,high/3+B2);
+      t = (sign = 0) ? 0.0f : -0.0f;
+      return t;
+    }
+  else
+    GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+
+    /* new cbrt to 23 bits */
+  r=t*t/x;
+  s=mad(r, t, C);
+  t*=G+F/(s+E+D/s);
+    /* one step newton iteration to 53 bits with error less than 0.667 ulps */
+  s=t*t;    /* t*t is exact */
+  r=x/s;
+  w=t+t;
+  r=(r-t)/(w+r);  /* r-s is exact */
+  t=mad(t, r, t);
+
+    /* retore the sign bit */
+  GEN_OCL_GET_FLOAT_WORD(high,t);
+  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+  return(t);
+}
+
+INLINE float __gen_ocl_asin_util(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  float
+  pS0 =  1.66666666666666657415e-01,
+  pS1 = -3.25565818622400915405e-01,
+  pS2 =  2.01212532134862925881e-01,
+  pS3 = -4.00555345006794114027e-02,
+  pS4 =  7.91534994289814532176e-04,
+  qS1 = -2.40339491173441421878e+00,
+  qS2 =  2.02094576023350569471e+00,
+  qS3 = -6.88283971605453293030e-01,
+  qS4 =  7.70381505559019352791e-02;
+
+  float t = x*x;
+  float p = t * mad(t, mad(t, mad(t, mad(t, pS4, pS3), pS2), pS1), pS0);
+  float q = mad(t, mad(t, mad(t, mad(t, qS4, qS3), qS2), qS1), 1.0f);
+  float w = p / q;
+  return mad(x, w, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  uint ix;
+  union { uint i; float f; } u;
+  u.f = x;
+  ix = u.i & 0x7fffffff;
+  if(ix == 0x3f800000) {
+    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
+  }
+  if(ix > 0x3f800000) {            /* |x|>= 1 */
+    return  NAN;          /* asin(|x|>1) is NaN */
+  }
+
+  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
+    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
+  }
+
+  if(x < -0.5) {
+    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+  } else if(x > 0.5) {
+    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+  } else {
+    return __gen_ocl_asin_util(x);
+  }
+}
+OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+  return __gen_ocl_internal_asin(x) / M_PI_F;
+}
+OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+  if(x > 0.5)
+    return 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2));
+  else
+    return M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+  return __gen_ocl_internal_acos(x) / M_PI_F;
+}
+__constant float atanhi[4] = {
+  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+__constant float atanlo[4] = {
+  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
+OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  /* copied from fdlibm */
+  float aT[11];
+  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
+  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
+  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
+  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
+  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
+  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
+  const float one = 1.0, huge = 1.0e30;
+
+  float w,s1,s2,z;
+  int ix,hx,id;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
+      if(ix>0x7f800000)
+    return x+x;   /* NaN */
+      if(hx>0) return  atanhi[3]+atanlo[3];
+      else     return -atanhi[3]-atanlo[3];
+  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
+      if (ix < 0x31000000) {  /* |x| < 2^-29 */
+    if(huge+x>one) return x;  /* raise inexact */
+      }
+      id = -1;
+  } else {
+  x = __gen_ocl_fabs(x);
+  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
+      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
+    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+      } else {      /* 11/16<=|x|< 19/16 */
+    id = 1; x  = (x-one)/(x+one);
+      }
+  } else {
+      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
+    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
+      } else {      /* 2.4375 <= |x| < 2^66 */
+    id = 3; x  = -(float)1.0/x;
+      }
+  }}
+    /* end of argument reduction */
+  z = x*x;
+  w = z*z;
+    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+  s1 = z * mad(w, mad(w, mad(w, aT[6], aT[4]), aT[2]), aT[0]);
+  s2 = w * mad(w, mad(w, aT[5], aT[3]), aT[1]);
+  if (id<0) return x - x*(s1+s2);
+  else {
+      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+      return (hx<0)? -z:z;
+  }
+
+}
+OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+  return __gen_ocl_internal_atan(x) / M_PI_F;
+}
+
+// XXX work-around PTX profile
+OVERLOADABLE float sqrt(float x) { return native_sqrt(x); }
+OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); }
+
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+    const float pi = 0x1.921fb6p+1f;
+    const float piby2 = 0x1.921fb6p+0f;
+    const float piby4 = 0x1.921fb6p-1f;
+    const float threepiby4 = 0x1.2d97c8p+1f;
+
+    float ax = fabs(x);
+    float ay = fabs(y);
+    float v = min(ax, ay);
+    float u = max(ax, ay);
+
+    // Scale since u could be large, as in "regular" divide
+    float s = u > 0x1.0p+96f ? 0x1.0p-32f : 1.0f;
+    float vbyu = s * v/ (s*u);
+
+    float vbyu2 = vbyu * vbyu;
+
+    float p = mad(vbyu2, mad(vbyu2, -0x1.7e1f78p-9f, -0x1.7d1b98p-3f), -0x1.5554d0p-2f) * vbyu2 * vbyu;
+    float q = mad(vbyu2, mad(vbyu2, 0x1.1a714cp-2f, 0x1.287c56p+0f), 1.0f);
+
+    // Octant 0 result
+    float a = mad(p, 1.0f/q, vbyu);
+
+    // Fix up 3 other octants
+    float at = piby2 - a;
+    a = ay > ax ? at : a;
+    at = pi - a;
+    a = x < 0.0F ? at : a;
+
+    // y == 0 => 0 for x >= 0, pi for x < 0
+    int hx = as_int(x);
+    at = (hx < 0) ? pi : 0.0f;
+    a = y == 0.0f ? at : a;
+
+    at = x > 0.0f ? piby4 : threepiby4;
+    a = ax == INFINITY & ay == INFINITY ? at : a;
+
+    // x or y is NaN
+    a = isnan(x) | isnan(y) ? NAN : a;
+
+    // Fixup sign and return
+    return copysign(a, y);
+}
+
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+}
+OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); }
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); }
+OVERLOADABLE float __gen_ocl_internal_round(float x) {
+  float y = __gen_ocl_rndz(x);
+  if (__gen_ocl_fabs(x - y) >= 0.5f)
+    y += __gen_ocl_internal_copysign(1.f, x);
+  return y;
+}
+OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); }
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  return __gen_ocl_rnde(x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one = 1.0,
+  huge = 1.0e+30,
+  P1 = 1.6666667163e-01, /* 0x3e2aaaab */
+  P2 = -2.7777778450e-03; /* 0xbb360b61 */
+  float y,hi=0.0,lo=0.0,c,t;
+  int k=0,xsb;
+  unsigned hx;
+  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
+  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
+  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+  float half_0 = 0.5;
+  float half_1 =	-0.5;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    if(hx>0x7f800000)
+      return x+x;			/* NaN */
+    if(hx==0x7f800000)
+      return (xsb==0)? x:0.0; 	/* exp(+-inf)={inf,0} */
+    if(x > o_threshold) return huge*huge; /* overflow */
+    if(x < u_threshold) return twom100*twom100; /* underflow */
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0); lo= xsb == 1? ln2LO_1 : ln2LO_0; k = 1-xsb-xsb;
+    } else {
+      float tmp = xsb == 1 ? half_1 : half_0;
+      k  = ivln2*x+tmp;
+      t  = k;
+      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
+      lo = t*ln2LO_0;
+    }
+    x  = hi - lo;
+  }
+  else if(hx < 0x31800000)  { /* when |x|<2**-28 */
+    if(huge+x>one) return one+x;/* trigger inexact */
+  }
+  else k = 0;
+
+  /* x is now in primary range */
+  t  = x*x;
+  c  = x - t*(P1+t*P2);
+  if(k==0)
+    return one-((x*c)/(c-(float)2.0)-x);
+  else
+    y = one-((lo-(x*c)/((float)2.0-c))-hi);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+    return y*twom100;
+  }
+}
+
+/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+
+	int hx,ix,i;
+	float R,S,P,Q,s,y,z,r;
+	GEN_OCL_GET_FLOAT_WORD(hx,x);
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {		/* erf(nan)=nan */
+	    i = ((unsigned int)hx>>31)<<1;
+	    return (float)(1-i)+one/x;	/* erf(+-inf)=+-1 */
+	}
+
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x31800000) { 	/* |x|<2**-28 */
+	        if (ix < 0x04000000)
+		    /*avoid underflow */
+		    return (float)0.125*((float)8.0*x+efx8*x);
+		return x + efx*x;
+	    }
+	    z = x*x;
+	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5,qq4), qq3), qq2), qq1), one);
+	    y = r / s;
+	    return mad(x, y, x);
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+	    if(hx>=0) return erx + P/Q; else return -erx - P/Q;
+	}
+	if (ix >= 0x40c00000) {		/* inf>|x|>=6 */
+	    if(hx>=0) return one-tiny; else return tiny-one;
+	}
+	x = __gen_ocl_internal_fabs(x);
+    s = one/(x*x);
+	if(ix< 0x4036DB6E) {	/* |x| < 1/0.35 */
+	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+	} else {	/* |x| >= 1/0.35 */
+	    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+	    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+	    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+	}
+	GEN_OCL_GET_FLOAT_WORD(ix,x);
+	GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
+	r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	if(hx>=0) return one-r/x; else return  r/x-one;
+}
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+	int hx,ix;
+	float R,S,P,Q,s,y,z,r;
+	GEN_OCL_GET_FLOAT_WORD(hx,x);
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {			/* erfc(nan)=nan */
+						/* erfc(+-inf)=0,2 */
+	    return (float)(((unsigned int)hx>>31)<<1)+one/x;
+	}
+
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x23800000)  	/* |x|<2**-56 */
+		return one-x;
+	    z = x*x;
+	    r = mad(z, mad(z, mad(z, mad(z, pp4, pp3), pp2), pp1), pp0);
+	    s = mad(z, mad(z, mad(z, mad(z, mad(z, qq5, qq4), qq3), qq2), qq1), one);
+	    y = r/s;
+	    if(hx < 0x3e800000) {  	/* x<1/4 */
+		return one-(x+x*y);
+	    } else {
+		r = x*y;
+		r += (x-half_val);
+	        return half_val - r ;
+	    }
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, pa6, pa5), pa4), pa3), pa2), pa1), pa0);
+	    Q = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, qa6, qa5), qa4), qa3), qa2), qa1), one);
+	    if(hx>=0) {
+	        z  = one-erx; return z - P/Q;
+	    } else {
+		z = erx+P/Q; return one+z;
+	    }
+	}
+	if (ix < 0x41e00000) {		/* |x|<28 */
+	    x = __gen_ocl_internal_fabs(x);
+        s = one/(x*x);
+	    if(ix< 0x4036DB6D) {	/* |x| < 1/.35 ~ 2.857143*/
+		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+		    		ra7, ra6), ra5), ra4), ra3), ra2), ra1), ra0);
+		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+		    		sa8, sa7), sa6), sa5), sa4), sa3), sa2), sa1), one);
+	    } else {			/* |x| >= 1/.35 ~ 2.857143 */
+		if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
+		    R = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+		    		rb6, rb5), rb4), rb3), rb2), rb1), rb0);
+		    S = mad(s, mad(s, mad(s, mad(s, mad(s, mad(s, mad(s,
+		    		sb7, sb6), sb5), sb4), sb3), sb2), sb1), one);
+	    }
+	    GEN_OCL_GET_FLOAT_WORD(ix,x);
+	    GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
+	    r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*
+			__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	    if(hx>0) {
+		float ret = r/x;
+		return ret;
+	    } else
+		return two-r/x;
+	} else {
+	    if(hx>0) {
+		return tiny*tiny;
+	    } else
+		return two-tiny;
+	}
+}
+
+OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  //return x-y*__gen_ocl_rndz(x/y);
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  sx = hx&0x80000000;		/* sign of x */
+  hx ^=sx;		/* |x| */
+  hy &= 0x7fffffff;	/* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
+  (hy>0x7f800000))			/* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;			/* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
+
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {	/* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+
+  /* determine iy = ilogb(y) */
+  if(hy<0x00800000) {	/* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+
+  /* set up {hx,lx}, {hy,ly} and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {		/* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {		/* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)		/* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+
+    /* convert back to floating value and restore the sign */
+  if(hx==0)			/* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {		/* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy>= -126) {		/* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+   } else {		/* subnormal output */
+     n = -126 - iy;
+     hx >>= n;
+     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+     x *= one;		/* create necessary signal */
+  }
+  return x;		/* exact output */
+}
+
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*Q2);
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+
+OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+  float one	= 1.0,
+  ln2	= 6.9314718246e-01;/* 0x3f317218 */
+  float t;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  if(hx<0x3f800000) {	/* x < 1 */
+    return (x-x)/(x-x);
+  } else if(hx >=0x4d800000) {	/* x > 2**28 */
+    if(hx >=0x7f800000) {/* x is inf of NaN */
+      return x+x;
+    } else
+      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
+  } else if (hx==0x3f800000) {
+    return 0.0;			/* acosh(1) = 0 */
+  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
+    t=x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));
+  } else {			/* 1<x<2 */
+    t = x-one;
+    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
+  }
+}
+
+OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  //return native_log(x + native_sqrt(x * x + 1));
+  float one =  1.0000000000e+00, /* 0x3F800000 */
+  ln2 =  6.9314718246e-01, /* 0x3f317218 */
+  huge=  1.0000000000e+30;
+  float w;
+  int hx,ix;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix< 0x38000000) {	/* |x|<2**-14 */
+    if(huge+x>one) return x;	/* return x inexact except 0 */
+  }
+  if(ix>0x47000000) {/* |x| > 2**14 */
+    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
+    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
+  } else {
+    float xa = __gen_ocl_internal_fabs(x);
+    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
+      w = __gen_ocl_internal_log(mad(xa, 2.0f, one / (__gen_ocl_sqrt(mad(xa, xa, one)) + xa)));
+    } else {		/* 2.0 > |x| > 2**-14 */
+      float t = xa*xa;
+      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
+    }
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_sinh(float x){
+  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h;
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
+      return h*(t+t/(t+one));
+  }
+  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [log(maxdouble), overflowthresold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x));
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflowthresold, sinh(x) overflow */
+  return x*shuge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_tanh(float x) {
+  //float y = native_exp(-2 * x);
+  //return (1 - y) / (1 + y);
+  float one=1.0, two=2.0, tiny = 1.0e-30;
+  float t,z;
+  int jx,ix;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff;
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) {
+    if (jx>=0)
+      return one/x+one; /* tanh(+-inf)=+-1 */
+    else
+      return one/x-one; /* tanh(NaN) = NaN */
+  }
+
+  if (ix < 0x41b00000) { /* |x|<22 */
+    if (ix == 0)
+      return x;		/* x == +-0 */
+    if (ix<0x24000000) 	/* |x|<2**-55 */
+      return x*(one+x);    	/* tanh(small) = small */
+    if (ix>=0x3f800000) {	/* |x|>=1  */
+      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+      z = one - two/(t+two);
+    } else {
+      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+      z= -t/(t+two);
+    }
+  } else { /* |x| > 22, return +-1 */
+    z = one - tiny;		/* raised inexact flag */
+  }
+  return (jx>=0)? z: -z;
+}
+
+OVERLOADABLE float __gen_ocl_internal_cosh(float x) {
+  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
+  float halF = 0.5,
+  huge = 1.0e+30,
+  tiny = 1.0e-30,
+  one = 1.0;
+  float t,w;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;
+  /* |x| in [0,22] */
+  if (ix < 0x41b00000) {
+    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+    if(ix<0x3eb17218) {
+      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+      w = one+t;
+      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
+      return one+(t*t)/(w+w);
+    }
+    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
+    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+    return halF*t+halF/t;
+  }
+  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
+  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+  /* |x| in [log(maxdouble), overflowthresold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x));
+    t = halF*w;
+    return t*w;
+  }
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x*x;
+  /* |x| > overflowthresold, cosh(x) overflow */
+  return huge*huge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){
+  //return x-y*__gen_ocl_rnde(x/y);
+  float zero = 0.0;
+  int hx,hp;
+  unsigned sx;
+  float p_half;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hp,p);
+  sx = hx&0x80000000;
+  hp &= 0x7fffffff;
+  hx &= 0x7fffffff;
+  /* purge off exception values */
+  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
+  if((hx>=0x7f800000)||               /* x not finite */
+    ((hp>0x7f800000)))	               /* p is NaN */
+    return (x*p)/(x*p);
+  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+  if ((hx-hp)==0) return zero*x;
+  x = __gen_ocl_fabs(x);
+  p = __gen_ocl_fabs(p);
+  if (hp<0x01000000) {
+    if(x+x>p) {
+      x-=p;
+      if(x+x>=p) x -= p;
+    }
+  } else {
+    p_half = (float)0.5*p;
+    if(x>p_half) {
+      x-=p;
+      if(x>=p_half) x -= p;
+    }
+  }
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
+  return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+  x = __gen_ocl_scalbnf(x,n);
+  return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_atanh(float x) {
+  //return 0.5f * native_sqrt((1 + x) / (1 - x));
+  float xa = __gen_ocl_fabs (x);
+  float t;
+  if (isless (xa, 0.5f)){
+    if (xa < 0x1.0p-28f) return x;
+    t = xa + xa;
+    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+  } else if (isless (xa, 1.0f)){
+    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+  } else{
+    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
+    return x / 0.0f;
+  }
+  return __gen_ocl_internal_copysign(t, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){
+  float px, qx,ans;
+  short n;
+  int i;
+  float*p;
+  float MAXL10 = 38.230809449325611792;
+  float LOG210 = 3.32192809488736234787e0;
+  float LG102A = 3.00781250000000000000E-1;
+  float LG102B = 2.48745663981195213739E-4;
+  float P[6];
+  P[0] = 2.063216740311022E-001;
+  P[1] = 5.420251702225484E-001;
+  P[2] = 1.171292686296281E+000;
+  P[3] = 2.034649854009453E+000;
+  P[4] = 2.650948748208892E+000;
+  P[5] = 2.302585167056758E+000;
+
+  if( x < -MAXL10 ) return 0.0;
+
+  if( isinf(x))  return INFINITY;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+
+  /* rational approximation for exponential
+    * of the fractional part:
+    * 10**x - 1  =  2x P(x**2)/( Q(x**2) - P(x**2) )
+    */
+  p = P;
+  ans = *p++;
+  i = 5;
+  do{
+    ans = ans * x  +  *p++;
+  }
+  while( --i );
+  px = 1.0 + x * ans;
+
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+  return x;
+}
+
+OVERLOADABLE float cospi(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cospi(x);
+
+  return __gen_ocl_internal_cospi(x);
+}
+
+OVERLOADABLE float cosh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cosh(x);
+
+  return  __gen_ocl_internal_cosh(x);
+}
+
+OVERLOADABLE float acos(float x) {
+  return __gen_ocl_internal_acos(x);
+}
+
+OVERLOADABLE float acospi(float x) {
+  return __gen_ocl_internal_acospi(x);
+}
+
+OVERLOADABLE float acosh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_acosh(x);
+
+  return __gen_ocl_internal_acosh(x);
+}
+
+OVERLOADABLE float sinpi(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sinpi(x);
+
+  return __gen_ocl_internal_sinpi(x);
+}
+
+OVERLOADABLE float sinh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sinh(x);
+
+  return __gen_ocl_internal_sinh(x);
+}
+
+OVERLOADABLE float asin(float x) {
+  return __gen_ocl_internal_asin(x);
+}
+
+OVERLOADABLE float asinpi(float x) {
+  return __gen_ocl_internal_asinpi(x);
+}
+
+OVERLOADABLE float asinh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_asinh(x);
+
+  return __gen_ocl_internal_asinh(x);
+}
+
+OVERLOADABLE float tanpi(float x) {
+  return __gen_ocl_internal_tanpi(x);
+}
+
+OVERLOADABLE float tanh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_tanh(x);
+
+  return __gen_ocl_internal_tanh(x);
+}
+
+OVERLOADABLE float atan(float x) {
+  return __gen_ocl_internal_atan(x);
+}
+
+OVERLOADABLE float atan2(float y, float x) {
+  return __gen_ocl_internal_atan2(y, x);
+}
+
+OVERLOADABLE float atan2pi(float y, float x) {
+  return __gen_ocl_internal_atan2pi(y, x);
+}
+
+OVERLOADABLE float atanpi(float x) {
+  return __gen_ocl_internal_atanpi(x);
+}
+
+OVERLOADABLE float atanh(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_atanh(x);
+
+  return __gen_ocl_internal_atanh(x);
+}
+
+OVERLOADABLE float cbrt(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cbrt(x);
+
+  return __gen_ocl_internal_cbrt(x);
+}
+
+OVERLOADABLE float rint(float x) {
+  return __gen_ocl_internal_rint(x);
+}
+
+OVERLOADABLE float copysign(float x, float y) {
+  return __gen_ocl_internal_copysign(x, y);
+}
+
+OVERLOADABLE float erf(float x) {
+  return __gen_ocl_internal_erf(x);
+}
+
+OVERLOADABLE float erfc(float x) {
+  return __gen_ocl_internal_erfc(x);
+}
+
+OVERLOADABLE float fmod (float x, float y) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_fmod(x, y);
+
+  return __gen_ocl_internal_fmod(x, y);
+}
+
+OVERLOADABLE float remainder(float x, float p) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_remainder(x, p);
+
+  return __gen_ocl_internal_remainder(x, p);
+}
+
+OVERLOADABLE float ldexp(float x, int n) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_ldexp(x, n);
+
+  if (x == (float)0.0f) x = 0.0f;
+  return __gen_ocl_internal_ldexp(x, n);
+}
+
+CONST OVERLOADABLE float __gen_ocl_mad(float a, float b, float c) __asm("llvm.fma" ".f32");
+CONST OVERLOADABLE half __gen_ocl_mad(half a, half b, half c) __asm("llvm.fma" ".f16");
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+
+OVERLOADABLE float mad(float a, float b, float c) {
+  return __gen_ocl_mad(a, b, c);
+}
+
+OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmax(half a, half b) { return max(a,b); }
+OVERLOADABLE float __gen_ocl_internal_fmin(half a, half b) { return min(a,b); }
+OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a > b ? x : b > a ? y : max(x, y);
+}
+OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  float a = __gen_ocl_fabs(x), b = __gen_ocl_fabs(y);
+  return a < b ? x : b < a ? y : min(x, y);
+}
+OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  if(isnan(x))
+    return x;
+  if(isnan(y))
+    return y;
+  return x > y ? (x - y) : +0.f;
+}
+/*
+ * the pow/pown high precision implementation are copied from msun library.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,sn,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,hy,ix,iy,is;
+  float bp[2],dp_h[2],dp_l[2],
+  zero    =  0.0,
+  one	=  1.0,
+  two	=  2.0,
+  two24	=  16777216.0,	/* 0x4b800000 */
+  huge	=  1.0e30,
+  tiny    =  1.0e-30,
+  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
+  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  bp[0] = 1.0,bp[1] = 1.5,
+  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+  if (ix < 0x00800000) {	   /* x < 2**-126  */
+    ix = 0;/* Gen does not support subnormal number now */
+  }
+  if (iy < 0x00800000) {	  /* y < 2**-126  */
+    iy = 0;/* Gen does not support subnormal number now */
+  }
+   /* y==zero: x**0 = 1 */
+  if(iy==0) return one;
+  /* pow(+1, y) returns 1 for any y, even a NAN */
+  if(hx==0x3f800000) return one;
+  /* +-NaN return x+y */
+  if(ix > 0x7f800000 || iy > 0x7f800000)
+    return (x+0.0f)+y+(0.0f);
+  /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+  yisint  = 0;
+  if(hx<0) {
+    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+    else if(iy>=0x3f800000) {
+      k = (iy>>23)-0x7f;	   /* exponent */
+      j = iy>>(23-k);
+      if((j<<(23-k))==iy) yisint = 2-(j&1);
+    }
+  }
+  /* special value of y */
+  if (iy==0x7f800000) {	/* y is +-inf */
+    if (ix==0x3f800000)
+      //return  y - y;	/* inf**+-1 is NaN */
+      return one;
+    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+      return (hy>=0)? y: zero;
+    else			/* (|x|<1)**-,+inf = inf,0 */
+      return (hy<0)?-y: zero;
+  }
+  if(iy==0x3f800000) {	/* y is  +-1 */
+    if(hy<0) return one/x; else return x;
+  }
+  if(hy==0x40000000) return x*x; /* y is  2 */
+  if(hy==0x3f000000) {	/* y is  0.5 */
+    if(hx>=0)return __gen_ocl_sqrt(x);
+  }
+
+  ax   = __gen_ocl_fabs(x);
+    /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+    z = ax;			/*x is +-0,+-inf,+-1*/
+    if(hy<0) z = one/z;	/* z = (1/|x|) */
+    if(hx<0) {
+      if(((ix-0x3f800000)|yisint)==0) {
+        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+      } else if(yisint==1)
+        z = -z;		/* (x<0)**odd = -(|x|**odd) */
+    }
+    return z;
+  }
+  n = ((uint)hx>>31)-1;
+
+  /* (x<0)**(non-int) is NaN */
+  if((n|yisint)==0) return (x-x)/(x-x);
+
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+  /* |y| is huge */
+  if(iy>0x4d000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+	/* take care subnormal number */
+    //if(ix<0x00800000)
+      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+	/* determine interval */
+    ix = j|0x3f800000;		/* normalize ix */
+    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    is = ((ix>>1)&0xfffff000)|0x20000000;
+    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*L2);
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = 3.0f+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+    t_l = r-((t_h-3.0f)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+  GEN_OCL_GET_FLOAT_WORD(is,y);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xffffe000);
+  p_l = (y-y1)*t1+y*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)				/* if z > 128 */
+    return sn*huge*huge;			/* overflow */
+  else if (j==0x43000000) {			/* if z == 128 */
+    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
+    return sn*tiny*tiny;			/* underflow */
+  else if (j==0xc3160000){			/* z == -150 */
+    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
+  }
+
+  /*
+    * compute 2**(p_h+p_l)
+    */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
+    n = j+(0x00800000>>(k+1));
+    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
+    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+    n = ((n&0x007fffff)|0x00800000)>>(23-k);
+    if(j<0) n = -n;
+    p_h -= t;
+  }
+  t = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*P2);
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
+
+#define BODY \
+  if (isnan(x_abs) || isinf(x_abs)) { \
+    x_log2 = 0; \
+    return x_abs; \
+  } \
+  uint u = as_uint(x_abs); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    x_log2 = 0; \
+    return x_abs; \
+  } \
+  if (a >= 0x800000) { \
+    x_log2 = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  x_log2 = e; \
+  float x_mant = as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+
+OVERLOADABLE float tgamma (float x)
+{
+  /* based on glibc __ieee754_gammaf_r by Ulrich Drepper <drepper@cygnus.com> */
+
+  unsigned int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  if (hx == 0xff800000)
+    {
+      /* x == -Inf.  According to ISO this is NaN.  */
+      return NAN;
+    }
+  if ((hx & 0x7f800000) == 0x7f800000)
+    {
+      /* Positive infinity (return positive infinity) or NaN (return
+	 NaN).  */
+      return x;
+    }
+  if (x < 0.0f && __gen_ocl_internal_floor (x) == x)
+    {
+      /* integer x < 0 */
+      return NAN;
+    }
+
+  if (x >= 36.0f)
+    {
+      /* Overflow.  */
+      return INFINITY;
+    }
+  else if (x <= 0.0f && x >= -FLT_EPSILON / 4.0f)
+    {
+      return 1.0f / x;
+    }
+  else
+    {
+      float sinpix = __gen_ocl_internal_sinpi(x);
+      if (x <= -42.0f)
+	/* Underflow.  */
+	{return 0.0f * sinpix /*for sign*/;}
+      int exp2_adj = 0;
+      float x_abs = __gen_ocl_fabs(x);
+      float gam0;
+
+      if (x_abs < 4.0f) {
+        /* gamma = exp(lgamma) is only accurate for small lgamma */
+        float prod,x_adj;
+        if (x_abs < 0.5f) {
+          prod = 1.0f / x_abs;
+          x_adj = x_abs + 1.0f;
+        } else if (x_abs <= 1.5f) {
+          prod = 1.0f;
+          x_adj = x_abs;
+        } else if (x_abs < 2.5f) {
+          x_adj = x_abs - 1.0f;
+          prod = x_adj;
+        } else {
+          x_adj = x_abs - 2.0f;
+          prod = x_adj * (x_abs - 1.0f);
+        }
+        gam0 = __gen_ocl_internal_exp (lgamma (x_adj)) * prod;
+      }
+      else {
+        /* Compute gamma (X) using Stirling's approximation,
+  	 starting by computing pow (X, X) with a power of 2
+  	 factored out to avoid intermediate overflow.  */
+        float x_int = __gen_ocl_internal_round (x_abs);
+        float x_frac = x_abs - x_int;
+        int x_log2;
+
+        BODY
+
+        if (x_mant < M_SQRT1_2_F)
+          {
+          x_log2--;
+          x_mant *= 2.0f;
+          }
+        exp2_adj = x_log2 * (int) x_int;
+        float ret = (__gen_ocl_internal_pow(x_mant, x_abs)
+  		   * exp2 (x_log2 * x_frac)
+  		   * __gen_ocl_internal_exp (-x_abs)
+  		   * sqrt (2.0f * M_PI_F / x_abs) );
+
+        float x2 = x_abs * x_abs;
+        float bsum = (0x3.403404p-12f / x2 -0xb.60b61p-12f) / x2 + 0x1.555556p-4f;
+        gam0 = ret + ret * __gen_ocl_internal_expm1 (bsum / x_abs);
+      }
+      if (x > 0.0f) {return __gen_ocl_internal_ldexp (gam0, exp2_adj);}
+      float gam1 = M_PI_F / (-x * sinpix * gam0);
+      return __gen_ocl_internal_ldexp (gam1, -exp2_adj);
+    }
+}
+#undef BODY
+
+float __gen_ocl_internal_pown(float x, int y) {
+  const float
+  bp[] = {1.0, 1.5,},
+  dp_h[] = { 0.0, 5.84960938e-01,}, /* 0x3f15c000 */
+  dp_l[] = { 0.0, 1.56322085e-06,}, /* 0x35d1cfdc */
+  zero    =  0.0,
+  one =  1.0,
+  two =  2.0,
+  two24 =  16777216.0,  /* 0x4b800000 */
+  huge  =  1.0e30,
+  tiny    =  1.0e-30,
+    /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  0x1.62ep-1,
+  lg2_l  =  0x1.0bfbe8p-15,
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,ix,iy,is;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  iy = y > 0 ? y&0x7fffffff : (-y)&0x7fffffff;
+    /* y==zero: x**0 = 1 */
+  if(y==0) return one;
+
+    /* +-NaN return NAN */
+  if(ix > 0x7f800000)
+    return NAN;
+
+    /* determine if y is an odd int
+     * yisint = 1 ... y is an odd int
+     * yisint = 2 ... y is an even int
+     */
+    yisint = y&1 ? 1 : 2;
+
+  if (y == 1) return x;
+  if (y == -1) return one/x;
+  if (y == 2) return x*x;
+
+  ax   = __gen_ocl_fabs(x);
+
+   /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+      z = ax;     /*x is +-0,+-inf,+-1*/
+      if(y<0) z = one/z; /* z = (1/|x|) */
+      if(hx<0) {
+      if(yisint==1)
+        z = -z;   /* (x<0)**odd = -(|x|**odd) */
+      }
+      return z;
+  }
+
+  float sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if(((((unsigned)hx>>31)-1)|(yisint-1))==0)
+      sn = -one; /* (-ve)**(odd int) */
+
+    /* |y| is huge */
+  if(iy>0x08000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+      if(ix<0x3f7ffff8) return (y<0)? sn*huge*huge:tiny*tiny;
+      if(ix>0x3f800007) return (y>0)? sn*huge*huge:tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+     log(x) by x-x^2/2+x^3/3-x^4/4 */
+      t = ax-1;   /* t has 20 trailing zeros */
+      w = (t*t)*((float)0.5-t*((float)0.333333333333-t*(float)0.25));
+      u = ivln2_h*t;  /* ivln2_h has 16 sig. bits */
+      v = t*ivln2_l-w*ivln2;
+      t1 = u+v;
+      GEN_OCL_GET_FLOAT_WORD(is,t1);
+      GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+      t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+    /* take care subnormal number */
+//      if(ix<0x00800000)
+//    {ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+    /* determine interval */
+    ix = j|0x3f800000;    /* normalize ix */
+    if(j<=0x1cc471) k=0;  /* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;  /* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+    /* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];   /* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+
+    /* t_h=ax+bp[k] High */
+    GEN_OCL_SET_FLOAT_WORD(t_h, (((ix>>1)|0x20000000)+0x00400000+(k<<21)) &0xfffff000);
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+
+
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*L2);
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = (float)3.0+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xffffe000);
+    t_l = r-((t_h-(float)3.0)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xffffe000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;   /* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xffffe000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2+y3 and compute (y1+y2+y3)*(t1+t2) */
+
+  float fy = (float)y;
+  float y3 = (float)(y-(int)fy);
+  GEN_OCL_GET_FLOAT_WORD(is,fy);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+
+  p_l = (fy-y1)*t1 + y3*t1 + fy*t2 + y3*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)       /* if z > 128 */
+      return sn*huge*huge;       /* overflow */
+  else if (j==0x43000000) {     /* if z == 128 */
+      if(p_l+ovt>z-p_h) return sn*huge*huge; /* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)   /* z <= -150 */
+      return sn*tiny*tiny;       /* underflow */
+  else if (j==0xc3160000){      /* z == -150 */
+      if(p_l<=z-p_h) return sn*tiny*tiny;    /* underflow */
+  }
+    /*
+     * compute 2**(p_h+p_l)
+     */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {    /* if |z| > 0.5, set n = [z+0.5] */
+      n = j+(0x00800000>>(k+1));
+      k = ((n&0x7fffffff)>>23)-0x7f;  /* new k for n */
+      GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+      n = ((n&0x007fffff)|0x00800000)>>(23-k);
+      if(j<0) n = -n;
+      p_h -= t;
+
+      z -= n;
+  }
+
+  t = z;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xfffff000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*P2);
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23);
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);  /* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z;
+}
+
+#define BODY \
+  if (isnan(x) || isinf(x)) { \
+    *exp = 0; \
+    return x; \
+  } \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; \
+  if (a == 0) { \
+    *exp = 0; \
+    return x; \
+  } \
+  if (a >= 0x800000) { \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
+  } \
+  int e = -126; \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+float __gen_ocl_internal_frexp(float x, int *exp) { BODY; }
+
+OVERLOADABLE float hypot(float x, float y) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_hypot(x, y);
+
+  //return __gen_ocl_sqrt(x*x + y*y);
+  float a,b,an,bn,cn;
+  int e;
+  if (isfinite (x) && isfinite (y)){      /* Determine absolute values.  */
+  x = __gen_ocl_fabs (x);
+  y = __gen_ocl_fabs (y);
+  /* Find the bigger and the smaller one.  */
+  a = max(x,y);
+  b = min(x,y);
+  /* Now 0 <= b <= a.  */
+  /* Write a = an * 2^e, b = bn * 2^e with 0 <= bn <= an < 1.  */
+  an = __gen_ocl_internal_frexp (a, &e);
+  bn = ldexp (b, - e);
+  /* Through the normalization, no unneeded overflow or underflow will occur here.  */
+  cn = __gen_ocl_sqrt (an * an + bn * bn);
+  return ldexp (cn, e);
+  }else{
+    if (isinf (x) || isinf (y))  /* x or y is infinite.  Return +Infinity.  */
+      return INFINITY;
+    else        /* x or y is NaN.  Return NaN.  */
+      return x + y;
+  }
+}
+
+OVERLOADABLE float powr(float x, float y) {
+  unsigned int hx, sx, hy, sy;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_pow(x,y);
+  else {
+    if (isnan(x) || isnan(y)) return NAN;
+    GEN_OCL_GET_FLOAT_WORD(hx,x);
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    sx = (hx & 0x80000000) >> 31;
+    sy = (hy & 0x80000000) >> 31;
+
+    if ((hx&0x7fffffff) < 0x00800000) {	   /* x < 2**-126  */
+      x = 0.0f;/* Gen does not support subnormal number now */
+      hx = hx &0x80000000;
+    }
+    if ((hy&0x7fffffff) < 0x00800000) {	  /* y < 2**-126  */
+      y = 0.0;/* Gen does not support subnormal number now */
+      hy = hy &0x80000000;
+    }
+
+    // (x < 0) ** y = NAN (y!=0)
+    if ((sx && (hx & 0x7fffffff))) return NAN;
+
+    // +/-0 ** +/-0 = NAN
+    if ( !(hx&0x7fffffff) && !(hy&0x7fffffff)) return NAN;
+
+    // +inf ** +/-0 = NAN
+    if ( ((hx & 0x7f800000) ==0x7f800000) && !(hy&0x7fffffff)) return NAN;
+
+    // others except nan/inf/0 ** 0 = 1.0
+    if (!(hy&0x7fffffff)) return 1.0f;
+
+    // +1 ** inf = NAN; +1 ** finite = 1;
+    if (hx == 0x3f800000) {
+      return isinf(y) ? NAN : 1.0f;
+    }
+
+    if ( !(hx & 0x7fffffff)) {
+        // +/-0 ** y<0 = +inf
+        // +/-0 ** y>0 = +0
+      return sy ? INFINITY : 0.0f;
+    }
+
+    return __gen_ocl_internal_pow(x,y);
+  }
+}
+
+OVERLOADABLE float pown(float x, int n) {
+  if (__ocl_math_fastpath_flag) {
+    if (x == 0.f && n == 0)
+      return 1.f;
+    if (x < 0.f && (n&1) )
+      return -powr(-x, n);
+    return powr(x, n);
+  } else {
+    int ix;
+    GEN_OCL_GET_FLOAT_WORD(ix, x);
+    float sign = ix < 0 ? -1.0f : 1.0f;
+    if (x == 0.0f) x = sign * 0.0f;
+
+    return __gen_ocl_internal_pown(x, n);
+  }
+}
+
+OVERLOADABLE float pow(float x, float y) {
+  if (!__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_pow(x,y);
+  else {
+    int n;
+    if (x == 0.f && y == 0.f)
+      return 1.f;
+    if (x >= 0.f)
+      return powr(x, y);
+    n = y;
+    if ((float)n == y)//is exact integer
+      return pown(x, n);
+    return NAN;
+  }
+}
+
+OVERLOADABLE float rootn(float x, int n) {
+  float ax,re;
+  int sign = 0;
+  int hx;
+  if( n == 0 )return NAN;
+
+  GEN_OCL_GET_FLOAT_WORD(hx, x);
+  // Gen does not support denorm, flush to zero
+  if ((hx & 0x7fffffff) < 0x00800000) {
+    x = hx < 0 ? -0.0f : 0.0f;
+  }
+
+  //rootn ( x, n )  returns a NaN for x < 0 and n is even.
+  if( x < 0 && 0 == (n&1) )
+    return NAN;
+  if( x == 0.0 ){
+    switch( n & 0x80000001 ){
+      //rootn ( +-0,  n ) is +0 for even n > 0.
+      case 0:
+        return 0.0f;
+      //rootn ( +-0,  n ) is +-0 for odd n > 0.
+      case 1:
+        return x;
+      //rootn ( +-0,  n ) is +inf for even n < 0.
+      case 0x80000000:
+        return INFINITY;
+
+      //rootn ( +-0,  n ) is +-inf for odd n < 0.
+      case 0x80000001:
+        return __gen_ocl_internal_copysign(INFINITY, x);
+    }
+  }
+  ax = __gen_ocl_fabs(x);
+  if(x <0.0f && (n&1))
+    sign = 1;
+  if (__ocl_math_fastpath_flag)
+    re = __gen_ocl_pow(ax, 1.f/n);
+  else
+    re = __gen_ocl_internal_pow(ax,1.f/n);
+  if(sign)
+    re = -re;
+  return re;
+}
+
+OVERLOADABLE float fabs(float x) {
+  return __gen_ocl_internal_fabs(x);
+}
+
+OVERLOADABLE float trunc(float x) {
+  return  __gen_ocl_internal_trunc(x);
+}
+
+OVERLOADABLE float round(float x) {
+  return __gen_ocl_internal_round(x);
+}
+
+OVERLOADABLE float floor(float x) {
+  return __gen_ocl_internal_floor(x);
+}
+
+OVERLOADABLE float ceil(float x) {
+  return __gen_ocl_internal_ceil(x);
+}
+
+OVERLOADABLE float log(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log(x);
+
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log(x);
+
+  return  __gen_ocl_internal_log(x);
+}
+
+OVERLOADABLE float log2(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log2(x);
+
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log2(x);
+
+  return  __gen_ocl_internal_log2(x);
+}
+
+OVERLOADABLE float log10(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log10(x);
+
+  /* Use native instruction when it has enough precision */
+  if((x > 0x1.1p0) || (x <= 0))
+    return __gen_ocl_internal_fastpath_log10(x);
+
+  return  __gen_ocl_internal_log10(x);
+}
+
+OVERLOADABLE float exp(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_exp(x);
+
+  /* Use native instruction when it has enough precision */
+  if (x > -0x1.6p1 && x < 0x1.6p1)
+    return __gen_ocl_internal_fastpath_exp(x);
+
+  return  __gen_ocl_internal_exp(x);
+}
+
+OVERLOADABLE float exp2(float x) {
+  /* Use native instruction when it has enough precision, exp2 always */
+  return native_exp2(x);
+}
+
+OVERLOADABLE float exp10(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_exp10(x);
+
+  return  __gen_ocl_internal_exp10(x);
+}
+
+OVERLOADABLE float expm1(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_expm1(x);
+
+  return  __gen_ocl_internal_expm1(x);
+}
+
+OVERLOADABLE float fmin(float a, float b) {
+  return __gen_ocl_internal_fmin(a, b);
+}
+
+OVERLOADABLE float fmax(float a, float b) {
+  return __gen_ocl_internal_fmax(a, b);
+}
+
+OVERLOADABLE float fma(float a, float b, float c) {
+  return mad(a, b, c);
+}
+
+OVERLOADABLE float fdim(float x, float y) {
+  return __gen_ocl_internal_fdim(x, y);
+}
+
+OVERLOADABLE float maxmag(float x, float y) {
+  return __gen_ocl_internal_maxmag(x, y);
+}
+
+OVERLOADABLE float minmag(float x, float y) {
+  return __gen_ocl_internal_minmag(x, y);
+}
+
+OVERLOADABLE float nextafter(float x, float y) {
+  int hx, hy, ix, iy;
+  hx = as_int(x);
+  hy = as_int(y);
+  ix = hx & 0x7fffffff;
+  iy = hy & 0x7fffffff;
+  if(ix == 0)
+    ix = hx & 0x7fffff;
+  if(iy == 0)
+    iy = hy & 0x7fffff;
+  if(ix>0x7f800000 || iy>0x7f800000)
+    return x+y;
+  if(hx == hy)
+    return y;
+  if(ix == 0) {
+    if(iy == 0)
+      return y;
+    else
+      return as_float((hy&0x80000000) | 1);
+  }
+  if(hx >= 0) {
+    if(hx > hy) {
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  } else {
+    if(hy >= 0 || hx > hy){
+      hx -= 1;
+    } else {
+      hx += 1;
+    }
+  }
+  return as_float(hx);
+}
+
+/* So far, the HW do not support half float math function.
+   We just do the conversion and call the float version here. */
+OVERLOADABLE half cospi(half x) {
+  float _x = (float)x;
+  return (half)cospi(_x);
+}
+OVERLOADABLE half cosh(half x) {
+  float _x = (float)x;
+  return (half)cosh(_x);
+}
+OVERLOADABLE half acos(half x) {
+  float _x = (float)x;
+  return (half)acos(_x);
+}
+OVERLOADABLE float half_cos(float x) {
+  return (float)cos(x);
+}
+OVERLOADABLE float half_divide(float x, float y) {
+  return (float)native_divide(x, y);
+}
+OVERLOADABLE float half_exp(float x) {
+  return (float)native_exp(x);
+}
+OVERLOADABLE float half_exp2(float x){
+  return (float)native_exp2(x);
+}
+OVERLOADABLE float half_exp10(float x){
+  return (float)native_exp10(x);
+}
+OVERLOADABLE float half_log(float x){
+  return (float)native_log(x);
+}
+OVERLOADABLE float half_log2(float x){
+  return (float)native_log2(x);
+}
+OVERLOADABLE float half_log10(float x){
+  return (float)native_log10(x);
+}
+OVERLOADABLE float half_powr(float x, float y){
+  return (float)powr(x, y);
+}
+OVERLOADABLE float half_recip(float x){
+  return (float)native_recip(x);
+}
+OVERLOADABLE float half_rsqrt(float x){
+  return (float)native_rsqrt(x);
+}
+OVERLOADABLE float half_sin(float x){
+  return (float)sin(x);
+}
+OVERLOADABLE float half_sqrt(float x){
+  return (float)native_sqrt(x);
+}
+OVERLOADABLE float half_tan(float x){
+  return (float)tan(x);
+}
+OVERLOADABLE half acospi(half x) {
+  float _x = (float)x;
+  return (half)acospi(_x);
+}
+OVERLOADABLE half acosh(half x) {
+  float _x = (float)x;
+  return (half)acosh(_x);
+}
+OVERLOADABLE half sinpi(half x) {
+  float _x = (float)x;
+  return (half)sinpi(_x);
+}
+OVERLOADABLE half sinh(half x) {
+  float _x = (float)x;
+  return (half)sinh(_x);
+}
+OVERLOADABLE half asin(half x) {
+  float _x = (float)x;
+  return (half)asin(_x);
+}
+OVERLOADABLE half asinpi(half x) {
+  float _x = (float)x;
+  return (half)asinpi(_x);
+}
+OVERLOADABLE half asinh(half x) {
+  float _x = (float)x;
+  return (half)asinh(_x);
+}
+OVERLOADABLE half tanpi(half x) {
+  float _x = (float)x;
+  return (half)tanpi(_x);
+}
+OVERLOADABLE half tanh(half x) {
+  float _x = (float)x;
+  return (half)tanh(_x);
+}
+OVERLOADABLE half atan(half x) {
+  float _x = (float)x;
+  return (half)atan(_x);
+}
+OVERLOADABLE half atan2(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)atan2(_x, _y);
+}
+OVERLOADABLE half atan2pi(half y, half x) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)atan2pi(_x, _y);
+}
+OVERLOADABLE half atanpi(half x) {
+  float _x = (float)x;
+  return (half)atanpi(_x);
+}
+OVERLOADABLE half atanh(half x) {
+  float _x = (float)x;
+  return (half)atanh(_x);
+}
+OVERLOADABLE half cbrt(half x) {
+  float _x = (float)x;
+  return (half)cbrt(_x);
+}
+OVERLOADABLE half rint(half x) {
+  float _x = (float)x;
+  return (half)rint(_x);
+}
+OVERLOADABLE half copysign(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)copysign(_x, _y);
+}
+OVERLOADABLE half erf(half x) {
+  float _x = (float)x;
+  return (half)erf(_x);
+}
+OVERLOADABLE half erfc(half x) {
+  float _x = (float)x;
+  return (half)erfc(_x);
+}
+OVERLOADABLE half fmod(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fmod(_x, _y);
+}
+OVERLOADABLE half remainder(half x, half p) {
+  float _x = (float)x;
+  float _p = (float)p;
+  return (half)remainder(_x, _p);
+}
+OVERLOADABLE half ldexp(half x, int n) {
+  float _x = (float)x;
+  return (half)ldexp(_x, n);
+}
+OVERLOADABLE half powr(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)powr(_x, _y);
+}
+OVERLOADABLE half pow(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)pow(_x, _y);
+}
+//no pow, we use powr instead
+OVERLOADABLE half fabs(half x) {
+  float _x = (float)x;
+  return (half)fabs(_x);
+}
+OVERLOADABLE half trunc(half x) {
+  float _x = (float)x;
+  return (half)trunc(_x);
+}
+OVERLOADABLE half round(half x) {
+  float _x = (float)x;
+  return (half)round(_x);
+}
+OVERLOADABLE half floor(half x) {
+  float _x = (float)x;
+  return (half)floor(_x);
+}
+OVERLOADABLE half ceil(half x) {
+  float _x = (float)x;
+  return (half)ceil(_x);
+}
+OVERLOADABLE half log(half x) {
+  float _x = (float)x;
+  return (half)log(_x);
+}
+OVERLOADABLE half log2(half x) {
+  float _x = (float)x;
+  return (half)log2(_x);
+}
+OVERLOADABLE half log10(half x) {
+  float _x = (float)x;
+  return (half)log10(_x);
+}
+OVERLOADABLE half exp(half x) {
+  float _x = (float)x;
+  return (half)exp(_x);
+}
+OVERLOADABLE half exp10(half x) {
+  float _x = (float)x;
+  return (half)exp10(_x);
+}
+OVERLOADABLE half expm1(half x) {
+  float _x = (float)x;
+  return (half)expm1(_x);
+}
+OVERLOADABLE half fmin(half a, half b) {
+  return __gen_ocl_internal_fmin(a, b);
+}
+OVERLOADABLE half fmax(half a, half b) {
+  return __gen_ocl_internal_fmax(a, b);
+}
+OVERLOADABLE half fma(half a, half b, half c) {
+  float _a = (float)a;
+  float _b = (float)b;
+  float _c = (float)c;
+  return (half)fma(_a, _b, _c);
+}
+OVERLOADABLE half fdim(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)fdim(_x, _y);
+}
+OVERLOADABLE half maxmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)maxmag(_x, _y);
+}
+OVERLOADABLE half minmag(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)minmag(_x, _y);
+}
+OVERLOADABLE half exp2(half x) {
+  float _x = (float)x;
+  return (half)exp2(_x);
+}
+OVERLOADABLE half mad(half a, half b, half c) {
+  return __gen_ocl_mad(a,b,c);
+}
+OVERLOADABLE half sin(half x) {
+  float _x = (float)x;
+  return (half)sin(_x);
+}
+OVERLOADABLE half cos(half x) {
+  float _x = (float)x;
+  return (half)cos(_x);
+}
+OVERLOADABLE half tan(half x) {
+  float _x = (float)x;
+  return (half)tan(_x);
+}
+OVERLOADABLE half tgamma(half x) {
+  float _x = (float)x;
+  return (half)tgamma(_x);
+}
+OVERLOADABLE half lgamma(half x) {
+  float _x = (float)x;
+  return (half)lgamma(_x);
+}
+
+OVERLOADABLE half log1p(half x) {
+  float _x = (float)x;
+  return (half)log1p(_x);
+}
+OVERLOADABLE half logb(half x) {
+  float _x = (float)x;
+  return (half)logb(_x);
+}
+OVERLOADABLE int ilogb(half x) {
+  float _x = (float)x;
+  return ilogb(_x);
+}
+OVERLOADABLE half nan(ushort code) {
+  return (half)NAN;
+}
+
+OVERLOADABLE half sqrt(half x) {
+  float _x = (float)x;
+  return (half)sqrt(_x);
+}
+OVERLOADABLE half rsqrt(half x) {
+  float _x = (float)x;
+  return (half)rsqrt(_x);
+}
+
+OVERLOADABLE half nextafter(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)nextafter(_x, _y);
+}
+
+OVERLOADABLE half hypot(half x, half y) {
+  float _x = (float)x;
+  float _y = (float)y;
+  return (half)hypot(_x, _y);
+}
+
+OVERLOADABLE half pown(half x, int n) {
+  float _x = (float)x;
+  return (half)pown(_x, n);
+}
+OVERLOADABLE half rootn(half x, int n) {
+  float _x = (float)x;
+  return (half)rootn(_x, n);
+}
+
 INLINE int  __HI(double x){
     long x64 = as_long(x);
     int high = convert_int((x64 >> 32) & 0xFFFFFFFF);
diff --git a/backend/src/libocl/tmpl/ocl_math_common.tmpl.h b/backend/src/libocl/tmpl/ocl_math_common.tmpl.h
index 3a9c5e6c..112b4384 100644
--- a/backend/src/libocl/tmpl/ocl_math_common.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_math_common.tmpl.h
@@ -20,6 +20,185 @@
 
 #include "ocl_types.h"
 
+OVERLOADABLE float cospi(float x);
+OVERLOADABLE float cosh(float x);
+OVERLOADABLE float acos(float x);
+OVERLOADABLE float acospi(float x);
+OVERLOADABLE float acosh(float x);
+OVERLOADABLE float sinpi(float x);
+OVERLOADABLE float sinh(float x);
+OVERLOADABLE float asin(float x);
+OVERLOADABLE float asinpi(float x);
+OVERLOADABLE float asinh(float x);
+OVERLOADABLE float tanpi(float x);
+OVERLOADABLE float tanh(float x);
+OVERLOADABLE float atan(float x);
+OVERLOADABLE float atan2(float y, float x);
+OVERLOADABLE float atan2pi(float y, float x);
+OVERLOADABLE float atanpi(float x);
+OVERLOADABLE float atanh(float x);
+OVERLOADABLE float cbrt(float x);
+OVERLOADABLE float rint(float x);
+OVERLOADABLE float copysign(float x, float y);
+OVERLOADABLE float erf(float x);
+OVERLOADABLE float erfc(float x);
+OVERLOADABLE float fmod (float x, float y);
+OVERLOADABLE float remainder(float x, float p);
+OVERLOADABLE float ldexp(float x, int n);
+OVERLOADABLE float powr(float x, float y);
+OVERLOADABLE float pow(float x, float y);
+//no pow, we use powr instead
+OVERLOADABLE float fabs(float x);
+OVERLOADABLE float trunc(float x);
+OVERLOADABLE float round(float x);
+OVERLOADABLE float floor(float x);
+OVERLOADABLE float ceil(float x);
+OVERLOADABLE float log(float x);
+OVERLOADABLE float log2(float x);
+OVERLOADABLE float log10(float x);
+OVERLOADABLE float exp(float x);
+OVERLOADABLE float exp10(float x);
+OVERLOADABLE float expm1(float x);
+OVERLOADABLE float fmin(float a, float b);
+OVERLOADABLE float fmax(float a, float b);
+OVERLOADABLE float fma(float a, float b, float c);
+OVERLOADABLE float fdim(float x, float y);
+OVERLOADABLE float maxmag(float x, float y);
+OVERLOADABLE float minmag(float x, float y);
+OVERLOADABLE float exp2(float x);
+OVERLOADABLE float mad(float a, float b, float c);
+OVERLOADABLE float sin(float x);
+OVERLOADABLE float cos(float x);
+OVERLOADABLE float tan(float x);
+OVERLOADABLE float tgamma(float x);
+OVERLOADABLE float lgamma(float x);
+
+OVERLOADABLE float log1p(float x);
+OVERLOADABLE float logb(float x);
+OVERLOADABLE int ilogb(float x);
+OVERLOADABLE float nan(uint code);
+OVERLOADABLE float sqrt(float x);
+OVERLOADABLE float rsqrt(float x);
+OVERLOADABLE float hypot(float x, float y);
+OVERLOADABLE float nextafter(float x, float y);
+OVERLOADABLE float pown(float x, int n);
+OVERLOADABLE float rootn(float x, int n);
+
+// native
+OVERLOADABLE float native_cos(float x);
+OVERLOADABLE float native_divide(float x, float y);
+OVERLOADABLE float native_exp(float x);
+OVERLOADABLE float native_exp2(float x);
+OVERLOADABLE float native_exp10(float x);
+OVERLOADABLE float native_log(float x);
+OVERLOADABLE float native_log2(float x);
+OVERLOADABLE float native_log10(float x);
+OVERLOADABLE float native_powr(float x, float y);
+OVERLOADABLE float native_recip(float x);
+OVERLOADABLE float native_rsqrt(float x);
+OVERLOADABLE float native_sin(float x);
+OVERLOADABLE float native_sqrt(float x);
+OVERLOADABLE float native_tan(float x);
+
+
+// Half float version.
+OVERLOADABLE half cospi(half x);
+OVERLOADABLE half cosh(half x);
+OVERLOADABLE half acos(half x);
+OVERLOADABLE half acospi(half x);
+OVERLOADABLE half acosh(half x);
+OVERLOADABLE half sinpi(half x);
+OVERLOADABLE half sinh(half x);
+OVERLOADABLE half asin(half x);
+OVERLOADABLE half asinpi(half x);
+OVERLOADABLE half asinh(half x);
+OVERLOADABLE half tanpi(half x);
+OVERLOADABLE half tanh(half x);
+OVERLOADABLE half atan(half x);
+OVERLOADABLE half atan2(half y, half x);
+OVERLOADABLE half atan2pi(half y, half x);
+OVERLOADABLE half atanpi(half x);
+OVERLOADABLE half atanh(half x);
+OVERLOADABLE half cbrt(half x);
+OVERLOADABLE half rint(half x);
+OVERLOADABLE half copysign(half x, half y);
+OVERLOADABLE half erf(half x);
+OVERLOADABLE half erfc(half x);
+OVERLOADABLE half fmod (half x, half y);
+OVERLOADABLE half remainder(half x, half p);
+OVERLOADABLE half ldexp(half x, int n);
+OVERLOADABLE half powr(half x, half y);
+OVERLOADABLE half pow(half x, half y);
+//no pow, we use powr instead
+OVERLOADABLE half fabs(half x);
+OVERLOADABLE half trunc(half x);
+OVERLOADABLE half round(half x);
+OVERLOADABLE half floor(half x);
+OVERLOADABLE half ceil(half x);
+OVERLOADABLE half log(half x);
+OVERLOADABLE half log2(half x);
+OVERLOADABLE half log10(half x);
+OVERLOADABLE half exp(half x);
+OVERLOADABLE half exp10(half x);
+OVERLOADABLE half expm1(half x);
+OVERLOADABLE half fmin(half a, half b);
+OVERLOADABLE half fmax(half a, half b);
+OVERLOADABLE half fma(half a, half b, half c);
+OVERLOADABLE half fdim(half x, half y);
+OVERLOADABLE half maxmag(half x, half y);
+OVERLOADABLE half minmag(half x, half y);
+OVERLOADABLE half exp2(half x);
+OVERLOADABLE half mad(half a, half b, half c);
+OVERLOADABLE half sin(half x);
+OVERLOADABLE half cos(half x);
+OVERLOADABLE half tan(half x);
+OVERLOADABLE half tgamma(half x);
+OVERLOADABLE half lgamma(half x);
+OVERLOADABLE half log1p(half x);
+OVERLOADABLE half logb(half x);
+OVERLOADABLE int ilogb(half x);
+OVERLOADABLE half nan(ushort code);
+OVERLOADABLE half pown(half x, int n);
+OVERLOADABLE half rootn(half x, int n);
+OVERLOADABLE half hypot(half x, half y);
+OVERLOADABLE half nextafter(half x, half y);
+OVERLOADABLE half sqrt(half x);
+OVERLOADABLE half rsqrt(half x);
+
+
+// native half
+OVERLOADABLE half native_cos(half x);
+OVERLOADABLE half native_divide(half x, half y);
+OVERLOADABLE half native_exp(half x);
+OVERLOADABLE half native_exp2(half x);
+OVERLOADABLE half native_exp10(half x);
+OVERLOADABLE half native_log(half x);
+OVERLOADABLE half native_log2(half x);
+OVERLOADABLE half native_log10(half x);
+OVERLOADABLE half native_powr(half x, half y);
+OVERLOADABLE half native_recip(half x);
+OVERLOADABLE half native_rsqrt(half x);
+OVERLOADABLE half native_sin(half x);
+OVERLOADABLE half native_sqrt(half x);
+OVERLOADABLE half native_tan(half x);
+
+// half accuracy
+OVERLOADABLE float half_cos(float x);
+OVERLOADABLE float half_divide(float x, float y);
+OVERLOADABLE float half_exp(float x);
+OVERLOADABLE float half_exp2(float x);
+OVERLOADABLE float half_exp10(float x);
+OVERLOADABLE float half_log(float x);
+OVERLOADABLE float half_log2(float x);
+OVERLOADABLE float half_log10(float x);
+OVERLOADABLE float half_powr(float x, float y);
+OVERLOADABLE float half_recip(float x);
+OVERLOADABLE float half_rsqrt(float x);
+OVERLOADABLE float half_sin(float x);
+OVERLOADABLE float half_sqrt(float x);
+OVERLOADABLE float half_tan(float x);
+
+
 OVERLOADABLE double acos(double x);
 OVERLOADABLE double acospi(double x);
 OVERLOADABLE double acosh(double x);
author	rander.wang <rander.wang@intel.com>	2017-05-15 14:11:18 +0800
committer	Yang Rong <rong.r.yang@intel.com>	2017-05-17 18:09:25 +0800
commit	acd53cffa43da9f92b7e656f1314edc0b4044c0b (patch)
tree	ed63012fbb24a36e521d98ada4a2ce4ddc721d34
parent	98eeab036fc30a036d92bd0e3b63d093aae002c8 (diff)