diff options
author | Luo Xionghu <xionghu.luo@intel.com> | 2015-01-28 11:49:49 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2015-01-28 12:44:43 +0800 |
commit | d0ade586e3cdbf2c8f1923872c93b3b6b07b897b (patch) | |
tree | 1a4ae74f539c095a2aef7b3f3a3f0fb4c75de8bb | |
parent | 3fe24b30395224b904ba926ed71d9898135ef181 (diff) |
libocl: reimplement clz with lzd instruction instead of fbh.
The fbh-based implementation is inefficient.
v2: call the standard LLVM intrinsic llvm.ctlz instead of using beignet's
non-standard intrinsic call style; remove the non-standard clz call path.
v3: lower the qword call to two dword calls, as the Gen platform does not
support a native qword lzd instruction.
Signed-off-by: Luo Xionghu <xionghu.luo@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
-rw-r--r-- | backend/src/libocl/CMakeLists.txt | 2 | ||||
-rw-r--r-- | backend/src/libocl/src/ocl_clz.ll | 62 | ||||
-rw-r--r-- | backend/src/libocl/tmpl/ocl_integer.tmpl.cl | 78 | ||||
-rw-r--r-- | backend/src/libocl/tmpl/ocl_integer.tmpl.h | 9 |
4 files changed, 85 insertions, 66 deletions
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt index 314d373a..16f00ee3 100644 --- a/backend/src/libocl/CMakeLists.txt +++ b/backend/src/libocl/CMakeLists.txt @@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M) ) ENDMACRO(ADD_LL_TO_BC_TARGET) -SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset) +SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz) FOREACH(f ${OCL_LL_MODULES}) COPY_THE_LL(${f}) ADD_LL_TO_BC_TARGET(${f}) diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll new file mode 100644 index 00000000..a274cdef --- /dev/null +++ b/backend/src/libocl/src/ocl_clz.ll @@ -0,0 +1,62 @@ +declare i8 @llvm.ctlz.i8(i8, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i64 @llvm.ctlz.i64(i64, i1) + +define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0) + ret i8 %call +} + +define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline { + %call = call i8 @llvm.ctlz.i8(i8 %x, i1 0) + ret i8 %call +} + +define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0) + ret i16 %call +} + +define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline { + %call = call i16 @llvm.ctlz.i16(i16 %x, i1 0) + ret i16 %call +} + +define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0) + ret i32 %call +} + +define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline { + %call = call i32 @llvm.ctlz.i32(i32 %x, i1 0) + ret i32 %call +} + +define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline { + %1 = bitcast i64 %x to <2 x i32> + %2 = extractelement <2 x i32> %1, i32 0 + %3 = extractelement <2 x i32> %1, i32 1 + %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0) + %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0) + %cmp = icmp ult i32 %call2, 32 + %4 = add i32 %call1, 32 + %5 = select i1 %cmp, i32 %call2, i32 %4 + %6 = insertelement <2 x i32> 
undef, i32 %5, i32 0 + %call = bitcast <2 x i32> %6 to i64 + ret i64 %call +} + +define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline { + %1 = bitcast i64 %x to <2 x i32> + %2 = extractelement <2 x i32> %1, i32 0 + %3 = extractelement <2 x i32> %1, i32 1 + %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0) + %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0) + %cmp = icmp ult i32 %call2, 32 + %4 = add i32 %call1, 32 + %5 = select i1 %cmp, i32 %call2, i32 %4 + %6 = insertelement <2 x i32> undef, i32 %5, i32 0 + %call = bitcast <2 x i32> %6 to i64 + ret i64 %call +} diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl index 6da0bab2..a5e1dbc1 100644 --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl @@ -19,6 +19,8 @@ PURE CONST uint __gen_ocl_fbh(uint); PURE CONST uint __gen_ocl_fbl(uint); + + PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort); @@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar); PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char); -OVERLOADABLE char clz(char x) { - if (x < 0) - return 0; - if (x == 0) - return 8; - return __gen_ocl_fbh(x) - 24; -} - -OVERLOADABLE uchar clz(uchar x) { - if (x == 0) - return 8; - return __gen_ocl_fbh(x) - 24; -} - -OVERLOADABLE short clz(short x) { - if (x < 0) - return 0; - if (x == 0) - return 16; - return __gen_ocl_fbh(x) - 16; -} - -OVERLOADABLE ushort clz(ushort x) { - if (x == 0) - return 16; - return __gen_ocl_fbh(x) - 16; -} - -OVERLOADABLE int clz(int x) { - if (x < 0) - return 0; - if (x == 0) - return 32; - return __gen_ocl_fbh(x); -} - -OVERLOADABLE uint clz(uint x) { - if (x == 0) - return 32; - return __gen_ocl_fbh(x); -} - -OVERLOADABLE long clz(long x) { - union { int i[2]; long x; } u; - u.x = x; - if (u.i[1] & 0x80000000u) - return 0; - 
if (u.i[1] == 0 && u.i[0] == 0) - return 64; - uint v = clz(u.i[1]); - if(v == 32) - v += clz(u.i[0]); - return v; -} - -OVERLOADABLE ulong clz(ulong x) { - if (x == 0) - return 64; - union { uint i[2]; ulong x; } u; - u.x = x; - uint v = clz(u.i[1]); - if(v == 32) - v += clz(u.i[0]); - return v; -} +#define SDEF(TYPE, TYPE_NAME, SIZE) \ +OVERLOADABLE TYPE clz(TYPE x){ return clz_##TYPE_NAME##SIZE(x);} +SDEF(char, s, 8); +SDEF(uchar, u, 8); +SDEF(short, s, 16); +SDEF(ushort, u, 16); +SDEF(int, s, 32); +SDEF(uint, u, 32); +SDEF(long, s, 64); +SDEF(ulong, u, 64); +#undef SDEF #define SDEF(TYPE) \ OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);} diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h index f067b8da..4b3b5ae3 100644 --- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h @@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x); OVERLOADABLE long clz(long x); OVERLOADABLE ulong clz(ulong x); +char clz_s8(char); +uchar clz_u8(uchar); +short clz_s16(short); +ushort clz_u16(ushort); +int clz_s32(int); +uint clz_u32(uint); +long clz_s64(long); +ulong clz_u64(ulong); + OVERLOADABLE char popcount(char x); OVERLOADABLE uchar popcount(uchar x); OVERLOADABLE short popcount(short x); |