summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuo Xionghu <xionghu.luo@intel.com>2015-01-28 11:49:49 +0800
committerZhigang Gong <zhigang.gong@intel.com>2015-01-28 12:44:43 +0800
commitd0ade586e3cdbf2c8f1923872c93b3b6b07b897b (patch)
tree1a4ae74f539c095a2aef7b3f3a3f0fb4c75de8bb
parent3fe24b30395224b904ba926ed71d9898135ef181 (diff)
libocl: reimplement clz with lzd instruction instead of fbh.
The fbh style is inefficient. v2: use llvm.ctlz to call the LLVM intrinsic instead of the beignet non-standard intrinsic call style; remove the non-standard clz call path. v3: lower the qword call to two dword calls, as the Gen platform does not support a native qword lzd instruction. Signed-off-by: Luo Xionghu <xionghu.luo@intel.com> Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
-rw-r--r--backend/src/libocl/CMakeLists.txt2
-rw-r--r--backend/src/libocl/src/ocl_clz.ll62
-rw-r--r--backend/src/libocl/tmpl/ocl_integer.tmpl.cl78
-rw-r--r--backend/src/libocl/tmpl/ocl_integer.tmpl.h9
4 files changed, 85 insertions, 66 deletions
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
index 314d373a..16f00ee3 100644
--- a/backend/src/libocl/CMakeLists.txt
+++ b/backend/src/libocl/CMakeLists.txt
@@ -181,7 +181,7 @@ MACRO(ADD_LL_TO_BC_TARGET M)
)
ENDMACRO(ADD_LL_TO_BC_TARGET)
-SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
+SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset ocl_clz)
FOREACH(f ${OCL_LL_MODULES})
COPY_THE_LL(${f})
ADD_LL_TO_BC_TARGET(${f})
diff --git a/backend/src/libocl/src/ocl_clz.ll b/backend/src/libocl/src/ocl_clz.ll
new file mode 100644
index 00000000..a274cdef
--- /dev/null
+++ b/backend/src/libocl/src/ocl_clz.ll
@@ -0,0 +1,62 @@
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+
+define i8 @clz_s8(i8 %x) nounwind readnone alwaysinline { ; leading-zero count of an 8-bit value
+ %lz = call i8 @llvm.ctlz.i8(i8 %x, i1 false) ; second operand false: a zero input is defined and yields 8
+ ret i8 %lz
+}
+
+define i8 @clz_u8(i8 %x) nounwind readnone alwaysinline { ; leading-zero count of an 8-bit value
+ %lz = call i8 @llvm.ctlz.i8(i8 %x, i1 false) ; second operand false: a zero input is defined and yields 8
+ ret i8 %lz
+}
+
+define i16 @clz_s16(i16 %x) nounwind readnone alwaysinline { ; leading-zero count of a 16-bit value
+ %lz = call i16 @llvm.ctlz.i16(i16 %x, i1 false) ; second operand false: a zero input is defined and yields 16
+ ret i16 %lz
+}
+
+define i16 @clz_u16(i16 %x) nounwind readnone alwaysinline { ; leading-zero count of a 16-bit value
+ %lz = call i16 @llvm.ctlz.i16(i16 %x, i1 false) ; second operand false: a zero input is defined and yields 16
+ ret i16 %lz
+}
+
+define i32 @clz_s32(i32 %x) nounwind readnone alwaysinline { ; leading-zero count of a 32-bit value
+ %lz = call i32 @llvm.ctlz.i32(i32 %x, i1 false) ; second operand false: a zero input is defined and yields 32
+ ret i32 %lz
+}
+
+define i32 @clz_u32(i32 %x) nounwind readnone alwaysinline { ; leading-zero count of a 32-bit value
+ %lz = call i32 @llvm.ctlz.i32(i32 %x, i1 false) ; second operand false: a zero input is defined and yields 32
+ ret i32 %lz
+}
+
+define i64 @clz_s64(i64 %x) nounwind readnone alwaysinline { ; 64-bit leading-zero count built from two 32-bit ctlz calls
+ %1 = bitcast i64 %x to <2 x i32> ; view the qword as a pair of dwords
+ %2 = extractelement <2 x i32> %1, i32 0 ; used as the low dword (little-endian element order assumed)
+ %3 = extractelement <2 x i32> %1, i32 1 ; used as the high dword
+ %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0) ; i1 0: defined for zero, gives 32 when the low dword is 0
+ %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0) ; gives 32 when the high dword is 0
+ %cmp = icmp ult i32 %call2, 32 ; true when the high dword has at least one set bit
+ %4 = add i32 %call1, 32 ; high dword all zero: continue the count into the low dword
+ %5 = select i1 %cmp, i32 %call2, i32 %4 ; x == 0 ends up as 32 + 32 = 64
+ %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0 ; upper dword must be 0, not undef, so the whole i64 result is defined
+ %call = bitcast <2 x i32> %6 to i64 ; reassemble {count, 0} as the i64 return value
+ ret i64 %call
+}
+
+define i64 @clz_u64(i64 %x) nounwind readnone alwaysinline { ; 64-bit leading-zero count built from two 32-bit ctlz calls
+ %1 = bitcast i64 %x to <2 x i32> ; view the qword as a pair of dwords
+ %2 = extractelement <2 x i32> %1, i32 0 ; used as the low dword (little-endian element order assumed)
+ %3 = extractelement <2 x i32> %1, i32 1 ; used as the high dword
+ %call1 = call i32 @llvm.ctlz.i32(i32 %2, i1 0) ; i1 0: defined for zero, gives 32 when the low dword is 0
+ %call2 = call i32 @llvm.ctlz.i32(i32 %3, i1 0) ; gives 32 when the high dword is 0
+ %cmp = icmp ult i32 %call2, 32 ; true when the high dword has at least one set bit
+ %4 = add i32 %call1, 32 ; high dword all zero: continue the count into the low dword
+ %5 = select i1 %cmp, i32 %call2, i32 %4 ; x == 0 ends up as 32 + 32 = 64
+ %6 = insertelement <2 x i32> zeroinitializer, i32 %5, i32 0 ; upper dword must be 0, not undef, so the whole i64 result is defined
+ %call = bitcast <2 x i32> %6 to i64 ; reassemble {count, 0} as the i64 return value
+ ret i64 %call
+}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
index 6da0bab2..a5e1dbc1 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -19,6 +19,8 @@
PURE CONST uint __gen_ocl_fbh(uint);
PURE CONST uint __gen_ocl_fbl(uint);
+
+
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uint);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(int);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(ushort);
@@ -26,71 +28,17 @@ PURE CONST OVERLOADABLE uint __gen_ocl_cbit(short);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(uchar);
PURE CONST OVERLOADABLE uint __gen_ocl_cbit(char);
-OVERLOADABLE char clz(char x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE uchar clz(uchar x) {
- if (x == 0)
- return 8;
- return __gen_ocl_fbh(x) - 24;
-}
-
-OVERLOADABLE short clz(short x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE ushort clz(ushort x) {
- if (x == 0)
- return 16;
- return __gen_ocl_fbh(x) - 16;
-}
-
-OVERLOADABLE int clz(int x) {
- if (x < 0)
- return 0;
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE uint clz(uint x) {
- if (x == 0)
- return 32;
- return __gen_ocl_fbh(x);
-}
-
-OVERLOADABLE long clz(long x) {
- union { int i[2]; long x; } u;
- u.x = x;
- if (u.i[1] & 0x80000000u)
- return 0;
- if (u.i[1] == 0 && u.i[0] == 0)
- return 64;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
-
-OVERLOADABLE ulong clz(ulong x) {
- if (x == 0)
- return 64;
- union { uint i[2]; ulong x; } u;
- u.x = x;
- uint v = clz(u.i[1]);
- if(v == 32)
- v += clz(u.i[0]);
- return v;
-}
+#define SDEF(T, SIGN, BITS) /* clz(T) forwards to the clz_<SIGN><BITS> helper */ \
+OVERLOADABLE T clz(T x) { return clz_##SIGN##BITS(x); }
+SDEF(char, s, 8);
+SDEF(short, s, 16);
+SDEF(int, s, 32);
+SDEF(long, s, 64);
+SDEF(uchar, u, 8);
+SDEF(ushort, u, 16);
+SDEF(uint, u, 32);
+SDEF(ulong, u, 64);
+#undef SDEF
#define SDEF(TYPE) \
OVERLOADABLE TYPE popcount(TYPE x){ return __gen_ocl_cbit(x);}
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
index f067b8da..4b3b5ae3 100644
--- a/backend/src/libocl/tmpl/ocl_integer.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -45,6 +45,15 @@ OVERLOADABLE uint clz(uint x);
OVERLOADABLE long clz(long x);
OVERLOADABLE ulong clz(ulong x);
+char clz_s8(char x); /* per-width helpers called by the clz() overloads */
+uchar clz_u8(uchar x);
+short clz_s16(short x);
+ushort clz_u16(ushort x);
+int clz_s32(int x);
+uint clz_u32(uint x);
+long clz_s64(long x);
+ulong clz_u64(ulong x);
+
OVERLOADABLE char popcount(char x);
OVERLOADABLE uchar popcount(uchar x);
OVERLOADABLE short popcount(short x);