summaryrefslogtreecommitdiff
path: root/backend/kernels
diff options
context:
space:
mode:
authorBenjamin Segovia <segovia.benjamin@gmail.com>2012-03-07 01:49:50 -0800
committerKeith Packard <keithp@keithp.com>2012-08-10 16:15:35 -0700
commit8e09b36606e7693e8f7727af2d348fa82d407e53 (patch)
tree5b863684e48b2ca9a8b5763c12a466014d2aeaba /backend/kernels
parent32518ba7167a6e5cd260d6b6df0fb9a90d243555 (diff)
Added shuffle/insert/extract elements LLVM IR translation Added Gen IR select instruction and LLVM IR select translation
Diffstat (limited to 'backend/kernels')
-rw-r--r--backend/kernels/extract.cl7
-rw-r--r--backend/kernels/extract.ll21
-rw-r--r--backend/kernels/insert.cl5
-rw-r--r--backend/kernels/insert.ll17
-rw-r--r--backend/kernels/select.cl9
-rw-r--r--backend/kernels/select.ll38
-rw-r--r--backend/kernels/shuffle.cl7
-rw-r--r--backend/kernels/shuffle.ll17
-rw-r--r--backend/kernels/stdlib.h40
9 files changed, 133 insertions, 28 deletions
diff --git a/backend/kernels/extract.cl b/backend/kernels/extract.cl
new file mode 100644
index 00000000..ca2ef19e
--- /dev/null
+++ b/backend/kernels/extract.cl
@@ -0,0 +1,7 @@
+#include "stdlib.h"
+__kernel void extract(__global int4 *dst, __global int4 *src, int c)
+{
+ const int4 from = src[0];
+ dst[0] = (int4)(from.x, 1, 2, 3);
+}
+
diff --git a/backend/kernels/extract.ll b/backend/kernels/extract.ll
new file mode 100644
index 00000000..b10a21ff
--- /dev/null
+++ b/backend/kernels/extract.ll
@@ -0,0 +1,21 @@
+; ModuleID = 'extract.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @extract(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32>* %src, align 16, !tbaa !1
+ %1 = extractelement <4 x i32> %0, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3
+ store <4 x i32> %vecinit3, <4 x i32>* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @extract}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/insert.cl b/backend/kernels/insert.cl
index 52622664..1711feaf 100644
--- a/backend/kernels/insert.cl
+++ b/backend/kernels/insert.cl
@@ -1,7 +1,8 @@
#include "stdlib.h"
__kernel void insert(__global int4 *dst, __global int4 *src, int c)
{
- dst[0].x = dst[0][c];
- dst[0].yzw = dst[1].xyz + src[0].xyz;
+ int4 x = src[0];
+ src[0].z = 1.f;
+ dst[0] = src[0];
}
diff --git a/backend/kernels/insert.ll b/backend/kernels/insert.ll
index 408102c4..5df1dd89 100644
--- a/backend/kernels/insert.ll
+++ b/backend/kernels/insert.ll
@@ -4,19 +4,10 @@ target triple = "ptx32--"
define ptx_kernel void @insert(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
entry:
- %0 = load <4 x i32>* %dst, align 16, !tbaa !1
- %vecext = extractelement <4 x i32> %0, i32 %c
- %1 = insertelement <4 x i32> %0, i32 %vecext, i32 0
- store <4 x i32> %1, <4 x i32>* %dst, align 16
- %arrayidx2 = getelementptr inbounds <4 x i32>* %dst, i32 1
- %2 = load <4 x i32>* %arrayidx2, align 16
- %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
- %4 = load <4 x i32>* %src, align 16
- %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
- %add = add <3 x i32> %3, %5
- %6 = shufflevector <3 x i32> %add, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
- %7 = shufflevector <4 x i32> %1, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
- store <4 x i32> %7, <4 x i32>* %dst, align 16
+ %0 = load <4 x i32>* %src, align 16
+ %1 = insertelement <4 x i32> %0, i32 1, i32 2
+ store <4 x i32> %1, <4 x i32>* %src, align 16
+ store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
ret void
}
diff --git a/backend/kernels/select.cl b/backend/kernels/select.cl
new file mode 100644
index 00000000..86a0b082
--- /dev/null
+++ b/backend/kernels/select.cl
@@ -0,0 +1,9 @@
+#include "stdlib.h"
+
+__kernel void test_select(__global int4 *dst,
+ __global int4 *src0,
+ __global int4 *src1)
+{
+ const int4 from = select(src0[0], src0[1], src0[1]);
+ dst[0] = from;
+}
diff --git a/backend/kernels/select.ll b/backend/kernels/select.ll
new file mode 100644
index 00000000..a3d7e16f
--- /dev/null
+++ b/backend/kernels/select.ll
@@ -0,0 +1,38 @@
+; ModuleID = 'select.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @test_select(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src0, <4 x i32>* nocapture %src1) nounwind noinline {
+entry:
+ %0 = load <4 x i32>* %src0, align 16, !tbaa !1
+ %arrayidx1 = getelementptr inbounds <4 x i32>* %src0, i32 1
+ %1 = load <4 x i32>* %arrayidx1, align 16, !tbaa !1
+ %2 = extractelement <4 x i32> %0, i32 0
+ %3 = extractelement <4 x i32> %1, i32 0
+ %4 = extractelement <4 x i32> %0, i32 1
+ %5 = extractelement <4 x i32> %1, i32 1
+ %6 = extractelement <4 x i32> %0, i32 2
+ %7 = extractelement <4 x i32> %1, i32 2
+ %8 = extractelement <4 x i32> %0, i32 3
+ %9 = extractelement <4 x i32> %1, i32 3
+ %tobool.i = icmp slt i32 %3, 0
+ %cond1.i = select i1 %tobool.i, i32 %3, i32 %2
+ %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0
+ %tobool3.i = icmp slt i32 %5, 0
+ %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4
+ %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1
+ %tobool9.i = icmp slt i32 %7, 0
+ %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6
+ %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2
+ %tobool15.i = icmp slt i32 %9, 0
+ %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8
+ %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3
+ store <4 x i32> %13, <4 x i32>* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @test_select}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/shuffle.cl b/backend/kernels/shuffle.cl
new file mode 100644
index 00000000..6d49621f
--- /dev/null
+++ b/backend/kernels/shuffle.cl
@@ -0,0 +1,7 @@
+#include "stdlib.h"
+__kernel void shuffle(__global int4 *dst, __global int4 *src, int c)
+{
+ const int4 from = src[0];
+ dst[0] = from.xywz;
+}
+
diff --git a/backend/kernels/shuffle.ll b/backend/kernels/shuffle.ll
new file mode 100644
index 00000000..e17a6844
--- /dev/null
+++ b/backend/kernels/shuffle.ll
@@ -0,0 +1,17 @@
+; ModuleID = 'shuffle.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_kernel void @shuffle(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32>* %src, align 16, !tbaa !1
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+ store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @shuffle}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/stdlib.h b/backend/kernels/stdlib.h
index 3d83799f..49ed4f01 100644
--- a/backend/kernels/stdlib.h
+++ b/backend/kernels/stdlib.h
@@ -38,28 +38,42 @@ inline unsigned get_local_id(unsigned int dim) {
else return 0;
}
+__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) {
+ return cond ? src0 : src1;
+}
+
+__attribute__((overloadable)) inline int select(int src0, int src1, int cond) {
+ return cond ? src0 : src1;
+}
+
typedef float float2 __attribute__((ext_vector_type(2)));
typedef float float3 __attribute__((ext_vector_type(3)));
typedef float float4 __attribute__((ext_vector_type(4)));
typedef int int2 __attribute__((ext_vector_type(2)));
typedef int int3 __attribute__((ext_vector_type(3)));
typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int uint2 __attribute__((ext_vector_type(2)));
+typedef unsigned uint3 __attribute__((ext_vector_type(3)));
+typedef unsigned uint4 __attribute__((ext_vector_type(4)));
typedef bool bool2 __attribute__((ext_vector_type(2)));
typedef bool bool3 __attribute__((ext_vector_type(3)));
typedef bool bool4 __attribute__((ext_vector_type(4)));
-#define DECL_SELECT(TYPE) \
-__attribute__((overloadable)) \
-inline TYPE select(bool b, TYPE x, TYPE y) { \
- if (b) return x; else return y; \
+__attribute__((overloadable)) inline int4 select(int4 src0, int4 src1, int4 cond) {
+ int4 dst;
+ const int x0 = src0.x; // Fix performance issue with CLANG
+ const int x1 = src1.x;
+ const int y0 = src0.y;
+ const int y1 = src1.y;
+ const int z0 = src0.z;
+ const int z1 = src1.z;
+ const int w0 = src0.w;
+ const int w1 = src1.w;
+
+ dst.x = (cond.x & 0x80000000) ? x1 : x0;
+ dst.y = (cond.y & 0x80000000) ? y1 : y0;
+ dst.z = (cond.z & 0x80000000) ? z1 : z0;
+ dst.w = (cond.w & 0x80000000) ? w1 : w0;
+ return dst;
}
-#define DECL_SELECT_ALL(TYPE) \
- DECL_SELECT(TYPE) \
- DECL_SELECT(TYPE##2) \
- DECL_SELECT(TYPE##3) \
- DECL_SELECT(TYPE##4)
-DECL_SELECT_ALL(int)
-DECL_SELECT_ALL(float)
-#undef DECL_SELECT_ALL
-#undef DECL_SELECT