diff options
author | Benjamin Segovia <segovia.benjamin@gmail.com> | 2012-03-07 01:49:50 -0800 |
---|---|---|
committer | Keith Packard <keithp@keithp.com> | 2012-08-10 16:15:35 -0700 |
commit | 8e09b36606e7693e8f7727af2d348fa82d407e53 (patch) | |
tree | 5b863684e48b2ca9a8b5763c12a466014d2aeaba /backend/kernels | |
parent | 32518ba7167a6e5cd260d6b6df0fb9a90d243555 (diff) |
Added shuffle/insert/extract elements LLVM IR translation Added Gen IR select instruction and LLVM IR select translation
Diffstat (limited to 'backend/kernels')
-rw-r--r-- | backend/kernels/extract.cl | 7 | ||||
-rw-r--r-- | backend/kernels/extract.ll | 21 | ||||
-rw-r--r-- | backend/kernels/insert.cl | 5 | ||||
-rw-r--r-- | backend/kernels/insert.ll | 17 | ||||
-rw-r--r-- | backend/kernels/select.cl | 9 | ||||
-rw-r--r-- | backend/kernels/select.ll | 38 | ||||
-rw-r--r-- | backend/kernels/shuffle.cl | 7 | ||||
-rw-r--r-- | backend/kernels/shuffle.ll | 17 | ||||
-rw-r--r-- | backend/kernels/stdlib.h | 40 |
9 files changed, 133 insertions, 28 deletions
diff --git a/backend/kernels/extract.cl b/backend/kernels/extract.cl new file mode 100644 index 00000000..ca2ef19e --- /dev/null +++ b/backend/kernels/extract.cl @@ -0,0 +1,7 @@ +#include "stdlib.h" +__kernel void extract(__global int4 *dst, __global int4 *src, int c) +{ + const int4 from = src[0]; + dst[0] = (int4)(from.x, 1, 2, 3); +} + diff --git a/backend/kernels/extract.ll b/backend/kernels/extract.ll new file mode 100644 index 00000000..b10a21ff --- /dev/null +++ b/backend/kernels/extract.ll @@ -0,0 +1,21 @@ +; ModuleID = 'extract.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_kernel void @extract(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { +entry: + %0 = load <4 x i32>* %src, align 16, !tbaa !1 + %1 = extractelement <4 x i32> %0, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3 + store <4 x i32> %vecinit3, <4 x i32>* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @extract} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/insert.cl b/backend/kernels/insert.cl index 52622664..1711feaf 100644 --- a/backend/kernels/insert.cl +++ b/backend/kernels/insert.cl @@ -1,7 +1,8 @@ #include "stdlib.h" __kernel void insert(__global int4 *dst, __global int4 *src, int c) { - dst[0].x = dst[0][c]; - dst[0].yzw = dst[1].xyz + src[0].xyz; + int4 x = src[0]; + src[0].z = 1.f; + dst[0] = src[0]; } diff --git a/backend/kernels/insert.ll b/backend/kernels/insert.ll index 408102c4..5df1dd89 100644 --- a/backend/kernels/insert.ll +++ b/backend/kernels/insert.ll @@ -4,19 +4,10 @@ target triple = "ptx32--" define ptx_kernel void @insert(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { entry: - %0 = load <4 x i32>* %dst, align 16, !tbaa !1 - %vecext = extractelement <4 x i32> %0, i32 %c - %1 = insertelement <4 x i32> %0, i32 %vecext, i32 0 - store <4 x i32> %1, <4 x i32>* %dst, align 16 - %arrayidx2 = getelementptr inbounds <4 x i32>* %dst, i32 1 - %2 = load <4 x i32>* %arrayidx2, align 16 - %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> - %4 = load <4 x i32>* %src, align 16 - %5 = shufflevector <4 x i32> %4, <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2> - %add = add <3 x i32> %3, %5 - %6 = shufflevector <3 x i32> %add, <3 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef> - %7 = shufflevector <4 x i32> %1, <4 x i32> %6, <4 x i32> <i32 0, i32 4, i32 5, i32 6> - store <4 x i32> %7, <4 x i32>* %dst, align 16 + %0 = load <4 x i32>* %src, align 16 + %1 = insertelement <4 x i32> %0, i32 1, i32 2 + store <4 x i32> %1, <4 x i32>* %src, align 16 + store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1 ret void } diff --git a/backend/kernels/select.cl b/backend/kernels/select.cl new file mode 100644 index 00000000..86a0b082 --- /dev/null +++ b/backend/kernels/select.cl @@ -0,0 +1,9 @@ +#include "stdlib.h" + +__kernel void test_select(__global int4 *dst, + __global int4 *src0, + __global int4 *src1) +{ + const int4 from = select(src0[0], src0[1], src0[1]); + dst[0] = from; +} diff --git a/backend/kernels/select.ll b/backend/kernels/select.ll new file mode 100644 index 00000000..a3d7e16f --- /dev/null +++ b/backend/kernels/select.ll @@ -0,0 +1,38 @@ +; ModuleID = 'select.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_kernel void @test_select(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src0, <4 x i32>* nocapture %src1) nounwind noinline { +entry: + %0 = load <4 x i32>* %src0, align 16, !tbaa !1 + %arrayidx1 = getelementptr inbounds <4 x i32>* %src0, i32 1 + %1 = load <4 x i32>* %arrayidx1, align 16, !tbaa !1 + %2 = extractelement <4 x i32> %0, i32 0 + %3 = extractelement <4 x i32> %1, i32 0 + %4 = extractelement <4 x i32> %0, i32 1 + %5 = extractelement <4 x i32> %1, i32 1 + %6 = extractelement <4 x i32> %0, i32 2 + %7 = extractelement <4 x i32> %1, i32 2 + %8 = extractelement <4 x i32> %0, i32 3 + %9 = extractelement <4 x i32> %1, i32 3 + %tobool.i = icmp slt i32 %3, 0 + %cond1.i = select i1 %tobool.i, i32 %3, i32 %2 + %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0 + %tobool3.i = icmp slt i32 %5, 0 + %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4 + %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1 + %tobool9.i = icmp slt i32 %7, 0 + %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6 + %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2 + %tobool15.i = icmp slt i32 %9, 0 + %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8 + %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3 + store <4 x i32> %13, <4 x i32>* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @test_select} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/shuffle.cl b/backend/kernels/shuffle.cl new file mode 100644 index 00000000..6d49621f --- /dev/null +++ b/backend/kernels/shuffle.cl @@ -0,0 +1,7 @@ +#include "stdlib.h" +__kernel void shuffle(__global int4 *dst, __global int4 *src, int c) +{ + const int4 from = src[0]; + dst[0] = from.xywz; +} + diff --git a/backend/kernels/shuffle.ll b/backend/kernels/shuffle.ll new file mode 100644 index 00000000..e17a6844 --- /dev/null +++ b/backend/kernels/shuffle.ll @@ -0,0 +1,17 @@ +; ModuleID = 'shuffle.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_kernel void @shuffle(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { +entry: + %0 = load <4 x i32>* %src, align 16, !tbaa !1 + %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> + store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @shuffle} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/stdlib.h b/backend/kernels/stdlib.h index 3d83799f..49ed4f01 100644 --- a/backend/kernels/stdlib.h +++ b/backend/kernels/stdlib.h @@ -38,28 +38,42 @@ inline unsigned get_local_id(unsigned int dim) { else return 0; } +__attribute__((overloadable)) inline unsigned select(unsigned src0, unsigned src1, unsigned cond) { + return cond ? src0 : src1; +} + +__attribute__((overloadable)) inline int select(int src0, int src1, int cond) { + return cond ? src0 : src1; +} + typedef float float2 __attribute__((ext_vector_type(2))); typedef float float3 __attribute__((ext_vector_type(3))); typedef float float4 __attribute__((ext_vector_type(4))); typedef int int2 __attribute__((ext_vector_type(2))); typedef int int3 __attribute__((ext_vector_type(3))); typedef int int4 __attribute__((ext_vector_type(4))); +typedef int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned uint3 __attribute__((ext_vector_type(3))); +typedef unsigned uint4 __attribute__((ext_vector_type(4))); typedef bool bool2 __attribute__((ext_vector_type(2))); typedef bool bool3 __attribute__((ext_vector_type(3))); typedef bool bool4 __attribute__((ext_vector_type(4))); -#define DECL_SELECT(TYPE) \ -__attribute__((overloadable)) \ -inline TYPE select(bool b, TYPE x, TYPE y) { \ - if (b) return x; else return y; \ +__attribute__((overloadable)) inline int4 select(int4 src0, int4 src1, int4 cond) { + int4 dst; + const int x0 = src0.x; // Fix performance issue with CLANG + const int x1 = src1.x; + const int y0 = src0.y; + const int y1 = src1.y; + const int z0 = src0.z; + const int z1 = src1.z; + const int w0 = src0.w; + const int w1 = src1.w; + + dst.x = (cond.x & 0x80000000) ? x1 : x0; + dst.y = (cond.y & 0x80000000) ? y1 : y0; + dst.z = (cond.z & 0x80000000) ? z1 : z0; + dst.w = (cond.w & 0x80000000) ? w1 : w0; + return dst; } -#define DECL_SELECT_ALL(TYPE) \ - DECL_SELECT(TYPE) \ - DECL_SELECT(TYPE##2) \ - DECL_SELECT(TYPE##3) \ - DECL_SELECT(TYPE##4) -DECL_SELECT_ALL(int) -DECL_SELECT_ALL(float) -#undef DECL_SELECT_ALL -#undef DECL_SELECT |