summaryrefslogtreecommitdiff
path: root/backend/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'backend/kernels')
-rw-r--r--backend/kernels/add.cl.ll75
-rw-r--r--backend/kernels/add.ll13
-rw-r--r--backend/kernels/add2.cl.ll83
-rw-r--r--backend/kernels/add2.ll21
-rw-r--r--backend/kernels/cmp.cl.ll84
-rw-r--r--backend/kernels/cmp.ll22
-rw-r--r--backend/kernels/cmp_cvt.cl.ll84
-rw-r--r--backend/kernels/cmp_cvt.ll22
-rwxr-xr-xbackend/kernels/compile.sh3
-rw-r--r--backend/kernels/complex_struct.cl.ll84
-rw-r--r--backend/kernels/complex_struct.ll84
-rw-r--r--backend/kernels/cycle.cl.ll77
-rw-r--r--backend/kernels/cycle.ll15
-rw-r--r--backend/kernels/dummy.ll12
-rw-r--r--backend/kernels/extract.cl.ll83
-rw-r--r--backend/kernels/extract.ll21
-rw-r--r--backend/kernels/function.cl.ll86
-rw-r--r--backend/kernels/function.ll24
-rw-r--r--backend/kernels/function_param.cl.ll95
-rw-r--r--backend/kernels/function_param.ll33
-rw-r--r--backend/kernels/get_global_id.cl.ll89
-rw-r--r--backend/kernels/get_global_id.ll27
-rw-r--r--backend/kernels/gg.llbin1092 -> 0 bytes
-rw-r--r--backend/kernels/gg.ll.ll89
-rw-r--r--backend/kernels/insert.cl.ll80
-rw-r--r--backend/kernels/insert.ll18
-rw-r--r--backend/kernels/load_store.cl.ll (renamed from backend/kernels/load_store.ll)2
-rw-r--r--backend/kernels/loop.cl.ll93
-rw-r--r--backend/kernels/loop2.cl.ll101
-rw-r--r--backend/kernels/loop2.ll39
-rw-r--r--backend/kernels/loop3.cl9
-rw-r--r--backend/kernels/loop3.cl.ll100
-rw-r--r--backend/kernels/loop3.ll (renamed from backend/kernels/loop.ll)21
-rw-r--r--backend/kernels/loop4.cl12
-rw-r--r--backend/kernels/loop4.cl.ll129
-rw-r--r--backend/kernels/loop4.ll67
-rw-r--r--backend/kernels/loop5.cl17
-rw-r--r--backend/kernels/loop5.cl.ll131
-rw-r--r--backend/kernels/loop5.ll131
-rw-r--r--backend/kernels/mad.cl18
-rw-r--r--backend/kernels/mad.cl.ll113
-rw-r--r--backend/kernels/mad.ll51
-rw-r--r--backend/kernels/select.cl.ll100
-rw-r--r--backend/kernels/select.ll38
-rw-r--r--backend/kernels/short.cl.ll79
-rw-r--r--backend/kernels/short.ll17
-rw-r--r--backend/kernels/shuffle.cl.ll79
-rw-r--r--backend/kernels/shuffle.ll17
-rw-r--r--backend/kernels/simple_float4.cl.ll83
-rw-r--r--backend/kernels/simple_float4.ll21
-rw-r--r--backend/kernels/simple_float4_2.cl.ll84
-rw-r--r--backend/kernels/simple_float4_2.ll22
-rw-r--r--backend/kernels/simple_float4_3.cl2
-rw-r--r--backend/kernels/simple_float4_3.cl.ll92
-rw-r--r--backend/kernels/simple_float4_3.ll36
-rw-r--r--backend/kernels/stdlib.h53
-rw-r--r--backend/kernels/store.cl.ll78
-rw-r--r--backend/kernels/store.ll16
-rw-r--r--backend/kernels/struct.cl.ll128
-rw-r--r--backend/kernels/struct.ll121
-rw-r--r--backend/kernels/struct2.cl.ll100
-rw-r--r--backend/kernels/struct2.ll38
-rw-r--r--backend/kernels/test_select.cl.ll86
-rw-r--r--backend/kernels/test_select.ll24
-rw-r--r--backend/kernels/undefined.cl.ll78
-rw-r--r--backend/kernels/undefined.ll32
-rw-r--r--backend/kernels/vector_constant.cl7
-rw-r--r--backend/kernels/vector_constant.cl.ll84
-rw-r--r--backend/kernels/void.cl.ll74
-rw-r--r--backend/kernels/void.ll12
70 files changed, 3113 insertions, 846 deletions
diff --git a/backend/kernels/add.cl.ll b/backend/kernels/add.cl.ll
new file mode 100644
index 00000000..3cc40069
--- /dev/null
+++ b/backend/kernels/add.cl.ll
@@ -0,0 +1,75 @@
+; ModuleID = 'add.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline {
+entry:
+ %add = add i32 %y, %x
+ ret i32 %add
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{i32 (i32, i32)* @add}
diff --git a/backend/kernels/add.ll b/backend/kernels/add.ll
deleted file mode 100644
index 9b2c7413..00000000
--- a/backend/kernels/add.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; ModuleID = 'add.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline {
-entry:
- %add = add i32 %y, %x
- ret i32 %add
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{i32 (i32, i32)* @add}
diff --git a/backend/kernels/add2.cl.ll b/backend/kernels/add2.cl.ll
new file mode 100644
index 00000000..cb7cf6e3
--- /dev/null
+++ b/backend/kernels/add2.cl.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'add2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { i32, i32 }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline {
+entry:
+ %add = add i32 %y, %x
+ %sub = add i32 %x, 10
+ %add1 = sub i32 %sub, %y
+ %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0
+ store i32 %add, i32* %agg.result.0, align 4
+ %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1
+ store i32 %add1, i32* %agg.result.1, align 4
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.big*, i32, i32)* @add}
diff --git a/backend/kernels/add2.ll b/backend/kernels/add2.ll
deleted file mode 100644
index 4ca11259..00000000
--- a/backend/kernels/add2.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; ModuleID = 'add2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.big = type { i32, i32 }
-
-define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline {
-entry:
- %add = add i32 %y, %x
- %sub = add i32 %x, 10
- %add1 = sub i32 %sub, %y
- %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0
- store i32 %add, i32* %agg.result.0, align 4
- %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1
- store i32 %add1, i32* %agg.result.1, align 4
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.big*, i32, i32)* @add}
diff --git a/backend/kernels/cmp.cl.ll b/backend/kernels/cmp.cl.ll
new file mode 100644
index 00000000..a77694eb
--- /dev/null
+++ b/backend/kernels/cmp.cl.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'cmp.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
+entry:
+ %cmp = icmp slt i32 %x, %y
+ %conv = zext i1 %cmp to i32
+ %cmp1 = fcmp ogt float %z, %w
+ %add = sext i1 %cmp1 to i32
+ %tobool = icmp ne i32 %conv, %add
+ %frombool = zext i1 %tobool to i8
+ store i8 %frombool, i8 addrspace(1)* %dst, align 1, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i8 addrspace(1)*, i32, i32, float, float)* @test_cmp}
+!1 = metadata !{metadata !"bool", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/cmp.ll b/backend/kernels/cmp.ll
deleted file mode 100644
index c2f64007..00000000
--- a/backend/kernels/cmp.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; ModuleID = 'cmp.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_cmp(i8* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
-entry:
- %cmp = icmp slt i32 %x, %y
- %conv = zext i1 %cmp to i32
- %cmp1 = fcmp ogt float %z, %w
- %add = sext i1 %cmp1 to i32
- %tobool = icmp ne i32 %conv, %add
- %frombool = zext i1 %tobool to i8
- store i8 %frombool, i8* %dst, align 1, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i8*, i32, i32, float, float)* @test_cmp}
-!1 = metadata !{metadata !"bool", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/cmp_cvt.cl.ll b/backend/kernels/cmp_cvt.cl.ll
new file mode 100644
index 00000000..3a85bcae
--- /dev/null
+++ b/backend/kernels/cmp_cvt.cl.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'cmp_cvt.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
+get_local_id.exit:
+ %add = add nsw i32 %y, %x
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %cmp = icmp ult i32 %add, %call.i
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, i32)* @cmp_cvt}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/cmp_cvt.ll b/backend/kernels/cmp_cvt.ll
deleted file mode 100644
index ab62b6c4..00000000
--- a/backend/kernels/cmp_cvt.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; ModuleID = 'cmp_cvt.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @cmp_cvt(i32* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
-get_local_id.exit:
- %add = add nsw i32 %y, %x
- %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
- %cmp = icmp ult i32 %add, %call.i
- %conv = zext i1 %cmp to i32
- store i32 %conv, i32* %dst, align 4, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32, i32)* @cmp_cvt}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/compile.sh b/backend/kernels/compile.sh
index 7f5ef36d..e1177a75 100755
--- a/backend/kernels/compile.sh
+++ b/backend/kernels/compile.sh
@@ -1,4 +1,5 @@
-clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1.cl -o $1.o
+#!/bin/bash
+clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1 -o $1.o
llvm-dis $1.o
rm $1.o
mv $1.o.ll $1.ll
diff --git a/backend/kernels/complex_struct.cl.ll b/backend/kernels/complex_struct.cl.ll
new file mode 100644
index 00000000..bf607ca0
--- /dev/null
+++ b/backend/kernels/complex_struct.cl.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'complex_struct.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [5 x %struct.hop] }
+%struct.hop = type { float, float }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
+entry:
+ %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0
+ %0 = load float addrspace(1)* %x, align 4, !tbaa !1
+ %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1
+ store float %0, float addrspace(1)* %y, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/complex_struct.ll b/backend/kernels/complex_struct.ll
new file mode 100644
index 00000000..4274dd34
--- /dev/null
+++ b/backend/kernels/complex_struct.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'complex_struct.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [5 x %struct.hop] }
+%struct.hop = type { float, float }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
+entry:
+ %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0
+ %0 = load float addrspace(1)* %x, align 4, !tbaa !1
+ %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1
+ store float %0, float addrspace(1)* %y, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl}
+!1 = metadata !{metadata !"float", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/cycle.cl.ll b/backend/kernels/cycle.cl.ll
new file mode 100644
index 00000000..0c4ee200
--- /dev/null
+++ b/backend/kernels/cycle.cl.ll
@@ -0,0 +1,77 @@
+; ModuleID = 'cycle.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline {
+entry:
+ br label %hop0
+
+hop0: ; preds = %hop0, %entry
+ br label %hop0
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*)* @cycle}
diff --git a/backend/kernels/cycle.ll b/backend/kernels/cycle.ll
deleted file mode 100644
index 63363007..00000000
--- a/backend/kernels/cycle.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-; ModuleID = 'cycle.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @cycle(i32* nocapture %dst) noreturn nounwind readnone noinline {
-entry:
- br label %hop0
-
-hop0: ; preds = %hop0, %entry
- br label %hop0
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*)* @cycle}
diff --git a/backend/kernels/dummy.ll b/backend/kernels/dummy.ll
deleted file mode 100644
index 3c6c269b..00000000
--- a/backend/kernels/dummy.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; ModuleID = 'void.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @hop() nounwind readnone noinline {
-entry:
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void ()* @hop}
diff --git a/backend/kernels/extract.cl.ll b/backend/kernels/extract.cl.ll
new file mode 100644
index 00000000..11c95bdb
--- /dev/null
+++ b/backend/kernels/extract.cl.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'extract.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
+ %1 = extractelement <4 x i32> %0, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3
+ store <4 x i32> %vecinit3, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @extract}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/extract.ll b/backend/kernels/extract.ll
deleted file mode 100644
index b10a21ff..00000000
--- a/backend/kernels/extract.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; ModuleID = 'extract.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @extract(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16, !tbaa !1
- %1 = extractelement <4 x i32> %0, i32 0
- %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0
- %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1
- %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2
- %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3
- store <4 x i32> %vecinit3, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @extract}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/function.cl.ll b/backend/kernels/function.cl.ll
new file mode 100644
index 00000000..62527a71
--- /dev/null
+++ b/backend/kernels/function.cl.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'function.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x
+ store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/function.ll b/backend/kernels/function.ll
deleted file mode 100644
index e428448f..00000000
--- a/backend/kernels/function.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; ModuleID = 'function.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
-entry:
- store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
- ret void
-}
-
-define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline {
-entry:
- store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x
- store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/function_param.cl.ll b/backend/kernels/function_param.cl.ll
new file mode 100644
index 00000000..56646dd1
--- /dev/null
+++ b/backend/kernels/function_param.cl.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'function_param.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.struct0 = type { [5 x i32], i32, i32, i32 }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
+entry:
+ %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
+ %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1
+ %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4
+ %1 = load i32* %arrayidx1, align 4, !tbaa !1
+ %add = add i32 %0, %x
+ %add2 = add i32 %add, %1
+ store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1
+ %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)*
+ %3 = bitcast %struct.struct0* %s to i8*
+ tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false)
+ %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2
+ %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1
+ %add6 = add nsw i32 %4, %y
+ store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1
+ ret void
+}
+
+declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/function_param.ll b/backend/kernels/function_param.ll
deleted file mode 100644
index 90c7220f..00000000
--- a/backend/kernels/function_param.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; ModuleID = 'function_param.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.struct0 = type { [5 x i32], i32, i32, i32 }
-
-define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
-entry:
- %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
- %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1
- %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4
- %1 = load i32* %arrayidx1, align 4, !tbaa !1
- %add = add i32 %0, %x
- %add2 = add i32 %add, %1
- store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1
- %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)*
- %3 = bitcast %struct.struct0* %s to i8*
- tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false)
- %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2
- %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1
- %add6 = add nsw i32 %4, %y
- store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1
- ret void
-}
-
-declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/get_global_id.cl.ll b/backend/kernels/get_global_id.cl.ll
new file mode 100644
index 00000000..2dacffb7
--- /dev/null
+++ b/backend/kernels/get_global_id.cl.ll
@@ -0,0 +1,89 @@
+; ModuleID = 'get_global_id.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline {
+get_global_id.exit13:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %sext = shl i32 %call.i, 16
+ %conv1 = ashr exact i32 %sext, 16
+ %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6
+ store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6
+ store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_global_id}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/get_global_id.ll b/backend/kernels/get_global_id.ll
deleted file mode 100644
index 1df9fdff..00000000
--- a/backend/kernels/get_global_id.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; ModuleID = 'get_global_id.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_global_id(i32* nocapture %dst, i32* nocapture %p) nounwind noinline {
-get_global_id.exit13:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
- %sext = shl i32 %call.i, 16
- %conv1 = ashr exact i32 %sext, 16
- %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32* %dst, i32 %call.i6
- store i32 %conv1, i32* %arrayidx, align 4, !tbaa !1
- %arrayidx5 = getelementptr inbounds i32* %p, i32 %call.i6
- store i32 %call.i, i32* %arrayidx5, align 4, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32*)* @test_global_id}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/gg.ll b/backend/kernels/gg.ll
deleted file mode 100644
index 0f9d666e..00000000
--- a/backend/kernels/gg.ll
+++ /dev/null
Binary files differ
diff --git a/backend/kernels/gg.ll.ll b/backend/kernels/gg.ll.ll
deleted file mode 100644
index 1c60671c..00000000
--- a/backend/kernels/gg.ll.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; ModuleID = 'gg.ll'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
-
-define ptx_kernel void @struct_cl(%struct.my_struct* byval %s, i32 %x, i32* %mem, i32 %y) nounwind noinline {
-entry:
- %x.addr = alloca i32, align 4
- %mem.addr = alloca i32*, align 4
- %y.addr = alloca i32, align 4
- %i = alloca i32, align 4
- store i32 %x, i32* %x.addr, align 4
- store i32* %mem, i32** %mem.addr, align 4
- store i32 %y, i32* %y.addr, align 4
- store i32 0, i32* %i, align 4
- br label %for.cond
-
-for.cond: ; preds = %for.inc, %entry
- %0 = load i32* %i, align 4
- %cmp = icmp slt i32 %0, 256
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %1 = load i32* %i, align 4
- %2 = load i32* %i, align 4
- %arrayidx = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %2
- %a = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx, i32 0, i32 0
- store i32 %1, i32 addrspace(4)* %a, align 4
- %3 = load i32* %i, align 4
- %4 = load i32* %i, align 4
- %arrayidx1 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %4
- %b = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx1, i32 0, i32 1
- %arrayidx2 = getelementptr inbounds [2 x i32] addrspace(4)* %b, i32 0, i32 0
- store i32 %3, i32 addrspace(4)* %arrayidx2, align 4
- %5 = load i32* %i, align 4
- %add = add nsw i32 %5, 1
- %6 = load i32* %i, align 4
- %arrayidx3 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %6
- %b4 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx3, i32 0, i32 1
- %arrayidx5 = getelementptr inbounds [2 x i32] addrspace(4)* %b4, i32 0, i32 0
- store i32 %add, i32 addrspace(4)* %arrayidx5, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body
- %7 = load i32* %i, align 4
- %inc = add nsw i32 %7, 1
- store i32 %inc, i32* %i, align 4
- br label %for.cond
-
-for.end: ; preds = %for.cond
- %8 = load i32* %y.addr, align 4
- %arrayidx6 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %8
- %9 = bitcast %struct.my_struct addrspace(4)* %arrayidx6 to i8 addrspace(4)*
- call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* %9, i32 12, i32 4, i1 false)
- %a7 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %10 = load i32* %a7, align 4
- %11 = load i32* %x.addr, align 4
- %arrayidx8 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %11
- %a9 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx8, i32 0, i32 0
- %12 = load i32 addrspace(4)* %a9, align 4
- %add10 = add nsw i32 %10, %12
- %13 = load i32* %x.addr, align 4
- %add11 = add nsw i32 %13, 1
- %arrayidx12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add11
- %b13 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx12, i32 0, i32 1
- %arrayidx14 = getelementptr inbounds [2 x i32] addrspace(4)* %b13, i32 0, i32 0
- %14 = load i32 addrspace(4)* %arrayidx14, align 4
- %add15 = add nsw i32 %add10, %14
- %15 = load i32* %x.addr, align 4
- %arrayidx16 = getelementptr inbounds [4 x i32] addrspace(1)* @g, i32 0, i32 %15
- %16 = load i32 addrspace(1)* %arrayidx16, align 4
- %add17 = add nsw i32 %add15, %16
- %17 = load i32 addrspace(1)* getelementptr inbounds ([4 x i32] addrspace(1)* @g, i32 0, i32 3), align 4
- %add18 = add nsw i32 %add17, %17
- %18 = load i32** %mem.addr, align 4
- %arrayidx19 = getelementptr inbounds i32* %18, i32 0
- store i32 %add18, i32* %arrayidx19
- ret void
-}
-
-declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.my_struct*, i32, i32*, i32)* @struct_cl}
diff --git a/backend/kernels/insert.cl.ll b/backend/kernels/insert.cl.ll
new file mode 100644
index 00000000..af9e50e5
--- /dev/null
+++ b/backend/kernels/insert.cl.ll
@@ -0,0 +1,80 @@
+; ModuleID = 'insert.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16
+ %1 = insertelement <4 x i32> %0, i32 1, i32 2
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %src, align 16
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @insert}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/insert.ll b/backend/kernels/insert.ll
deleted file mode 100644
index 5df1dd89..00000000
--- a/backend/kernels/insert.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; ModuleID = 'insert.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @insert(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16
- %1 = insertelement <4 x i32> %0, i32 1, i32 2
- store <4 x i32> %1, <4 x i32>* %src, align 16
- store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @insert}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/load_store.ll b/backend/kernels/load_store.cl.ll
index 53296872..84bc047d 100644
--- a/backend/kernels/load_store.ll
+++ b/backend/kernels/load_store.cl.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'load_store.o'
+; ModuleID = 'load_store.cl.o'
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
diff --git a/backend/kernels/loop.cl.ll b/backend/kernels/loop.cl.ll
new file mode 100644
index 00000000..692dfb7d
--- /dev/null
+++ b/backend/kernels/loop.cl.ll
@@ -0,0 +1,93 @@
+; ModuleID = 'loop.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry:
+ %cmp2 = icmp eq i32 %x, 0
+ br i1 %cmp2, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc1 = add nsw i32 %i.03, 1
+ %exitcond = icmp eq i32 %inc1, %x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop2.cl.ll b/backend/kernels/loop2.cl.ll
new file mode 100644
index 00000000..effe780f
--- /dev/null
+++ b/backend/kernels/loop2.cl.ll
@@ -0,0 +1,101 @@
+; ModuleID = 'loop2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry:
+ %cmp6 = icmp eq i32 %x, 0
+ br i1 %cmp6, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1
+ br label %for.body
+
+for.body: ; preds = %for.body, %for.body.lr.ph
+ %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ]
+ %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
+ %add = add nsw i32 %i.07, 1
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %cmp1 = icmp sgt i32 %1, 0
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07
+ %storemerge.v = select i1 %cmp1, i32 1, i32 2
+ %storemerge = add i32 %storemerge.v, %0
+ store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4
+ %exitcond = icmp eq i32 %add, %x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop2.ll b/backend/kernels/loop2.ll
deleted file mode 100644
index abb16d75..00000000
--- a/backend/kernels/loop2.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; ModuleID = 'loop2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.big = type { [10 x i32] }
-
-define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
-entry:
- %cmp6 = icmp eq i32 %x, 0
- br i1 %cmp6, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph: ; preds = %entry
- %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1
- br label %for.body
-
-for.body: ; preds = %for.body, %for.body.lr.ph
- %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ]
- %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
- %add = add nsw i32 %i.07, 1
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
- %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- %cmp1 = icmp sgt i32 %1, 0
- %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07
- %storemerge.v = select i1 %cmp1, i32 1, i32 2
- %storemerge = add i32 %storemerge.v, %0
- store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4
- %exitcond = icmp eq i32 %add, %x
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop3.cl b/backend/kernels/loop3.cl
new file mode 100644
index 00000000..acb55361
--- /dev/null
+++ b/backend/kernels/loop3.cl
@@ -0,0 +1,9 @@
+#include "stdlib.h"
+
+struct big { int x[10]; };
+
+__kernel void add(__global int *dst, unsigned int x, struct big b)
+{
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++;
+}
+
diff --git a/backend/kernels/loop3.cl.ll b/backend/kernels/loop3.cl.ll
new file mode 100644
index 00000000..378357e9
--- /dev/null
+++ b/backend/kernels/loop3.cl.ll
@@ -0,0 +1,100 @@
+; ModuleID = 'loop3.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+entry:
+ %cmp2 = icmp eq i32 %x, 0
+ br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph
+
+get_local_id.exit.lr.ph: ; preds = %entry
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit
+
+get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph
+ %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ]
+ %add = add i32 %call.i, %i.03
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc1 = add nsw i32 %i.03, 1
+ %exitcond = icmp eq i32 %inc1, %x
+ br i1 %exitcond, label %for.end, label %get_local_id.exit
+
+for.end: ; preds = %get_local_id.exit, %entry
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop.ll b/backend/kernels/loop3.ll
index 9d33968a..c25a6d32 100644
--- a/backend/kernels/loop.ll
+++ b/backend/kernels/loop3.ll
@@ -1,4 +1,4 @@
-; ModuleID = 'loop.o'
+; ModuleID = 'loop3.o'
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
@@ -7,22 +7,29 @@ target triple = "ptx32--"
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
- br i1 %cmp2, label %for.end, label %for.body
+ br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph
-for.body: ; preds = %for.body, %entry
- %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03
+get_local_id.exit.lr.ph: ; preds = %entry
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit
+
+get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph
+ %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ]
+ %add = add i32 %call.i, %i.03
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
%0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%inc = add nsw i32 %0, 1
store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%inc1 = add nsw i32 %i.03, 1
%exitcond = icmp eq i32 %inc1, %x
- br i1 %exitcond, label %for.end, label %for.body
+ br i1 %exitcond, label %for.end, label %get_local_id.exit
-for.end: ; preds = %for.body, %entry
+for.end: ; preds = %get_local_id.exit, %entry
ret void
}
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
!opencl.kernels = !{!0}
!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
diff --git a/backend/kernels/loop4.cl b/backend/kernels/loop4.cl
new file mode 100644
index 00000000..56fa72a0
--- /dev/null
+++ b/backend/kernels/loop4.cl
@@ -0,0 +1,12 @@
+#include "stdlib.h"
+
+struct big { int x[10]; };
+
+__kernel void add(__global int *dst, unsigned int x, struct big b)
+{
+ if (get_local_id(1) > 4)
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++;
+ else
+ for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++;
+}
+
diff --git a/backend/kernels/loop4.cl.ll b/backend/kernels/loop4.cl.ll
new file mode 100644
index 00000000..618c5503
--- /dev/null
+++ b/backend/kernels/loop4.cl.ll
@@ -0,0 +1,129 @@
+; ModuleID = 'loop4.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit:
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp = icmp ugt i32 %call3.i, 4
+ br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp124 = icmp eq i32 %x, 0
+ br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph
+
+get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit17
+
+for.cond5.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647
+ %cmp621 = icmp eq i32 %mul.mask, 0
+ br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph
+
+get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader
+ %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %0 = shl i32 %x, 1
+ br label %get_local_id.exit20
+
+get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph
+ %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ]
+ %add = add i32 %call.i, %i.025
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc3 = add nsw i32 %i.025, 1
+ %exitcond26 = icmp eq i32 %inc3, %x
+ br i1 %exitcond26, label %if.end, label %get_local_id.exit17
+
+get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph
+ %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ]
+ %add9 = add i32 %i4.022, %x
+ %add10 = add i32 %add9, %call.i18
+ %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10
+ %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc12 = add nsw i32 %2, 1
+ store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc14 = add nsw i32 %i4.022, 1
+ %exitcond = icmp eq i32 %inc14, %0
+ br i1 %exitcond, label %if.end, label %get_local_id.exit20
+
+if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop4.ll b/backend/kernels/loop4.ll
new file mode 100644
index 00000000..8b5a7463
--- /dev/null
+++ b/backend/kernels/loop4.ll
@@ -0,0 +1,67 @@
+; ModuleID = 'loop4.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit:
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp = icmp ugt i32 %call3.i, 4
+ br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp124 = icmp eq i32 %x, 0
+ br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph
+
+get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit17
+
+for.cond5.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647
+ %cmp621 = icmp eq i32 %mul.mask, 0
+ br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph
+
+get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader
+ %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %0 = shl i32 %x, 1
+ br label %get_local_id.exit20
+
+get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph
+ %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ]
+ %add = add i32 %call.i, %i.025
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc3 = add nsw i32 %i.025, 1
+ %exitcond26 = icmp eq i32 %inc3, %x
+ br i1 %exitcond26, label %if.end, label %get_local_id.exit17
+
+get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph
+ %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ]
+ %add9 = add i32 %i4.022, %x
+ %add10 = add i32 %add9, %call.i18
+ %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10
+ %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc12 = add nsw i32 %2, 1
+ store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1
+ %inc14 = add nsw i32 %i4.022, 1
+ %exitcond = icmp eq i32 %inc14, %0
+ br i1 %exitcond, label %if.end, label %get_local_id.exit20
+
+if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop5.cl b/backend/kernels/loop5.cl
new file mode 100644
index 00000000..d4a3aa24
--- /dev/null
+++ b/backend/kernels/loop5.cl
@@ -0,0 +1,17 @@
+#include "stdlib.h"
+
+struct big { int x[10]; };
+
+__kernel void add(__global int *dst0, __global int *dst1, unsigned int x, int y, struct big b)
+{
+ __global int *dst = NULL;
+ if (y > 0)
+ dst = dst0;
+ else
+ dst = dst1;
+ if (get_local_id(1) > 4)
+ for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++;
+ else
+ for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++;
+}
+
diff --git a/backend/kernels/loop5.cl.ll b/backend/kernels/loop5.cl.ll
new file mode 100644
index 00000000..b97ad7b5
--- /dev/null
+++ b/backend/kernels/loop5.cl.ll
@@ -0,0 +1,131 @@
+; ModuleID = 'loop5.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit:
+ %cmp = icmp sgt i32 %y, 0
+ %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp1 = icmp ugt i32 %call3.i, 4
+ br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp328 = icmp eq i32 %x, 0
+ br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph
+
+get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit21
+
+for.cond8.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647
+ %cmp925 = icmp eq i32 %mul.mask, 0
+ br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph
+
+get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader
+ %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %0 = shl i32 %x, 1
+ br label %get_local_id.exit24
+
+get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph
+ %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ]
+ %add = add i32 %call.i, %i.029
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc5 = add nsw i32 %i.029, 1
+ %exitcond30 = icmp eq i32 %inc5, %x
+ br i1 %exitcond30, label %if.end19, label %get_local_id.exit21
+
+get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph
+ %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ]
+ %add12 = add i32 %i7.026, %x
+ %add13 = add i32 %add12, %call.i22
+ %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13
+ %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc15 = add nsw i32 %2, 1
+ store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc17 = add nsw i32 %i7.026, 1
+ %exitcond = icmp eq i32 %inc17, %0
+ br i1 %exitcond, label %if.end19, label %get_local_id.exit24
+
+if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/loop5.ll b/backend/kernels/loop5.ll
new file mode 100644
index 00000000..089beee5
--- /dev/null
+++ b/backend/kernels/loop5.ll
@@ -0,0 +1,131 @@
+; ModuleID = 'loop5.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.big = type { [10 x i32] }
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline {
+get_local_id.exit:
+ %cmp = icmp sgt i32 %y, 0
+ %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1
+ %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+ %cmp1 = icmp ugt i32 %call3.i, 4
+ br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader
+
+for.cond.preheader: ; preds = %get_local_id.exit
+ %cmp328 = icmp eq i32 %x, 0
+ br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph
+
+get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ br label %get_local_id.exit21
+
+for.cond8.preheader: ; preds = %get_local_id.exit
+ %mul.mask = and i32 %x, 2147483647
+ %cmp925 = icmp eq i32 %mul.mask, 0
+ br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph
+
+get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader
+ %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %0 = shl i32 %x, 1
+ br label %get_local_id.exit24
+
+get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph
+ %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ]
+ %add = add i32 %call.i, %i.029
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add
+ %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc5 = add nsw i32 %i.029, 1
+ %exitcond30 = icmp eq i32 %inc5, %x
+ br i1 %exitcond30, label %if.end19, label %get_local_id.exit21
+
+get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph
+ %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ]
+ %add12 = add i32 %i7.026, %x
+ %add13 = add i32 %add12, %call.i22
+ %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13
+ %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc15 = add nsw i32 %2, 1
+ store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1
+ %inc17 = add nsw i32 %i7.026, 1
+ %exitcond = icmp eq i32 %inc17, %0
+ br i1 %exitcond, label %if.end19, label %get_local_id.exit24
+
+if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/mad.cl b/backend/kernels/mad.cl
deleted file mode 100644
index 5875a9bf..00000000
--- a/backend/kernels/mad.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "stdlib.h"
-__attribute__((pure, overloadable)) int mad(int,int,int);
-__attribute__((pure, overloadable)) float mad(float,float,float);
-__attribute__((pure, overloadable)) float4 mad(float4,float4,float4);
-
-__kernel void add(__global int *dst, unsigned int x, float z)
-{
- for (int i = 0; i < x; ++i) {
- int y = mad(dst[i], 2, 3);
- y = mad(dst[i], 2, 3);
- float z = mad((float) dst[i], 2.f, 3.f);
- float4 z0 = mad((float4) dst[i], (float4)(0.f,1.f,2.f,3.f), (float4)3.f);
- float4 x0 = z0 * (float4) 2.f;
- dst[i] = y + (int) z + x0.x + x0.y + x0.z;
- }
-}
-
-
diff --git a/backend/kernels/mad.cl.ll b/backend/kernels/mad.cl.ll
new file mode 100644
index 00000000..6a281450
--- /dev/null
+++ b/backend/kernels/mad.cl.ll
@@ -0,0 +1,113 @@
+; ModuleID = 'mad.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, float %z) nounwind noinline {
+entry:
+ %cmp16 = icmp eq i32 %x, 0
+ br i1 %cmp16, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.017
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly
+ %conv = sitofp i32 %0 to float
+ %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readnone
+ %call.i = tail call ptx_device float @_Z3madfff(float %conv, float 0.000000e+00, float 3.000000e+00) nounwind readnone
+ %vecinit.i = insertelement <4 x float> undef, float %call.i, i32 0
+ %call1.i = tail call ptx_device float @_Z3madfff(float %conv, float 1.000000e+00, float 3.000000e+00) nounwind readnone
+ %vecinit2.i = insertelement <4 x float> %vecinit.i, float %call1.i, i32 1
+ %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %call5, i32 2
+ %call5.i = tail call ptx_device float @_Z3madfff(float %conv, float 3.000000e+00, float 3.000000e+00) nounwind readnone
+ %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %call5.i, i32 3
+ %mul = fmul <4 x float> %vecinit6.i, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+ %conv9 = fptosi float %call5 to i32
+ %add = add nsw i32 %conv9, %call2
+ %conv10 = sitofp i32 %add to float
+ %1 = extractelement <4 x float> %mul, i32 0
+ %add11 = fadd float %conv10, %1
+ %2 = extractelement <4 x float> %mul, i32 1
+ %add12 = fadd float %add11, %2
+ %3 = extractelement <4 x float> %mul, i32 2
+ %add13 = fadd float %add12, %3
+ %conv14 = fptosi float %add13 to i32
+ store i32 %conv14, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %inc = add nsw i32 %i.017, 1
+ %exitcond = icmp eq i32 %inc, %x
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32, float)* @add}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/mad.ll b/backend/kernels/mad.ll
deleted file mode 100644
index 6bd19daf..00000000
--- a/backend/kernels/mad.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; ModuleID = 'mad.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @add(i32* nocapture %dst, i32 %x, float %z) nounwind noinline {
-entry:
- %cmp16 = icmp eq i32 %x, 0
- br i1 %cmp16, label %for.end, label %for.body
-
-for.body: ; preds = %for.body, %entry
- %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
- %arrayidx = getelementptr inbounds i32* %dst, i32 %i.017
- %0 = load i32* %arrayidx, align 4, !tbaa !1
- %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly
- %conv = sitofp i32 %0 to float
- %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readonly
- %1 = insertelement <4 x float> undef, float %conv, i32 0
- %splat = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
- %call8 = tail call ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %splat, <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>) nounwind readonly
- %mul = fmul <4 x float> %call8, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
- %conv9 = fptosi float %call5 to i32
- %add = add nsw i32 %conv9, %call2
- %conv10 = sitofp i32 %add to float
- %2 = extractelement <4 x float> %mul, i32 0
- %add11 = fadd float %conv10, %2
- %3 = extractelement <4 x float> %mul, i32 1
- %add12 = fadd float %add11, %3
- %4 = extractelement <4 x float> %mul, i32 2
- %add13 = fadd float %add12, %4
- %conv14 = fptosi float %add13 to i32
- store i32 %conv14, i32* %arrayidx, align 4, !tbaa !1
- %inc = add nsw i32 %i.017, 1
- %exitcond = icmp eq i32 %inc, %x
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end: ; preds = %for.body, %entry
- ret void
-}
-
-declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readonly
-
-declare ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) nounwind readonly
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32, float)* @add}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/select.cl.ll b/backend/kernels/select.cl.ll
new file mode 100644
index 00000000..ebf1ad0b
--- /dev/null
+++ b/backend/kernels/select.cl.ll
@@ -0,0 +1,100 @@
+; ModuleID = 'select.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1
+ %arrayidx1 = getelementptr inbounds <4 x i32> addrspace(1)* %src0, i32 1
+ %1 = load <4 x i32> addrspace(1)* %arrayidx1, align 16, !tbaa !1
+ %2 = extractelement <4 x i32> %0, i32 0
+ %3 = extractelement <4 x i32> %1, i32 0
+ %4 = extractelement <4 x i32> %0, i32 1
+ %5 = extractelement <4 x i32> %1, i32 1
+ %6 = extractelement <4 x i32> %0, i32 2
+ %7 = extractelement <4 x i32> %1, i32 2
+ %8 = extractelement <4 x i32> %0, i32 3
+ %9 = extractelement <4 x i32> %1, i32 3
+ %tobool.i = icmp slt i32 %3, 0
+ %cond1.i = select i1 %tobool.i, i32 %3, i32 %2
+ %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0
+ %tobool3.i = icmp slt i32 %5, 0
+ %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4
+ %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1
+ %tobool9.i = icmp slt i32 %7, 0
+ %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6
+ %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2
+ %tobool15.i = icmp slt i32 %9, 0
+ %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8
+ %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3
+ store <4 x i32> %13, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*)* @test_select}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/select.ll b/backend/kernels/select.ll
deleted file mode 100644
index a3d7e16f..00000000
--- a/backend/kernels/select.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; ModuleID = 'select.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_select(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src0, <4 x i32>* nocapture %src1) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src0, align 16, !tbaa !1
- %arrayidx1 = getelementptr inbounds <4 x i32>* %src0, i32 1
- %1 = load <4 x i32>* %arrayidx1, align 16, !tbaa !1
- %2 = extractelement <4 x i32> %0, i32 0
- %3 = extractelement <4 x i32> %1, i32 0
- %4 = extractelement <4 x i32> %0, i32 1
- %5 = extractelement <4 x i32> %1, i32 1
- %6 = extractelement <4 x i32> %0, i32 2
- %7 = extractelement <4 x i32> %1, i32 2
- %8 = extractelement <4 x i32> %0, i32 3
- %9 = extractelement <4 x i32> %1, i32 3
- %tobool.i = icmp slt i32 %3, 0
- %cond1.i = select i1 %tobool.i, i32 %3, i32 %2
- %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0
- %tobool3.i = icmp slt i32 %5, 0
- %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4
- %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1
- %tobool9.i = icmp slt i32 %7, 0
- %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6
- %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2
- %tobool15.i = icmp slt i32 %9, 0
- %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8
- %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3
- store <4 x i32> %13, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @test_select}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/short.cl.ll b/backend/kernels/short.cl.ll
new file mode 100644
index 00000000..8ad601be
--- /dev/null
+++ b/backend/kernels/short.cl.ll
@@ -0,0 +1,79 @@
+; ModuleID = 'short.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
+entry:
+ %add = add i16 %y, %x
+ store i16 %add, i16 addrspace(1)* %dst, align 2, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i16 addrspace(1)*, i16, i16)* @short_write}
+!1 = metadata !{metadata !"short", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/short.ll b/backend/kernels/short.ll
deleted file mode 100644
index 6225107a..00000000
--- a/backend/kernels/short.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; ModuleID = 'short.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @short_write(i16* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
-entry:
- %add = add i16 %y, %x
- store i16 %add, i16* %dst, align 2, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i16*, i16, i16)* @short_write}
-!1 = metadata !{metadata !"short", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/shuffle.cl.ll b/backend/kernels/shuffle.cl.ll
new file mode 100644
index 00000000..d503d143
--- /dev/null
+++ b/backend/kernels/shuffle.cl.ll
@@ -0,0 +1,79 @@
+; ModuleID = 'shuffle.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
+ %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
+ store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @shuffle}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/shuffle.ll b/backend/kernels/shuffle.ll
deleted file mode 100644
index e17a6844..00000000
--- a/backend/kernels/shuffle.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-; ModuleID = 'shuffle.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @shuffle(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline {
-entry:
- %0 = load <4 x i32>* %src, align 16, !tbaa !1
- %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
- store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @shuffle}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4.cl.ll b/backend/kernels/simple_float4.cl.ll
new file mode 100644
index 00000000..c11f23a1
--- /dev/null
+++ b/backend/kernels/simple_float4.cl.ll
@@ -0,0 +1,83 @@
+; ModuleID = 'simple_float4.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit5:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4.ll b/backend/kernels/simple_float4.ll
deleted file mode 100644
index 0e052084..00000000
--- a/backend/kernels/simple_float4.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; ModuleID = 'simple_float4.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %arrayidx2 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %0, <4 x float>* %arrayidx2, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4_2.cl.ll b/backend/kernels/simple_float4_2.cl.ll
new file mode 100644
index 00000000..8d7f4fbc
--- /dev/null
+++ b/backend/kernels/simple_float4_2.cl.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'simple_float4_2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit10:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %mul = fmul <4 x float> %0, %0
+ %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4_2.ll b/backend/kernels/simple_float4_2.ll
deleted file mode 100644
index 4f5e1da4..00000000
--- a/backend/kernels/simple_float4_2.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; ModuleID = 'simple_float4_2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline {
-get_global_id.exit10:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %mul = fmul <4 x float> %0, %0
- %arrayidx4 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %mul, <4 x float>* %arrayidx4, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4_3.cl b/backend/kernels/simple_float4_3.cl
index 25c8fe4f..d908433f 100644
--- a/backend/kernels/simple_float4_3.cl
+++ b/backend/kernels/simple_float4_3.cl
@@ -2,7 +2,7 @@
__kernel void simple_float4(__global float4 *dst, __global float4 *src, bool b)
{
- dst[get_global_id(0)] = select(b, src[get_global_id(0)], src[get_global_id(1)]);
+ dst[get_global_id(0)] = select(src[get_global_id(0)], src[get_global_id(1)], (int4)(b));
dst[get_global_id(0)] += (float4) (src[2].x, 1.f, 2.f, 3.f);
}
diff --git a/backend/kernels/simple_float4_3.cl.ll b/backend/kernels/simple_float4_3.cl.ll
new file mode 100644
index 00000000..a41afb18
--- /dev/null
+++ b/backend/kernels/simple_float4_3.cl.ll
@@ -0,0 +1,92 @@
+; ModuleID = 'simple_float4_3.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline {
+get_global_id.exit16:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
+ %arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2
+ %1 = load <4 x float> addrspace(1)* %arrayidx6, align 16
+ %2 = extractelement <4 x float> %1, i32 0
+ %vecinit = insertelement <4 x float> undef, float %2, i32 0
+ %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1
+ %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2
+ %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3
+ %add = fadd <4 x float> %0, %vecinit9
+ store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*, i1)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/simple_float4_3.ll b/backend/kernels/simple_float4_3.ll
deleted file mode 100644
index c1bdd31c..00000000
--- a/backend/kernels/simple_float4_3.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; ModuleID = 'simple_float4_3.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src, i1 %b) nounwind noinline {
-get_global_id.exit16:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i
- %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1
- %call3.i = tail call ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone
- %arrayidx2 = getelementptr inbounds <4 x float>* %src, i32 %call3.i
- %1 = load <4 x float>* %arrayidx2, align 16, !tbaa !1
- %x.y.i = select i1 %b, <4 x float> %0, <4 x float> %1
- %arrayidx5 = getelementptr inbounds <4 x float>* %dst, i32 %call.i
- store <4 x float> %x.y.i, <4 x float>* %arrayidx5, align 16, !tbaa !1
- %arrayidx6 = getelementptr inbounds <4 x float>* %src, i32 2
- %2 = load <4 x float>* %arrayidx6, align 16
- %3 = extractelement <4 x float> %2, i32 0
- %vecinit = insertelement <4 x float> undef, float %3, i32 0
- %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1
- %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2
- %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3
- %add = fadd <4 x float> %x.y.i, %vecinit9
- store <4 x float> %add, <4 x float>* %arrayidx5, align 16, !tbaa !1
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-declare ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (<4 x float>*, <4 x float>*, i1)* @simple_float4}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/stdlib.h b/backend/kernels/stdlib.h
index 0701ff83..7d4e97ec 100644
--- a/backend/kernels/stdlib.h
+++ b/backend/kernels/stdlib.h
@@ -23,6 +23,7 @@ __attribute__((pure,const)) unsigned int __gen_ocl_get_global_id2(void);
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id0(void);
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id1(void);
__attribute__((pure,const)) unsigned int __gen_ocl_get_local_id2(void);
+__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c);
inline unsigned get_global_id(unsigned int dim) {
if (dim == 0) return __gen_ocl_get_global_id0();
@@ -59,30 +60,48 @@ typedef bool bool2 __attribute__((ext_vector_type(2)));
typedef bool bool3 __attribute__((ext_vector_type(3)));
typedef bool bool4 __attribute__((ext_vector_type(4)));
-__attribute__((overloadable)) inline int4 select(int4 src0, int4 src1, int4 cond) {
- int4 dst;
- const int x0 = src0.x; // Fix performance issue with CLANG
- const int x1 = src1.x;
- const int y0 = src0.y;
- const int y1 = src1.y;
- const int z0 = src0.z;
- const int z1 = src1.z;
- const int w0 = src0.w;
- const int w1 = src1.w;
+// This will be optimized out by LLVM and will output LLVM select instructions
+#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \
+__attribute__((overloadable)) \
+inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \
+ TYPE4 dst; \
+ const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \
+ const TYPE x1 = src1.x; \
+ const TYPE y0 = src0.y; \
+ const TYPE y1 = src1.y; \
+ const TYPE z0 = src0.z; \
+ const TYPE z1 = src1.z; \
+ const TYPE w0 = src0.w; \
+ const TYPE w1 = src1.w; \
+ \
+ dst.x = (cond.x & MASK) ? x1 : x0; \
+ dst.y = (cond.y & MASK) ? y1 : y0; \
+ dst.z = (cond.z & MASK) ? z1 : z0; \
+ dst.w = (cond.w & MASK) ? w1 : w0; \
+ return dst; \
+}
+DECL_SELECT4(int4, int, int4, 0x80000000)
+DECL_SELECT4(float4, float, int4, 0x80000000)
+#undef DECL_SELECT4
- dst.x = (cond.x & 0x80000000) ? x1 : x0;
- dst.y = (cond.y & 0x80000000) ? y1 : y0;
- dst.z = (cond.z & 0x80000000) ? z1 : z0;
- dst.w = (cond.w & 0x80000000) ? w1 : w0;
- return dst;
+__attribute__((overloadable)) float2 mad(float2 a, float2 b, float2 c) {
+ return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y));
+}
+__attribute__((overloadable)) float3 mad(float3 a, float3 b, float3 c) {
+ return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z));
+}
+__attribute__((overloadable)) float4 mad(float4 a, float4 b, float4 c) {
+ return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y),
+ mad(a.z,b.z,c.z), mad(a.w,b.w,c.w));
}
#define __private __attribute__((address_space(0)))
#define __global __attribute__((address_space(1)))
#define __constant __attribute__((address_space(2)))
-#define __local __attribute__((address_space(3)))
+//#define __local __attribute__((address_space(3)))
#define global __global
-#define local __local
+//#define local __local
#define constant __constant
#define private __private
+#define NULL ((void*)0)
diff --git a/backend/kernels/store.cl.ll b/backend/kernels/store.cl.ll
new file mode 100644
index 00000000..b74e69a6
--- /dev/null
+++ b/backend/kernels/store.cl.ll
@@ -0,0 +1,78 @@
+; ModuleID = 'store.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(4)*, i32)* @store}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/store.ll b/backend/kernels/store.ll
deleted file mode 100644
index c885d62e..00000000
--- a/backend/kernels/store.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; ModuleID = 'store.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @store(i32* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
-entry:
- store i32 1, i32* %dst, align 4, !tbaa !1
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32 addrspace(4)*, i32)* @store}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/struct.cl.ll b/backend/kernels/struct.cl.ll
new file mode 100644
index 00000000..fc89f46d
--- /dev/null
+++ b/backend/kernels/struct.cl.ll
@@ -0,0 +1,128 @@
+; ModuleID = 'struct.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [2 x i32] }
+
+@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
+@struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4
+@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.023 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %a = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 0
+ store i32 %i.023, i32 addrspace(4)* %a, align 4, !tbaa !1
+ %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 1, i32 0
+ %add = add nsw i32 %i.023, 1
+ store i32 %add, i32 addrspace(4)* %arrayidx2, align 4, !tbaa !1
+ %exitcond = icmp eq i32 %add, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ %cmp6 = icmp eq i32 %y, 0
+ br i1 %cmp6, label %if.then, label %if.else
+
+if.then: ; preds = %for.end
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i32 12, i32 4, i1 false)
+ br label %if.end
+
+if.else: ; preds = %for.end
+ %add8 = add nsw i32 %y, 1
+ %arrayidx9 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add8
+ %0 = bitcast %struct.my_struct addrspace(4)* %arrayidx9 to i8 addrspace(4)*
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* %0, i32 12, i32 4, i1 false)
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i32 12, i32 4, i1 false)
+ %a10 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
+ %1 = load i32* %a10, align 4, !tbaa !1
+ %a12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %x, i32 0
+ %2 = load i32 addrspace(4)* %a12, align 4, !tbaa !1
+ %add14 = add nsw i32 %x, 1
+ %arrayidx17 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add14, i32 1, i32 0
+ %3 = load i32 addrspace(4)* %arrayidx17, align 4, !tbaa !1
+ %arrayidx19 = getelementptr inbounds [4 x i32] addrspace(2)* @g, i32 0, i32 %x
+ %4 = load i32 addrspace(2)* %arrayidx19, align 4, !tbaa !1
+ %add13 = add i32 %1, 3
+ %add18 = add i32 %add13, %2
+ %add20 = add i32 %add18, %3
+ %add21 = add i32 %add20, %4
+ store i32 %add21, i32 addrspace(1)* %mem, align 4, !tbaa !1
+ ret void
+}
+
+declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct*, i32, i32 addrspace(1)*, i32)* @struct_cl}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/struct.ll b/backend/kernels/struct.ll
deleted file mode 100644
index d1273bee..00000000
--- a/backend/kernels/struct.ll
+++ /dev/null
@@ -1,121 +0,0 @@
-; ModuleID = 'struct.o'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 16
-@struct_cl.hop = internal global %struct.my_struct zeroinitializer, align 4
-@struct_cl.array = internal global [256 x %struct.my_struct] zeroinitializer, align 16
-
-define void @struct_cl(i64 %s.coerce0, i32 %s.coerce1, i32 %x, i32* %mem, i32 %y) nounwind uwtable {
-entry:
- %s = alloca %struct.my_struct, align 8
- %x.addr = alloca i32, align 4
- %mem.addr = alloca i32*, align 8
- %y.addr = alloca i32, align 4
- %i = alloca i32, align 4
- %0 = bitcast %struct.my_struct* %s to { i64, i32 }*
- %1 = getelementptr { i64, i32 }* %0, i32 0, i32 0
- store i64 %s.coerce0, i64* %1
- %2 = getelementptr { i64, i32 }* %0, i32 0, i32 1
- store i32 %s.coerce1, i32* %2
- store i32 %x, i32* %x.addr, align 4
- store i32* %mem, i32** %mem.addr, align 8
- store i32 %y, i32* %y.addr, align 4
- store i32 0, i32* %i, align 4
- br label %for.cond
-
-for.cond: ; preds = %for.inc, %entry
- %3 = load i32* %i, align 4
- %cmp = icmp slt i32 %3, 256
- br i1 %cmp, label %for.body, label %for.end
-
-for.body: ; preds = %for.cond
- %4 = load i32* %i, align 4
- %5 = load i32* %i, align 4
- %idxprom = sext i32 %5 to i64
- %arrayidx = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom
- %a = getelementptr inbounds %struct.my_struct* %arrayidx, i32 0, i32 0
- store i32 %4, i32* %a, align 4
- %6 = load i32* %i, align 4
- %7 = load i32* %i, align 4
- %idxprom1 = sext i32 %7 to i64
- %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom1
- %b = getelementptr inbounds %struct.my_struct* %arrayidx2, i32 0, i32 1
- %arrayidx3 = getelementptr inbounds [2 x i32]* %b, i32 0, i64 0
- store i32 %6, i32* %arrayidx3, align 4
- %8 = load i32* %i, align 4
- %add = add nsw i32 %8, 1
- %9 = load i32* %i, align 4
- %idxprom4 = sext i32 %9 to i64
- %arrayidx5 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom4
- %b6 = getelementptr inbounds %struct.my_struct* %arrayidx5, i32 0, i32 1
- %arrayidx7 = getelementptr inbounds [2 x i32]* %b6, i32 0, i64 0
- store i32 %add, i32* %arrayidx7, align 4
- br label %for.inc
-
-for.inc: ; preds = %for.body
- %10 = load i32* %i, align 4
- %inc = add nsw i32 %10, 1
- store i32 %inc, i32* %i, align 4
- br label %for.cond
-
-for.end: ; preds = %for.cond
- %11 = load i32* %y.addr, align 4
- %cmp8 = icmp eq i32 %11, 0
- br i1 %cmp8, label %if.then, label %if.else
-
-if.then: ; preds = %for.end
- %12 = load i32* %y.addr, align 4
- %idxprom9 = sext i32 %12 to i64
- %arrayidx10 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom9
- %13 = bitcast %struct.my_struct* %arrayidx10 to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %13, i64 12, i32 4, i1 false)
- br label %if.end
-
-if.else: ; preds = %for.end
- %14 = load i32* %y.addr, align 4
- %add11 = add nsw i32 %14, 1
- %idxprom12 = sext i32 %add11 to i64
- %arrayidx13 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom12
- %15 = bitcast %struct.my_struct* %arrayidx13 to i8*
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %15, i64 12, i32 4, i1 false)
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([256 x %struct.my_struct]* @struct_cl.array to i8*), i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i64 12, i32 4, i1 false)
- %a14 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %16 = load i32* %a14, align 4
- %17 = load i32* %x.addr, align 4
- %idxprom15 = sext i32 %17 to i64
- %arrayidx16 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom15
- %a17 = getelementptr inbounds %struct.my_struct* %arrayidx16, i32 0, i32 0
- %18 = load i32* %a17, align 4
- %add18 = add nsw i32 %16, %18
- %19 = load i32* %x.addr, align 4
- %add19 = add nsw i32 %19, 1
- %idxprom20 = sext i32 %add19 to i64
- %arrayidx21 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom20
- %b22 = getelementptr inbounds %struct.my_struct* %arrayidx21, i32 0, i32 1
- %arrayidx23 = getelementptr inbounds [2 x i32]* %b22, i32 0, i64 0
- %20 = load i32* %arrayidx23, align 4
- %add24 = add nsw i32 %add18, %20
- %21 = load i32* %x.addr, align 4
- %idxprom25 = sext i32 %21 to i64
- %arrayidx26 = getelementptr inbounds [4 x i32]* @g, i32 0, i64 %idxprom25
- %22 = load i32* %arrayidx26, align 4
- %add27 = add nsw i32 %add24, %22
- %23 = load i32* getelementptr inbounds ([4 x i32]* @g, i32 0, i64 3), align 4
- %add28 = add nsw i32 %add27, %23
- %24 = load i32** %mem.addr, align 8
- %arrayidx29 = getelementptr inbounds i32* %24, i64 0
- store i32 %add28, i32* %arrayidx29
- ret void
-}
-
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i64, i32, i32, i32*, i32)* @struct_cl}
diff --git a/backend/kernels/struct2.cl.ll b/backend/kernels/struct2.cl.ll
new file mode 100644
index 00000000..2c5f8b16
--- /dev/null
+++ b/backend/kernels/struct2.cl.ll
@@ -0,0 +1,100 @@
+; ModuleID = 'struct2.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+%struct.my_struct = type { i32, [2 x i32] }
+
+@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
+entry:
+ %cmp = icmp eq i32 %y, 0
+ br i1 %cmp, label %if.end, label %if.else
+
+if.else: ; preds = %entry
+ %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
+ %tmp4 = load i32* %s.0, align 4
+ %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0
+ %tmp5 = load i32* %s.1.0, align 4
+ %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1
+ %tmp6 = load i32* %s.1.1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.else, %entry
+ %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ]
+ %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ]
+ %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ]
+ %mem.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 0
+ store i32 %hop.0.0, i32 addrspace(1)* %mem.0, align 4
+ %mem.1.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 0
+ store i32 %hop.1.0.0, i32 addrspace(1)* %mem.1.0, align 4
+ %mem.1.1 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 1
+ store i32 %hop.1.1.0, i32 addrspace(1)* %mem.1.1, align 4
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct addrspace(1)*, i32)* @struct_cl}
diff --git a/backend/kernels/struct2.ll b/backend/kernels/struct2.ll
deleted file mode 100644
index ef1f3ab6..00000000
--- a/backend/kernels/struct2.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; ModuleID = 'struct2.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-%struct.my_struct = type { i32, [2 x i32] }
-
-@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-
-define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct* nocapture %mem, i32 %y) nounwind noinline {
-entry:
- %cmp = icmp eq i32 %y, 0
- br i1 %cmp, label %if.end, label %if.else
-
-if.else: ; preds = %entry
- %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0
- %tmp4 = load i32* %s.0, align 4
- %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0
- %tmp5 = load i32* %s.1.0, align 4
- %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1
- %tmp6 = load i32* %s.1.1, align 4
- br label %if.end
-
-if.end: ; preds = %if.else, %entry
- %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ]
- %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ]
- %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ]
- %mem.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 0
- store i32 %hop.0.0, i32* %mem.0, align 4
- %mem.1.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 0
- store i32 %hop.1.0.0, i32* %mem.1.0, align 4
- %mem.1.1 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 1
- store i32 %hop.1.1.0, i32* %mem.1.1, align 4
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct*, i32)* @struct_cl}
diff --git a/backend/kernels/test_select.cl.ll b/backend/kernels/test_select.cl.ll
new file mode 100644
index 00000000..478dcbbf
--- /dev/null
+++ b/backend/kernels/test_select.cl.ll
@@ -0,0 +1,86 @@
+; ModuleID = 'test_select.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit7:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i
+ %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
+ %cmp = icmp sgt i32 %0, 1
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i
+ %. = select i1 %cmp, i32 1, i32 2
+ store i32 %., i32 addrspace(1)* %arrayidx2, align 4
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_select}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/test_select.ll b/backend/kernels/test_select.ll
deleted file mode 100644
index 302251ca..00000000
--- a/backend/kernels/test_select.ll
+++ /dev/null
@@ -1,24 +0,0 @@
-; ModuleID = 'test_select.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @test_select(i32* nocapture %dst, i32* nocapture %src) nounwind noinline {
-get_global_id.exit7:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32* %src, i32 %call.i
- %0 = load i32* %arrayidx, align 4, !tbaa !1
- %cmp = icmp sgt i32 %0, 1
- %arrayidx2 = getelementptr inbounds i32* %dst, i32 %call.i
- %. = select i1 %cmp, i32 1, i32 2
- store i32 %., i32* %arrayidx2, align 4
- ret void
-}
-
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*, i32*)* @test_select}
-!1 = metadata !{metadata !"int", metadata !2}
-!2 = metadata !{metadata !"omnipotent char", metadata !3}
-!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/undefined.cl.ll b/backend/kernels/undefined.cl.ll
new file mode 100644
index 00000000..f6446426
--- /dev/null
+++ b/backend/kernels/undefined.cl.ll
@@ -0,0 +1,78 @@
+; ModuleID = 'undefined.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline {
+entry:
+ store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (i32 addrspace(1)*)* @undefined}
+!1 = metadata !{metadata !"int", metadata !2}
+!2 = metadata !{metadata !"omnipotent char", metadata !3}
+!3 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/undefined.ll b/backend/kernels/undefined.ll
deleted file mode 100644
index a706e7ba..00000000
--- a/backend/kernels/undefined.ll
+++ /dev/null
@@ -1,32 +0,0 @@
-; ModuleID = 'undefined.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @undefined(i32* %dst) nounwind noinline {
-entry:
- %dst.addr = alloca i32*, align 4
- %x = alloca i32, align 4
- store i32* %dst, i32** %dst.addr, align 4
- %0 = load i32* %x, align 4
- %cmp = icmp eq i32 %0, 0
- br i1 %cmp, label %if.then, label %if.else
-
-if.then: ; preds = %entry
- %1 = load i32** %dst.addr, align 4
- %arrayidx = getelementptr inbounds i32* %1, i32 0
- store i32 0, i32* %arrayidx
- br label %if.end
-
-if.else: ; preds = %entry
- %2 = load i32** %dst.addr, align 4
- %arrayidx1 = getelementptr inbounds i32* %2, i32 0
- store i32 1, i32* %arrayidx1
- br label %if.end
-
-if.end: ; preds = %if.else, %if.then
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void (i32*)* @undefined}
diff --git a/backend/kernels/vector_constant.cl b/backend/kernels/vector_constant.cl
new file mode 100644
index 00000000..fc9d308f
--- /dev/null
+++ b/backend/kernels/vector_constant.cl
@@ -0,0 +1,7 @@
+#include "stdlib.h"
+
+__kernel void simple_float4(__global float4 *dst, __global float4 *src)
+{
+ dst[get_global_id(0)] = src[get_global_id(0)] + (float4)(0.f,1.f,2.f,3.f);
+}
+
diff --git a/backend/kernels/vector_constant.cl.ll b/backend/kernels/vector_constant.cl.ll
new file mode 100644
index 00000000..de644cd3
--- /dev/null
+++ b/backend/kernels/vector_constant.cl.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'vector_constant.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
+get_global_id.exit5:
+ %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+ %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
+ %add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
+ ret void
+}
+
+declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/backend/kernels/void.cl.ll b/backend/kernels/void.cl.ll
new file mode 100644
index 00000000..151fc1cb
--- /dev/null
+++ b/backend/kernels/void.cl.ll
@@ -0,0 +1,74 @@
+; ModuleID = 'void.cl.o'
+target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
+target triple = "ptx32--"
+
+define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <2 x float> %a, i32 0
+ %1 = extractelement <2 x float> %b, i32 0
+ %2 = extractelement <2 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <2 x float> undef, float %call, i32 0
+ %3 = extractelement <2 x float> %a, i32 1
+ %4 = extractelement <2 x float> %b, i32 1
+ %5 = extractelement <2 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
+ ret <2 x float> %vecinit2
+}
+
+declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
+
+define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <3 x float> %a, i32 0
+ %1 = extractelement <3 x float> %b, i32 0
+ %2 = extractelement <3 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <3 x float> undef, float %call, i32 0
+ %3 = extractelement <3 x float> %a, i32 1
+ %4 = extractelement <3 x float> %b, i32 1
+ %5 = extractelement <3 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <3 x float> %a, i32 2
+ %7 = extractelement <3 x float> %b, i32 2
+ %8 = extractelement <3 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
+ ret <3 x float> %vecinit4
+}
+
+define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
+entry:
+ %0 = extractelement <4 x float> %a, i32 0
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %c, i32 0
+ %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
+ %vecinit = insertelement <4 x float> undef, float %call, i32 0
+ %3 = extractelement <4 x float> %a, i32 1
+ %4 = extractelement <4 x float> %b, i32 1
+ %5 = extractelement <4 x float> %c, i32 1
+ %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
+ %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
+ %6 = extractelement <4 x float> %a, i32 2
+ %7 = extractelement <4 x float> %b, i32 2
+ %8 = extractelement <4 x float> %c, i32 2
+ %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
+ %9 = extractelement <4 x float> %a, i32 3
+ %10 = extractelement <4 x float> %b, i32 3
+ %11 = extractelement <4 x float> %c, i32 3
+ %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
+ %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
+ ret <4 x float> %vecinit6
+}
+
+define ptx_kernel void @hop() nounwind readnone noinline {
+entry:
+ ret void
+}
+
+!opencl.kernels = !{!0}
+
+!0 = metadata !{void ()* @hop}
diff --git a/backend/kernels/void.ll b/backend/kernels/void.ll
deleted file mode 100644
index 3c6c269b..00000000
--- a/backend/kernels/void.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; ModuleID = 'void.o'
-target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
-target triple = "ptx32--"
-
-define ptx_kernel void @hop() nounwind readnone noinline {
-entry:
- ret void
-}
-
-!opencl.kernels = !{!0}
-
-!0 = metadata !{void ()* @hop}