summary | refs | log | tree | commit | diff
path: root/backend/kernels
diff options
context:
space:
mode:
author: Benjamin Segovia <segovia.benjamin@gmail.com> 2012-04-12 18:51:11 +0000
committer: Keith Packard <keithp@keithp.com> 2012-08-10 16:16:18 -0700
commit: bfba59eb36af59440dabd2c9ec508eac8a968ab3 (patch)
tree: b539e3080ee7cd30af558210daec221f3e7bdec8 /backend/kernels
parent: 99d9b8998677cb870fb0fba9a48a353297821c43 (diff)
Recompiled kernels with the new stdlib. Added definition and use sets per register.
Diffstat (limited to 'backend/kernels')
-rw-r--r-- backend/kernels/add.cl.ll 62
-rw-r--r-- backend/kernels/add2.cl.ll 62
-rw-r--r-- backend/kernels/cmp.cl.ll 62
-rw-r--r-- backend/kernels/cmp_cvt.cl.ll 62
-rw-r--r-- backend/kernels/complex_struct.cl.ll 62
-rw-r--r-- backend/kernels/cycle.cl.ll 62
-rw-r--r-- backend/kernels/extract.cl.ll 62
-rw-r--r-- backend/kernels/function.cl.ll 62
-rw-r--r-- backend/kernels/function_param.cl.ll 62
-rw-r--r-- backend/kernels/get_global_id.cl.ll 77
-rw-r--r-- backend/kernels/insert.cl.ll 62
-rw-r--r-- backend/kernels/loop.cl.ll 62
-rw-r--r-- backend/kernels/loop2.cl.ll 62
-rw-r--r-- backend/kernels/loop3.cl.ll 62
-rw-r--r-- backend/kernels/loop4.cl.ll 62
-rw-r--r-- backend/kernels/loop5.cl.ll 62
-rw-r--r-- backend/kernels/select.cl.ll 62
-rw-r--r-- backend/kernels/short.cl.ll 62
-rw-r--r-- backend/kernels/shuffle.cl.ll 62
-rw-r--r-- backend/kernels/simple_float4.cl.ll 80
-rw-r--r-- backend/kernels/simple_float4_2.cl.ll 80
-rw-r--r-- backend/kernels/simple_float4_3.cl.ll 80
-rw-r--r-- backend/kernels/store.cl.ll 62
-rw-r--r-- backend/kernels/struct.cl.ll 62
-rw-r--r-- backend/kernels/struct2.cl.ll 62
-rw-r--r-- backend/kernels/test_select.cl.ll 80
-rw-r--r-- backend/kernels/undefined.cl.ll 62
-rw-r--r-- backend/kernels/vector_constant.cl.ll 80
-rw-r--r-- backend/kernels/void.cl.ll 62
29 files changed, 75 insertions, 1828 deletions
diff --git a/backend/kernels/add.cl.ll b/backend/kernels/add.cl.ll
index ee4d5ba2..80f45c95 100644
--- a/backend/kernels/add.cl.ll
+++ b/backend/kernels/add.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
entry:
%add = add i32 %y, %x
diff --git a/backend/kernels/add2.cl.ll b/backend/kernels/add2.cl.ll
index 2d7dd549..dae1c24e 100644
--- a/backend/kernels/add2.cl.ll
+++ b/backend/kernels/add2.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { i32, i32 }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(%struct.big addrspace(1)* nocapture %b, i32 %x, i32 %y) nounwind noinline {
entry:
%add = add i32 %y, %x
diff --git a/backend/kernels/cmp.cl.ll b/backend/kernels/cmp.cl.ll
index a77694eb..759c1844 100644
--- a/backend/kernels/cmp.cl.ll
+++ b/backend/kernels/cmp.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline {
entry:
%cmp = icmp slt i32 %x, %y
diff --git a/backend/kernels/cmp_cvt.cl.ll b/backend/kernels/cmp_cvt.cl.ll
index 3a85bcae..37945e40 100644
--- a/backend/kernels/cmp_cvt.cl.ll
+++ b/backend/kernels/cmp_cvt.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline {
get_local_id.exit:
%add = add nsw i32 %y, %x
diff --git a/backend/kernels/complex_struct.cl.ll b/backend/kernels/complex_struct.cl.ll
index bf607ca0..ce370b58 100644
--- a/backend/kernels/complex_struct.cl.ll
+++ b/backend/kernels/complex_struct.cl.ll
@@ -5,68 +5,6 @@ target triple = "ptx32--"
%struct.my_struct = type { i32, [5 x %struct.hop] }
%struct.hop = type { float, float }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline {
entry:
%x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0
diff --git a/backend/kernels/cycle.cl.ll b/backend/kernels/cycle.cl.ll
index 0c4ee200..2c409ff0 100644
--- a/backend/kernels/cycle.cl.ll
+++ b/backend/kernels/cycle.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline {
entry:
br label %hop0
diff --git a/backend/kernels/extract.cl.ll b/backend/kernels/extract.cl.ll
index 11c95bdb..f9ef7b35 100644
--- a/backend/kernels/extract.cl.ll
+++ b/backend/kernels/extract.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
diff --git a/backend/kernels/function.cl.ll b/backend/kernels/function.cl.ll
index 62527a71..7fdeeaba 100644
--- a/backend/kernels/function.cl.ll
+++ b/backend/kernels/function.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
diff --git a/backend/kernels/function_param.cl.ll b/backend/kernels/function_param.cl.ll
index 56646dd1..0e9064f4 100644
--- a/backend/kernels/function_param.cl.ll
+++ b/backend/kernels/function_param.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.struct0 = type { [5 x i32], i32, i32, i32 }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline {
entry:
%arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4
diff --git a/backend/kernels/get_global_id.cl.ll b/backend/kernels/get_global_id.cl.ll
index 2dacffb7..8a6aaa4d 100644
--- a/backend/kernels/get_global_id.cl.ll
+++ b/backend/kernels/get_global_id.cl.ll
@@ -2,82 +2,25 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline {
-get_global_id.exit13:
+get_global_id.exit17:
%call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
%sext = shl i32 %call.i, 16
%conv1 = ashr exact i32 %sext, 16
- %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i
store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
- %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6
+ %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %add.i
store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
diff --git a/backend/kernels/insert.cl.ll b/backend/kernels/insert.cl.ll
index af9e50e5..e9f3d6ba 100644
--- a/backend/kernels/insert.cl.ll
+++ b/backend/kernels/insert.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16
diff --git a/backend/kernels/loop.cl.ll b/backend/kernels/loop.cl.ll
index 692dfb7d..f5f6085d 100644
--- a/backend/kernels/loop.cl.ll
+++ b/backend/kernels/loop.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
diff --git a/backend/kernels/loop2.cl.ll b/backend/kernels/loop2.cl.ll
index effe780f..2d710c02 100644
--- a/backend/kernels/loop2.cl.ll
+++ b/backend/kernels/loop2.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp6 = icmp eq i32 %x, 0
diff --git a/backend/kernels/loop3.cl.ll b/backend/kernels/loop3.cl.ll
index 378357e9..984165ef 100644
--- a/backend/kernels/loop3.cl.ll
+++ b/backend/kernels/loop3.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
entry:
%cmp2 = icmp eq i32 %x, 0
diff --git a/backend/kernels/loop4.cl.ll b/backend/kernels/loop4.cl.ll
index 618c5503..08ed4b01 100644
--- a/backend/kernels/loop4.cl.ll
+++ b/backend/kernels/loop4.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline {
get_local_id.exit:
%call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone
diff --git a/backend/kernels/loop5.cl.ll b/backend/kernels/loop5.cl.ll
index b97ad7b5..884ae089 100644
--- a/backend/kernels/loop5.cl.ll
+++ b/backend/kernels/loop5.cl.ll
@@ -4,68 +4,6 @@ target triple = "ptx32--"
%struct.big = type { [10 x i32] }
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline {
get_local_id.exit:
%cmp = icmp sgt i32 %y, 0
diff --git a/backend/kernels/select.cl.ll b/backend/kernels/select.cl.ll
index ebf1ad0b..ee381030 100644
--- a/backend/kernels/select.cl.ll
+++ b/backend/kernels/select.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1
diff --git a/backend/kernels/short.cl.ll b/backend/kernels/short.cl.ll
index 8ad601be..c56edabe 100644
--- a/backend/kernels/short.cl.ll
+++ b/backend/kernels/short.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline {
entry:
%add = add i16 %y, %x
diff --git a/backend/kernels/shuffle.cl.ll b/backend/kernels/shuffle.cl.ll
index d503d143..31a1e2b2 100644
--- a/backend/kernels/shuffle.cl.ll
+++ b/backend/kernels/shuffle.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline {
entry:
%0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1
diff --git a/backend/kernels/simple_float4.cl.ll b/backend/kernels/simple_float4.cl.ll
index c11f23a1..80009cc5 100644
--- a/backend/kernels/simple_float4.cl.ll
+++ b/backend/kernels/simple_float4.cl.ll
@@ -2,79 +2,25 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit11:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
- %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
diff --git a/backend/kernels/simple_float4_2.cl.ll b/backend/kernels/simple_float4_2.cl.ll
index 8d7f4fbc..ebc7e2a3 100644
--- a/backend/kernels/simple_float4_2.cl.ll
+++ b/backend/kernels/simple_float4_2.cl.ll
@@ -2,80 +2,26 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit10:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit22:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
%mul = fmul <4 x float> %0, %0
- %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
diff --git a/backend/kernels/simple_float4_3.cl.ll b/backend/kernels/simple_float4_3.cl.ll
index a41afb18..afcafd66 100644
--- a/backend/kernels/simple_float4_3.cl.ll
+++ b/backend/kernels/simple_float4_3.cl.ll
@@ -2,74 +2,16 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline {
-get_global_id.exit16:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit35:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
- %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1
%arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2
%1 = load <4 x float> addrspace(1)* %arrayidx6, align 16
@@ -83,7 +25,11 @@ get_global_id.exit16:
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
diff --git a/backend/kernels/store.cl.ll b/backend/kernels/store.cl.ll
index b74e69a6..64a20095 100644
--- a/backend/kernels/store.cl.ll
+++ b/backend/kernels/store.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
diff --git a/backend/kernels/struct.cl.ll b/backend/kernels/struct.cl.ll
index fc89f46d..caafa607 100644
--- a/backend/kernels/struct.cl.ll
+++ b/backend/kernels/struct.cl.ll
@@ -8,68 +8,6 @@ target triple = "ptx32--"
@struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4
@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
entry:
br label %for.body
diff --git a/backend/kernels/struct2.cl.ll b/backend/kernels/struct2.cl.ll
index 2c5f8b16..8d102ccd 100644
--- a/backend/kernels/struct2.cl.ll
+++ b/backend/kernels/struct2.cl.ll
@@ -6,68 +6,6 @@ target triple = "ptx32--"
@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline {
entry:
%cmp = icmp eq i32 %y, 0
diff --git a/backend/kernels/test_select.cl.ll b/backend/kernels/test_select.cl.ll
index 478dcbbf..589bf2dc 100644
--- a/backend/kernels/test_select.cl.ll
+++ b/backend/kernels/test_select.cl.ll
@@ -2,81 +2,27 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit7:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i
+get_global_id.exit13:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %add.i
%0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1
%cmp = icmp sgt i32 %0, 1
- %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i
%. = select i1 %cmp, i32 1, i32 2
store i32 %., i32 addrspace(1)* %arrayidx2, align 4
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
diff --git a/backend/kernels/undefined.cl.ll b/backend/kernels/undefined.cl.ll
index f6446426..65375808 100644
--- a/backend/kernels/undefined.cl.ll
+++ b/backend/kernels/undefined.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline {
entry:
store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1
diff --git a/backend/kernels/vector_constant.cl.ll b/backend/kernels/vector_constant.cl.ll
index de644cd3..71c54d12 100644
--- a/backend/kernels/vector_constant.cl.ll
+++ b/backend/kernels/vector_constant.cl.ll
@@ -2,80 +2,26 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline {
-get_global_id.exit5:
- %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
- %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i
+get_global_id.exit11:
+ %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
+ %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+ %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+ %mul.i = mul i32 %call.i10.i, %call.i3.i
+ %add.i = add i32 %mul.i, %call.i.i
+ %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i
%0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1
%add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
- %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i
+ %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i
store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1
ret void
}
-declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone
+declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone
+
+declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone
!opencl.kernels = !{!0}
diff --git a/backend/kernels/void.cl.ll b/backend/kernels/void.cl.ll
index 151fc1cb..de543ddc 100644
--- a/backend/kernels/void.cl.ll
+++ b/backend/kernels/void.cl.ll
@@ -2,68 +2,6 @@
target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx32--"
-define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <2 x float> %a, i32 0
- %1 = extractelement <2 x float> %b, i32 0
- %2 = extractelement <2 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <2 x float> undef, float %call, i32 0
- %3 = extractelement <2 x float> %a, i32 1
- %4 = extractelement <2 x float> %b, i32 1
- %5 = extractelement <2 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1
- ret <2 x float> %vecinit2
-}
-
-declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone
-
-define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <3 x float> %a, i32 0
- %1 = extractelement <3 x float> %b, i32 0
- %2 = extractelement <3 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <3 x float> undef, float %call, i32 0
- %3 = extractelement <3 x float> %a, i32 1
- %4 = extractelement <3 x float> %b, i32 1
- %5 = extractelement <3 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <3 x float> %a, i32 2
- %7 = extractelement <3 x float> %b, i32 2
- %8 = extractelement <3 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2
- ret <3 x float> %vecinit4
-}
-
-define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone {
-entry:
- %0 = extractelement <4 x float> %a, i32 0
- %1 = extractelement <4 x float> %b, i32 0
- %2 = extractelement <4 x float> %c, i32 0
- %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone
- %vecinit = insertelement <4 x float> undef, float %call, i32 0
- %3 = extractelement <4 x float> %a, i32 1
- %4 = extractelement <4 x float> %b, i32 1
- %5 = extractelement <4 x float> %c, i32 1
- %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone
- %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1
- %6 = extractelement <4 x float> %a, i32 2
- %7 = extractelement <4 x float> %b, i32 2
- %8 = extractelement <4 x float> %c, i32 2
- %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone
- %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2
- %9 = extractelement <4 x float> %a, i32 3
- %10 = extractelement <4 x float> %b, i32 3
- %11 = extractelement <4 x float> %c, i32 3
- %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone
- %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3
- ret <4 x float> %vecinit6
-}
-
define ptx_kernel void @hop() nounwind readnone noinline {
entry:
ret void