diff options
author | Benjamin Segovia <segovia.benjamin@gmail.com> | 2012-04-12 18:51:11 +0000 |
---|---|---|
committer | Keith Packard <keithp@keithp.com> | 2012-08-10 16:16:18 -0700 |
commit | bfba59eb36af59440dabd2c9ec508eac8a968ab3 (patch) | |
tree | b539e3080ee7cd30af558210daec221f3e7bdec8 /backend/kernels | |
parent | 99d9b8998677cb870fb0fba9a48a353297821c43 (diff) |
Recompiled kernels with the new stdlib Added definition and use sets per register
Diffstat (limited to 'backend/kernels')
29 files changed, 75 insertions, 1828 deletions
diff --git a/backend/kernels/add.cl.ll b/backend/kernels/add.cl.ll index ee4d5ba2..80f45c95 100644 --- a/backend/kernels/add.cl.ll +++ b/backend/kernels/add.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline { entry: %add = add i32 %y, %x diff --git a/backend/kernels/add2.cl.ll b/backend/kernels/add2.cl.ll index 2d7dd549..dae1c24e 100644 --- a/backend/kernels/add2.cl.ll +++ b/backend/kernels/add2.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { i32, i32 } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(%struct.big addrspace(1)* nocapture %b, i32 %x, i32 %y) nounwind noinline { entry: %add = add i32 %y, %x diff --git a/backend/kernels/cmp.cl.ll b/backend/kernels/cmp.cl.ll index a77694eb..759c1844 100644 --- a/backend/kernels/cmp.cl.ll +++ b/backend/kernels/cmp.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline { entry: %cmp = icmp slt i32 %x, %y diff --git a/backend/kernels/cmp_cvt.cl.ll b/backend/kernels/cmp_cvt.cl.ll index 3a85bcae..37945e40 100644 --- a/backend/kernels/cmp_cvt.cl.ll +++ b/backend/kernels/cmp_cvt.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline { get_local_id.exit: %add = add nsw i32 %y, %x diff --git a/backend/kernels/complex_struct.cl.ll b/backend/kernels/complex_struct.cl.ll index bf607ca0..ce370b58 100644 --- a/backend/kernels/complex_struct.cl.ll +++ b/backend/kernels/complex_struct.cl.ll @@ -5,68 +5,6 @@ target triple = "ptx32--" %struct.my_struct = type { i32, [5 x %struct.hop] } %struct.hop = type { float, float } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline { entry: %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0 diff --git a/backend/kernels/cycle.cl.ll b/backend/kernels/cycle.cl.ll index 0c4ee200..2c409ff0 100644 --- a/backend/kernels/cycle.cl.ll +++ b/backend/kernels/cycle.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline { entry: br label %hop0 diff --git a/backend/kernels/extract.cl.ll b/backend/kernels/extract.cl.ll index 11c95bdb..f9ef7b35 100644 --- a/backend/kernels/extract.cl.ll +++ b/backend/kernels/extract.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { entry: %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1 diff --git a/backend/kernels/function.cl.ll b/backend/kernels/function.cl.ll index 62527a71..7fdeeaba 100644 --- a/backend/kernels/function.cl.ll +++ b/backend/kernels/function.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind { entry: store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 diff --git a/backend/kernels/function_param.cl.ll b/backend/kernels/function_param.cl.ll index 56646dd1..0e9064f4 100644 --- a/backend/kernels/function_param.cl.ll +++ b/backend/kernels/function_param.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.struct0 = type { [5 x i32], i32, i32, i32 } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline { entry: %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4 diff --git a/backend/kernels/get_global_id.cl.ll b/backend/kernels/get_global_id.cl.ll index 2dacffb7..8a6aaa4d 100644 --- a/backend/kernels/get_global_id.cl.ll +++ b/backend/kernels/get_global_id.cl.ll @@ -2,82 +2,25 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline { -get_global_id.exit13: +get_global_id.exit17: %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone %sext = shl i32 %call.i, 16 %conv1 = ashr exact i32 %sext, 16 - %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6 + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 - %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6 + %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %add.i store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1 ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone diff --git a/backend/kernels/insert.cl.ll b/backend/kernels/insert.cl.ll index af9e50e5..e9f3d6ba 100644 --- a/backend/kernels/insert.cl.ll +++ b/backend/kernels/insert.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { entry: %0 = load <4 x i32> addrspace(1)* %src, align 16 diff --git a/backend/kernels/loop.cl.ll b/backend/kernels/loop.cl.ll index 692dfb7d..f5f6085d 100644 --- a/backend/kernels/loop.cl.ll +++ b/backend/kernels/loop.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { [10 x i32] } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { entry: %cmp2 = icmp eq i32 %x, 0 diff --git a/backend/kernels/loop2.cl.ll b/backend/kernels/loop2.cl.ll index effe780f..2d710c02 100644 --- a/backend/kernels/loop2.cl.ll +++ b/backend/kernels/loop2.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { [10 x i32] } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { entry: %cmp6 = icmp eq i32 %x, 0 diff --git a/backend/kernels/loop3.cl.ll b/backend/kernels/loop3.cl.ll index 378357e9..984165ef 100644 --- a/backend/kernels/loop3.cl.ll +++ b/backend/kernels/loop3.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { [10 x i32] } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { entry: %cmp2 = icmp eq i32 %x, 0 diff --git a/backend/kernels/loop4.cl.ll b/backend/kernels/loop4.cl.ll index 618c5503..08ed4b01 100644 --- a/backend/kernels/loop4.cl.ll +++ b/backend/kernels/loop4.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { [10 x i32] } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { get_local_id.exit: %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone diff --git a/backend/kernels/loop5.cl.ll b/backend/kernels/loop5.cl.ll index b97ad7b5..884ae089 100644 --- a/backend/kernels/loop5.cl.ll +++ b/backend/kernels/loop5.cl.ll @@ -4,68 +4,6 @@ target triple = "ptx32--" %struct.big = type { [10 x i32] } -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline { get_local_id.exit: %cmp = icmp sgt i32 %y, 0 diff --git a/backend/kernels/select.cl.ll b/backend/kernels/select.cl.ll index ebf1ad0b..ee381030 100644 --- a/backend/kernels/select.cl.ll +++ b/backend/kernels/select.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline { entry: %0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1 diff --git a/backend/kernels/short.cl.ll b/backend/kernels/short.cl.ll index 8ad601be..c56edabe 100644 --- a/backend/kernels/short.cl.ll +++ b/backend/kernels/short.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline { entry: %add = add i16 %y, %x diff --git a/backend/kernels/shuffle.cl.ll b/backend/kernels/shuffle.cl.ll index d503d143..31a1e2b2 100644 --- a/backend/kernels/shuffle.cl.ll +++ b/backend/kernels/shuffle.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { entry: %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1 diff --git a/backend/kernels/simple_float4.cl.ll b/backend/kernels/simple_float4.cl.ll index c11f23a1..80009cc5 100644 --- a/backend/kernels/simple_float4.cl.ll +++ b/backend/kernels/simple_float4.cl.ll @@ -2,79 +2,25 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { -get_global_id.exit5: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i +get_global_id.exit11: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 - %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1 ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone !opencl.kernels = !{!0} diff --git a/backend/kernels/simple_float4_2.cl.ll b/backend/kernels/simple_float4_2.cl.ll index 8d7f4fbc..ebc7e2a3 100644 --- a/backend/kernels/simple_float4_2.cl.ll +++ b/backend/kernels/simple_float4_2.cl.ll @@ -2,80 +2,26 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { -get_global_id.exit10: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i +get_global_id.exit22: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 %mul = fmul <4 x float> %0, %0 - %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1 ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone !opencl.kernels = !{!0} diff --git a/backend/kernels/simple_float4_3.cl.ll b/backend/kernels/simple_float4_3.cl.ll index a41afb18..afcafd66 100644 --- a/backend/kernels/simple_float4_3.cl.ll +++ b/backend/kernels/simple_float4_3.cl.ll @@ -2,74 +2,16 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline { -get_global_id.exit16: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i +get_global_id.exit35: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 - %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1 %arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2 %1 = load <4 x float> addrspace(1)* %arrayidx6, align 16 @@ -83,7 +25,11 @@ get_global_id.exit16: ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone !opencl.kernels = !{!0} diff --git a/backend/kernels/store.cl.ll b/backend/kernels/store.cl.ll index b74e69a6..64a20095 100644 --- a/backend/kernels/store.cl.ll +++ b/backend/kernels/store.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline { entry: store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 diff --git a/backend/kernels/struct.cl.ll b/backend/kernels/struct.cl.ll index fc89f46d..caafa607 100644 --- a/backend/kernels/struct.cl.ll +++ b/backend/kernels/struct.cl.ll @@ -8,68 +8,6 @@ target triple = "ptx32--" @struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4 @struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4 -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline { entry: br label %for.body diff --git a/backend/kernels/struct2.cl.ll b/backend/kernels/struct2.cl.ll index 2c5f8b16..8d102ccd 100644 --- a/backend/kernels/struct2.cl.ll +++ b/backend/kernels/struct2.cl.ll @@ -6,68 +6,6 @@ target triple = "ptx32--" @g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline { entry: %cmp = icmp eq i32 %y, 0 diff --git a/backend/kernels/test_select.cl.ll b/backend/kernels/test_select.cl.ll index 478dcbbf..589bf2dc 100644 --- a/backend/kernels/test_select.cl.ll +++ b/backend/kernels/test_select.cl.ll @@ -2,81 +2,27 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline { -get_global_id.exit7: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i +get_global_id.exit13: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %add.i %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 %cmp = icmp sgt i32 %0, 1 - %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i + %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add.i %. = select i1 %cmp, i32 1, i32 2 store i32 %., i32 addrspace(1)* %arrayidx2, align 4 ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone !opencl.kernels = !{!0} diff --git a/backend/kernels/undefined.cl.ll b/backend/kernels/undefined.cl.ll index f6446426..65375808 100644 --- a/backend/kernels/undefined.cl.ll +++ b/backend/kernels/undefined.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline { entry: store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 diff --git a/backend/kernels/vector_constant.cl.ll b/backend/kernels/vector_constant.cl.ll index de644cd3..71c54d12 100644 --- a/backend/kernels/vector_constant.cl.ll +++ b/backend/kernels/vector_constant.cl.ll @@ -2,80 +2,26 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { -get_global_id.exit5: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i +get_global_id.exit11: + %call.i.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %call.i3.i = tail call ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + %call.i10.i = tail call ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + %mul.i = mul i32 %call.i10.i, %call.i3.i + %add.i = add i32 %mul.i, %call.i.i + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %add.i %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 %add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> - %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %add.i store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1 ret void } -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone +declare ptx_device i32 @__gen_ocl_get_num_groups0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_size0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone !opencl.kernels = !{!0} diff --git a/backend/kernels/void.cl.ll b/backend/kernels/void.cl.ll index 151fc1cb..de543ddc 100644 --- a/backend/kernels/void.cl.ll +++ b/backend/kernels/void.cl.ll @@ -2,68 +2,6 @@ target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" -define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { -entry: - %0 = extractelement <2 x float> %a, i32 0 - %1 = extractelement <2 x float> %b, i32 0 - %2 = extractelement <2 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <2 x float> undef, float %call, i32 0 - %3 = extractelement <2 x float> %a, i32 1 - %4 = extractelement <2 x float> %b, i32 1 - %5 = extractelement <2 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 - ret <2 x float> %vecinit2 -} - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone - -define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { -entry: - %0 = extractelement <3 x float> %a, i32 0 - %1 = extractelement <3 x float> %b, i32 0 - %2 = extractelement <3 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <3 x float> undef, float %call, i32 0 - %3 = extractelement <3 x float> %a, i32 1 - %4 = extractelement <3 x float> %b, i32 1 - %5 = extractelement <3 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <3 x float> %a, i32 2 - %7 = extractelement <3 x float> %b, i32 2 - %8 = extractelement <3 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 - ret <3 x float> %vecinit4 -} - -define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { -entry: - %0 = extractelement <4 x float> %a, i32 0 - %1 = extractelement <4 x float> %b, i32 0 - %2 = extractelement <4 x float> %c, i32 0 - %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone - %vecinit = insertelement <4 x float> undef, float %call, i32 0 - %3 = extractelement <4 x float> %a, i32 1 - %4 = extractelement <4 x float> %b, i32 1 - %5 = extractelement <4 x float> %c, i32 1 - %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone - %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 - %6 = extractelement <4 x float> %a, i32 2 - %7 = extractelement <4 x float> %b, i32 2 - %8 = extractelement <4 x float> %c, i32 2 - %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone - %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 - %9 = extractelement <4 x float> %a, i32 3 - %10 = extractelement <4 x float> %b, i32 3 - %11 = extractelement <4 x float> %c, i32 3 - %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone - %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 - ret <4 x float> %vecinit6 -} - define ptx_kernel void @hop() nounwind readnone noinline { entry: ret void |