diff options
author | Benjamin Segovia <segovia.benjamin@gmail.com> | 2012-03-19 19:11:28 -0700 |
---|---|---|
committer | Keith Packard <keithp@keithp.com> | 2012-08-10 16:15:45 -0700 |
commit | 3b126f62deca284fedafaca6dad807388673da45 (patch) | |
tree | 53be7aaa52c6cb2bb946bf0fce8f760bdb30942e /backend/kernels | |
parent | 6f4a65ba93a5e4ed686cc03d91c6431edcec8ba3 (diff) |
Added first support for constant vector Recompiled all ocl kernels Fixed problem when a block does not end with a branch Added proper support for special registers and function argument in liveness analysis and function DAG creation
Diffstat (limited to 'backend/kernels')
70 files changed, 3113 insertions, 846 deletions
diff --git a/backend/kernels/add.cl.ll b/backend/kernels/add.cl.ll new file mode 100644 index 00000000..3cc40069 --- /dev/null +++ b/backend/kernels/add.cl.ll @@ -0,0 +1,75 @@ +; ModuleID = 'add.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline { +entry: + %add = add i32 %y, %x + ret i32 %add +} + +!opencl.kernels = !{!0} + +!0 = metadata !{i32 (i32, i32)* @add} diff --git a/backend/kernels/add.ll b/backend/kernels/add.ll deleted file mode 100644 index 9b2c7413..00000000 --- a/backend/kernels/add.ll +++ /dev/null @@ -1,13 +0,0 @@ -; ModuleID = 'add.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel i32 @add(i32 %x, i32 %y) nounwind readnone noinline { -entry: - %add = add i32 %y, %x - ret i32 %add -} - -!opencl.kernels = !{!0} - -!0 = metadata !{i32 (i32, i32)* @add} diff --git a/backend/kernels/add2.cl.ll b/backend/kernels/add2.cl.ll new file mode 100644 index 00000000..cb7cf6e3 --- /dev/null +++ b/backend/kernels/add2.cl.ll @@ -0,0 +1,83 @@ +; ModuleID = 'add2.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { i32, i32 } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline { +entry: + %add = add i32 %y, %x + %sub = add i32 %x, 10 + %add1 = sub i32 %sub, %y + %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0 + store i32 %add, i32* %agg.result.0, align 4 + %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1 + store i32 %add1, i32* %agg.result.1, align 4 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.big*, i32, i32)* @add} diff --git a/backend/kernels/add2.ll b/backend/kernels/add2.ll deleted file mode 100644 index 4ca11259..00000000 --- a/backend/kernels/add2.ll +++ /dev/null @@ -1,21 +0,0 @@ -; ModuleID = 'add2.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -%struct.big = type { i32, i32 } - -define ptx_kernel void @add(%struct.big* noalias nocapture sret %agg.result, i32 %x, i32 %y) nounwind noinline { -entry: - %add = add i32 %y, %x - %sub = add i32 %x, 10 - %add1 = sub i32 %sub, %y - %agg.result.0 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 0 - store i32 %add, i32* %agg.result.0, align 4 - %agg.result.1 = getelementptr inbounds %struct.big* %agg.result, i32 0, i32 1 - store i32 %add1, i32* %agg.result.1, align 4 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (%struct.big*, i32, i32)* @add} diff --git a/backend/kernels/cmp.cl.ll b/backend/kernels/cmp.cl.ll new file mode 100644 index 00000000..a77694eb --- /dev/null +++ b/backend/kernels/cmp.cl.ll @@ -0,0 +1,84 @@ +; ModuleID = 'cmp.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @test_cmp(i8 addrspace(1)* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline { +entry: + %cmp = icmp slt i32 %x, %y + %conv = zext i1 %cmp to i32 + %cmp1 = fcmp ogt float %z, %w + %add = sext i1 %cmp1 to i32 + %tobool = icmp ne i32 %conv, %add + %frombool = zext i1 %tobool to i8 + store i8 %frombool, i8 addrspace(1)* %dst, align 1, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i8 addrspace(1)*, i32, i32, float, float)* @test_cmp} +!1 = metadata !{metadata !"bool", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/cmp.ll b/backend/kernels/cmp.ll deleted file mode 100644 index c2f64007..00000000 --- a/backend/kernels/cmp.ll +++ /dev/null @@ -1,22 +0,0 @@ -; ModuleID = 'cmp.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @test_cmp(i8* nocapture %dst, i32 %x, i32 %y, float %z, float %w) nounwind noinline { -entry: - %cmp = icmp slt i32 %x, %y - %conv = zext i1 %cmp to i32 - %cmp1 = fcmp ogt float %z, %w - %add = sext i1 %cmp1 to i32 - %tobool = icmp ne i32 %conv, %add - %frombool = zext i1 %tobool to i8 - store i8 %frombool, i8* %dst, align 1, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i8*, i32, i32, float, float)* @test_cmp} -!1 = metadata !{metadata !"bool", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/cmp_cvt.cl.ll b/backend/kernels/cmp_cvt.cl.ll new file mode 100644 index 00000000..3a85bcae --- /dev/null +++ b/backend/kernels/cmp_cvt.cl.ll @@ -0,0 +1,84 @@ +; ModuleID = 'cmp_cvt.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @cmp_cvt(i32 addrspace(1)* nocapture %dst, i32 %x, i32 %y) nounwind noinline { +get_local_id.exit: + %add = add nsw i32 %y, %x + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %cmp = icmp ult i32 %add, %call.i + %conv = zext i1 %cmp to i32 + store i32 %conv, i32 addrspace(1)* %dst, align 4, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, i32)* @cmp_cvt} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/cmp_cvt.ll b/backend/kernels/cmp_cvt.ll deleted file mode 100644 index ab62b6c4..00000000 --- a/backend/kernels/cmp_cvt.ll +++ /dev/null @@ -1,22 +0,0 @@ -; ModuleID = 'cmp_cvt.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @cmp_cvt(i32* nocapture %dst, i32 %x, i32 %y) nounwind noinline { -get_local_id.exit: - %add = add nsw i32 %y, %x - %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone - %cmp = icmp ult i32 %add, %call.i - %conv = zext i1 %cmp to i32 - store i32 %conv, i32* %dst, align 4, !tbaa !1 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*, i32, i32)* @cmp_cvt} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/compile.sh b/backend/kernels/compile.sh index 7f5ef36d..e1177a75 100755 --- a/backend/kernels/compile.sh +++ b/backend/kernels/compile.sh @@ -1,4 +1,5 @@ -clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1.cl -o $1.o +#!/bin/bash +clang -emit-llvm -O3 -ccc-host-triple ptx32 -c $1 -o $1.o llvm-dis $1.o rm $1.o mv $1.o.ll $1.ll diff --git a/backend/kernels/complex_struct.cl.ll b/backend/kernels/complex_struct.cl.ll new file mode 100644 index 00000000..bf607ca0 --- /dev/null +++ b/backend/kernels/complex_struct.cl.ll @@ -0,0 +1,84 @@ +; ModuleID = 'complex_struct.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.my_struct = type { i32, [5 x %struct.hop] } +%struct.hop = type { float, float } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline { +entry: + %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0 + %0 = load float addrspace(1)* %x, align 4, !tbaa !1 + %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1 + store float %0, float addrspace(1)* %y, align 4, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl} +!1 = metadata !{metadata !"float", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/complex_struct.ll b/backend/kernels/complex_struct.ll new file mode 100644 index 00000000..4274dd34 --- /dev/null +++ b/backend/kernels/complex_struct.ll @@ -0,0 +1,84 @@ +; ModuleID = 'complex_struct.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.my_struct = type { i32, [5 x %struct.hop] } +%struct.hop = type { float, float } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @struct_cl(%struct.my_struct addrspace(1)* nocapture %dst, %struct.my_struct addrspace(1)* nocapture %src) nounwind noinline { +entry: + %x = getelementptr inbounds %struct.my_struct addrspace(1)* %src, i32 1, i32 1, i32 3, i32 0 + %0 = load float addrspace(1)* %x, align 4, !tbaa !1 + %y = getelementptr inbounds %struct.my_struct addrspace(1)* %dst, i32 0, i32 1, i32 2, i32 1 + store float %0, float addrspace(1)* %y, align 4, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.my_struct addrspace(1)*, %struct.my_struct addrspace(1)*)* @struct_cl} +!1 = metadata !{metadata !"float", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/cycle.cl.ll b/backend/kernels/cycle.cl.ll new file mode 100644 index 00000000..0c4ee200 --- /dev/null +++ b/backend/kernels/cycle.cl.ll @@ -0,0 +1,77 @@ +; ModuleID = 'cycle.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @cycle(i32 addrspace(1)* nocapture %dst) noreturn nounwind readnone noinline { +entry: + br label %hop0 + +hop0: ; preds = %hop0, %entry + br label %hop0 +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*)* @cycle} diff --git a/backend/kernels/cycle.ll b/backend/kernels/cycle.ll deleted file mode 100644 index 63363007..00000000 --- a/backend/kernels/cycle.ll +++ /dev/null @@ -1,15 +0,0 @@ -; ModuleID = 'cycle.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @cycle(i32* nocapture %dst) noreturn nounwind readnone noinline { -entry: - br label %hop0 - -hop0: ; preds = %hop0, %entry - br label %hop0 -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*)* @cycle} diff --git a/backend/kernels/dummy.ll b/backend/kernels/dummy.ll deleted file mode 100644 index 3c6c269b..00000000 --- a/backend/kernels/dummy.ll +++ /dev/null @@ -1,12 +0,0 @@ -; ModuleID = 'void.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @hop() nounwind readnone noinline { -entry: - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void ()* @hop} diff --git a/backend/kernels/extract.cl.ll b/backend/kernels/extract.cl.ll new file mode 100644 index 00000000..11c95bdb --- /dev/null +++ b/backend/kernels/extract.cl.ll @@ -0,0 +1,83 @@ +; ModuleID = 'extract.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @extract(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { +entry: + %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1 + %1 = extractelement <4 x i32> %0, i32 0 + %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3 + store <4 x i32> %vecinit3, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @extract} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/extract.ll b/backend/kernels/extract.ll deleted file mode 100644 index b10a21ff..00000000 --- a/backend/kernels/extract.ll +++ /dev/null @@ -1,21 +0,0 @@ -; ModuleID = 'extract.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @extract(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { -entry: - %0 = load <4 x i32>* %src, align 16, !tbaa !1 - %1 = extractelement <4 x i32> %0, i32 0 - %vecinit = insertelement <4 x i32> undef, i32 %1, i32 0 - %vecinit1 = insertelement <4 x i32> %vecinit, i32 1, i32 1 - %vecinit2 = insertelement <4 x i32> %vecinit1, i32 2, i32 2 - %vecinit3 = insertelement <4 x i32> %vecinit2, i32 3, i32 3 - store <4 x i32> %vecinit3, <4 x i32>* %dst, align 16, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @extract} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/function.cl.ll b/backend/kernels/function.cl.ll new file mode 100644 index 00000000..62527a71 --- /dev/null +++ b/backend/kernels/function.cl.ll @@ -0,0 +1,86 @@ +; ModuleID = 'function.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind { +entry: + store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 + ret void +} + +define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline { +entry: + store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x + store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/function.ll b/backend/kernels/function.ll deleted file mode 100644 index e428448f..00000000 --- a/backend/kernels/function.ll +++ /dev/null @@ -1,24 +0,0 @@ -; ModuleID = 'function.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_device void @write(i32 addrspace(1)* nocapture %dst) nounwind { -entry: - store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 - ret void -} - -define ptx_kernel void @write2(i32 addrspace(1)* nocapture %dst, i32 %x) nounwind noinline { -entry: - store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 - %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %x - store i32 1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32 addrspace(1)*, i32)* @write2} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/function_param.cl.ll b/backend/kernels/function_param.cl.ll new file mode 100644 index 00000000..56646dd1 --- /dev/null +++ b/backend/kernels/function_param.cl.ll @@ -0,0 +1,95 @@ +; ModuleID = 'function_param.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.struct0 = type { [5 x i32], i32, i32, i32 } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline { +entry: + %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4 + %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1 + %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4 + %1 = load i32* %arrayidx1, align 4, !tbaa !1 + %add = add i32 %0, %x + %add2 = add i32 %add, %1 + store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1 + %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)* + %3 = bitcast %struct.struct0* %s to i8* + tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false) + %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2 + %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1 + %add6 = add nsw i32 %4, %y + store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1 + ret void +} + +declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/function_param.ll b/backend/kernels/function_param.ll deleted file mode 100644 index 90c7220f..00000000 --- a/backend/kernels/function_param.ll +++ /dev/null @@ -1,33 +0,0 @@ -; ModuleID = 'function_param.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -%struct.struct0 = type { [5 x i32], i32, i32, i32 } - -define ptx_kernel void @param(%struct.struct0 addrspace(1)* nocapture %dst, %struct.struct0* nocapture byval %s, i32 addrspace(4)* nocapture %h, i32 %x, i32 %y) nounwind noinline { -entry: - %arrayidx = getelementptr inbounds i32 addrspace(4)* %h, i32 4 - %0 = load i32 addrspace(4)* %arrayidx, align 4, !tbaa !1 - %arrayidx1 = getelementptr inbounds %struct.struct0* %s, i32 0, i32 0, i32 4 - %1 = load i32* %arrayidx1, align 4, !tbaa !1 - %add = add i32 %0, %x - %add2 = add i32 %add, %1 - store i32 %add2, i32* %arrayidx1, align 4, !tbaa !1 - %2 = bitcast %struct.struct0 addrspace(1)* %dst to i8 addrspace(1)* - %3 = bitcast %struct.struct0* %s to i8* - tail call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* %2, i8* %3, i32 32, i32 4, i1 false) - %y5 = getelementptr inbounds %struct.struct0 addrspace(1)* %dst, i32 0, i32 2 - %4 = load i32 addrspace(1)* %y5, align 4, !tbaa !1 - %add6 = add nsw i32 %4, %y - store i32 %add6, i32 addrspace(1)* %y5, align 4, !tbaa !1 - ret void -} - -declare void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocapture, i32, i32, i1) nounwind - -!opencl.kernels = !{!0} - -!0 = metadata !{void (%struct.struct0 addrspace(1)*, %struct.struct0*, i32 addrspace(4)*, i32, i32)* @param} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/get_global_id.cl.ll b/backend/kernels/get_global_id.cl.ll new file mode 100644 index 00000000..2dacffb7 --- /dev/null +++ b/backend/kernels/get_global_id.cl.ll @@ -0,0 +1,89 @@ +; ModuleID = 'get_global_id.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @test_global_id(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %p) nounwind noinline { +get_global_id.exit13: + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %sext = shl i32 %call.i, 16 + %conv1 = ashr exact i32 %sext, 16 + %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i6 + store i32 %conv1, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %p, i32 %call.i6 + store i32 %call.i, i32 addrspace(1)* %arrayidx5, align 4, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_global_id} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/get_global_id.ll b/backend/kernels/get_global_id.ll deleted file mode 100644 index 1df9fdff..00000000 --- a/backend/kernels/get_global_id.ll +++ /dev/null @@ -1,27 +0,0 @@ -; ModuleID = 'get_global_id.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @test_global_id(i32* nocapture %dst, i32* nocapture %p) nounwind noinline { -get_global_id.exit13: - %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone - %sext = shl i32 %call.i, 16 - %conv1 = ashr exact i32 %sext, 16 - %call.i6 = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds i32* %dst, i32 %call.i6 - store i32 %conv1, i32* %arrayidx, align 4, !tbaa !1 - %arrayidx5 = getelementptr inbounds i32* %p, i32 %call.i6 - store i32 %call.i, i32* %arrayidx5, align 4, !tbaa !1 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - -declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*, i32*)* @test_global_id} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/gg.ll b/backend/kernels/gg.ll Binary files differdeleted file mode 100644 index 0f9d666e..00000000 --- a/backend/kernels/gg.ll +++ /dev/null diff --git a/backend/kernels/gg.ll.ll b/backend/kernels/gg.ll.ll deleted file mode 100644 index 1c60671c..00000000 --- a/backend/kernels/gg.ll.ll +++ /dev/null @@ -1,89 +0,0 @@ -; ModuleID = 'gg.ll' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -%struct.my_struct = type { i32, [2 x i32] } - -@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 -@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4 - -define ptx_kernel void @struct_cl(%struct.my_struct* byval %s, i32 %x, i32* %mem, i32 %y) nounwind noinline { -entry: - %x.addr = alloca i32, align 4 - %mem.addr = alloca i32*, align 4 - %y.addr = alloca i32, align 4 - %i = alloca i32, align 4 - store i32 %x, i32* %x.addr, align 4 - store i32* %mem, i32** %mem.addr, align 4 - store i32 %y, i32* %y.addr, align 4 - store i32 0, i32* %i, align 4 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %0 = load i32* %i, align 4 - %cmp = icmp slt i32 %0, 256 - br i1 %cmp, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %1 = load i32* %i, align 4 - %2 = load i32* %i, align 4 - %arrayidx = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %2 - %a = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx, i32 0, i32 0 - store i32 %1, i32 addrspace(4)* %a, align 4 - %3 = load i32* %i, align 4 - %4 = load i32* %i, align 4 - %arrayidx1 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %4 - %b = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx1, i32 0, i32 1 - %arrayidx2 = getelementptr inbounds [2 x i32] addrspace(4)* %b, i32 0, i32 0 - store i32 %3, i32 addrspace(4)* %arrayidx2, align 4 - %5 = load i32* %i, align 4 - %add = add nsw i32 %5, 1 - %6 = load i32* %i, align 4 - %arrayidx3 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %6 - %b4 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx3, i32 0, i32 1 - %arrayidx5 = getelementptr inbounds [2 x i32] addrspace(4)* %b4, i32 0, i32 0 - store i32 %add, i32 addrspace(4)* %arrayidx5, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %7 = load i32* %i, align 4 - %inc = add nsw i32 %7, 1 - store i32 %inc, i32* %i, align 4 - br label %for.cond - -for.end: ; preds = %for.cond - %8 = load i32* %y.addr, align 4 - %arrayidx6 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %8 - %9 = bitcast %struct.my_struct addrspace(4)* %arrayidx6 to i8 addrspace(4)* - call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* %9, i32 12, i32 4, i1 false) - %a7 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 - %10 = load i32* %a7, align 4 - %11 = load i32* %x.addr, align 4 - %arrayidx8 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %11 - %a9 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx8, i32 0, i32 0 - %12 = load i32 addrspace(4)* %a9, align 4 - %add10 = add nsw i32 %10, %12 - %13 = load i32* %x.addr, align 4 - %add11 = add nsw i32 %13, 1 - %arrayidx12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add11 - %b13 = getelementptr inbounds %struct.my_struct addrspace(4)* %arrayidx12, i32 0, i32 1 - %arrayidx14 = getelementptr inbounds [2 x i32] addrspace(4)* %b13, i32 0, i32 0 - %14 = load i32 addrspace(4)* %arrayidx14, align 4 - %add15 = add nsw i32 %add10, %14 - %15 = load i32* %x.addr, align 4 - %arrayidx16 = getelementptr inbounds [4 x i32] addrspace(1)* @g, i32 0, i32 %15 - %16 = load i32 addrspace(1)* %arrayidx16, align 4 - %add17 = add nsw i32 %add15, %16 - %17 = load i32 addrspace(1)* getelementptr inbounds ([4 x i32] addrspace(1)* @g, i32 0, i32 3), align 4 - %add18 = add nsw i32 %add17, %17 - %18 = load i32** %mem.addr, align 4 - %arrayidx19 = getelementptr inbounds i32* %18, i32 0 - store i32 %add18, i32* %arrayidx19 - ret void -} - -declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind - -!opencl.kernels = !{!0} - -!0 = metadata !{void (%struct.my_struct*, i32, i32*, i32)* @struct_cl} diff --git a/backend/kernels/insert.cl.ll b/backend/kernels/insert.cl.ll new file mode 100644 index 00000000..af9e50e5 --- /dev/null +++ b/backend/kernels/insert.cl.ll @@ -0,0 +1,80 @@ +; ModuleID = 'insert.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @insert(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { +entry: + %0 = load <4 x i32> addrspace(1)* %src, align 16 + %1 = insertelement <4 x i32> %0, i32 1, i32 2 + store <4 x i32> %1, <4 x i32> addrspace(1)* %src, align 16 + store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @insert} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/insert.ll b/backend/kernels/insert.ll deleted file mode 100644 index 5df1dd89..00000000 --- a/backend/kernels/insert.ll +++ /dev/null @@ -1,18 +0,0 @@ -; ModuleID = 'insert.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @insert(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { -entry: - %0 = load <4 x i32>* %src, align 16 - %1 = insertelement <4 x i32> %0, i32 1, i32 2 - store <4 x i32> %1, <4 x i32>* %src, align 16 - store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @insert} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/load_store.ll b/backend/kernels/load_store.cl.ll index 53296872..84bc047d 100644 --- a/backend/kernels/load_store.ll +++ b/backend/kernels/load_store.cl.ll @@ -1,4 +1,4 @@ -; ModuleID = 'load_store.o' +; ModuleID = 'load_store.cl.o' target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" diff --git a/backend/kernels/loop.cl.ll b/backend/kernels/loop.cl.ll new file mode 100644 index 00000000..692dfb7d --- /dev/null +++ b/backend/kernels/loop.cl.ll @@ -0,0 +1,93 @@ +; ModuleID = 'loop.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { +entry: + %cmp2 = icmp eq i32 %x, 0 + br i1 %cmp2, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03 + %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc1 = add nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc1, %x + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop2.cl.ll b/backend/kernels/loop2.cl.ll new file mode 100644 index 00000000..effe780f --- /dev/null +++ b/backend/kernels/loop2.cl.ll @@ -0,0 +1,101 @@ +; ModuleID = 'loop2.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { +entry: + %cmp6 = icmp eq i32 %x, 0 + br i1 %cmp6, label %for.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %entry + %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1 + br label %for.body + +for.body: ; preds = %for.body, %for.body.lr.ph + %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ] + %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] + %add = add nsw i32 %i.07, 1 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add + %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %cmp1 = icmp sgt i32 %1, 0 + %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07 + %storemerge.v = select i1 %cmp1, i32 1, i32 2 + %storemerge = add i32 %storemerge.v, %0 + store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4 + %exitcond = icmp eq i32 %add, %x + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop2.ll b/backend/kernels/loop2.ll deleted file mode 100644 index abb16d75..00000000 --- a/backend/kernels/loop2.ll +++ /dev/null @@ -1,39 +0,0 @@ -; ModuleID = 'loop2.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -%struct.big = type { [10 x i32] } - -define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { -entry: - %cmp6 = icmp eq i32 %x, 0 - br i1 %cmp6, label %for.end, label %for.body.lr.ph - -for.body.lr.ph: ; preds = %entry - %.pre = load i32 addrspace(1)* %dst, align 4, !tbaa !1 - br label %for.body - -for.body: ; preds = %for.body, %for.body.lr.ph - %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %1, %for.body ] - %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ] - %add = add nsw i32 %i.07, 1 - %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add - %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 - %cmp1 = icmp sgt i32 %1, 0 - %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.07 - %storemerge.v = select i1 %cmp1, i32 1, i32 2 - %storemerge = add i32 %storemerge.v, %0 - store i32 %storemerge, i32 addrspace(1)* %arrayidx2, align 4 - %exitcond = icmp eq i32 %add, %x - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop3.cl b/backend/kernels/loop3.cl new file mode 100644 index 00000000..acb55361 --- /dev/null +++ b/backend/kernels/loop3.cl @@ -0,0 +1,9 @@ +#include "stdlib.h" + +struct big { int x[10]; }; + +__kernel void add(__global int *dst, unsigned int x, struct big b) +{ + for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; +} + diff --git a/backend/kernels/loop3.cl.ll b/backend/kernels/loop3.cl.ll new file mode 100644 index 00000000..378357e9 --- /dev/null +++ b/backend/kernels/loop3.cl.ll @@ -0,0 +1,100 @@ +; ModuleID = 'loop3.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { +entry: + %cmp2 = icmp eq i32 %x, 0 + br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph + +get_local_id.exit.lr.ph: ; preds = %entry + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit + +get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph + %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ] + %add = add i32 %call.i, %i.03 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add + %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc1 = add nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc1, %x + br i1 %exitcond, label %for.end, label %get_local_id.exit + +for.end: ; preds = %get_local_id.exit, %entry + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop.ll b/backend/kernels/loop3.ll index 9d33968a..c25a6d32 100644 --- a/backend/kernels/loop.ll +++ b/backend/kernels/loop3.ll @@ -1,4 +1,4 @@ -; ModuleID = 'loop.o' +; ModuleID = 'loop3.o' target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" target triple = "ptx32--" @@ -7,22 +7,29 @@ target triple = "ptx32--" define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { entry: %cmp2 = icmp eq i32 %x, 0 - br i1 %cmp2, label %for.end, label %for.body + br i1 %cmp2, label %for.end, label %get_local_id.exit.lr.ph -for.body: ; preds = %for.body, %entry - %i.03 = phi i32 [ %inc1, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.03 +get_local_id.exit.lr.ph: ; preds = %entry + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit + +get_local_id.exit: ; preds = %get_local_id.exit, %get_local_id.exit.lr.ph + %i.03 = phi i32 [ 0, %get_local_id.exit.lr.ph ], [ %inc1, %get_local_id.exit ] + %add = add i32 %call.i, %i.03 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 %inc = add nsw i32 %0, 1 store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 %inc1 = add nsw i32 %i.03, 1 %exitcond = icmp eq i32 %inc1, %x - br i1 %exitcond, label %for.end, label %for.body + br i1 %exitcond, label %for.end, label %get_local_id.exit -for.end: ; preds = %for.body, %entry +for.end: ; preds = %get_local_id.exit, %entry ret void } +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + !opencl.kernels = !{!0} !0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} diff --git a/backend/kernels/loop4.cl b/backend/kernels/loop4.cl new file mode 100644 index 00000000..56fa72a0 --- /dev/null +++ b/backend/kernels/loop4.cl @@ -0,0 +1,12 @@ +#include "stdlib.h" + +struct big { int x[10]; }; + +__kernel void add(__global int *dst, unsigned int x, struct big b) +{ + if (get_local_id(1) > 4) + for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; + else + for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++; +} + diff --git a/backend/kernels/loop4.cl.ll b/backend/kernels/loop4.cl.ll new file mode 100644 index 00000000..618c5503 --- /dev/null +++ b/backend/kernels/loop4.cl.ll @@ -0,0 +1,129 @@ +; ModuleID = 'loop4.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { +get_local_id.exit: + %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + %cmp = icmp ugt i32 %call3.i, 4 + br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader + +for.cond.preheader: ; preds = %get_local_id.exit + %cmp124 = icmp eq i32 %x, 0 + br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph + +get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit17 + +for.cond5.preheader: ; preds = %get_local_id.exit + %mul.mask = and i32 %x, 2147483647 + %cmp621 = icmp eq i32 %mul.mask, 0 + br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph + +get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader + %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %0 = shl i32 %x, 1 + br label %get_local_id.exit20 + +get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph + %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ] + %add = add i32 %call.i, %i.025 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add + %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc3 = add nsw i32 %i.025, 1 + %exitcond26 = icmp eq i32 %inc3, %x + br i1 %exitcond26, label %if.end, label %get_local_id.exit17 + +get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph + %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ] + %add9 = add i32 %i4.022, %x + %add10 = add i32 %add9, %call.i18 + %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10 + %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1 + %inc12 = add nsw i32 %2, 1 + store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1 + %inc14 = add nsw i32 %i4.022, 1 + %exitcond = icmp eq i32 %inc14, %0 + br i1 %exitcond, label %if.end, label %get_local_id.exit20 + +if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop4.ll b/backend/kernels/loop4.ll new file mode 100644 index 00000000..8b5a7463 --- /dev/null +++ b/backend/kernels/loop4.ll @@ -0,0 +1,67 @@ +; ModuleID = 'loop4.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, %struct.big* nocapture byval %b) nounwind noinline { +get_local_id.exit: + %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + %cmp = icmp ugt i32 %call3.i, 4 + br i1 %cmp, label %for.cond.preheader, label %for.cond5.preheader + +for.cond.preheader: ; preds = %get_local_id.exit + %cmp124 = icmp eq i32 %x, 0 + br i1 %cmp124, label %if.end, label %get_local_id.exit17.lr.ph + +get_local_id.exit17.lr.ph: ; preds = %for.cond.preheader + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit17 + +for.cond5.preheader: ; preds = %get_local_id.exit + %mul.mask = and i32 %x, 2147483647 + %cmp621 = icmp eq i32 %mul.mask, 0 + br i1 %cmp621, label %if.end, label %get_local_id.exit20.lr.ph + +get_local_id.exit20.lr.ph: ; preds = %for.cond5.preheader + %call.i18 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %0 = shl i32 %x, 1 + br label %get_local_id.exit20 + +get_local_id.exit17: ; preds = %get_local_id.exit17, %get_local_id.exit17.lr.ph + %i.025 = phi i32 [ 0, %get_local_id.exit17.lr.ph ], [ %inc3, %get_local_id.exit17 ] + %add = add i32 %call.i, %i.025 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add + %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc3 = add nsw i32 %i.025, 1 + %exitcond26 = icmp eq i32 %inc3, %x + br i1 %exitcond26, label %if.end, label %get_local_id.exit17 + +get_local_id.exit20: ; preds = %get_local_id.exit20, %get_local_id.exit20.lr.ph + %i4.022 = phi i32 [ 0, %get_local_id.exit20.lr.ph ], [ %inc14, %get_local_id.exit20 ] + %add9 = add i32 %i4.022, %x + %add10 = add i32 %add9, %call.i18 + %arrayidx11 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %add10 + %2 = load i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1 + %inc12 = add nsw i32 %2, 1 + store i32 %inc12, i32 addrspace(1)* %arrayidx11, align 4, !tbaa !1 + %inc14 = add nsw i32 %i4.022, 1 + %exitcond = icmp eq i32 %inc14, %0 + br i1 %exitcond, label %if.end, label %get_local_id.exit20 + +if.end: ; preds = %get_local_id.exit20, %get_local_id.exit17, %for.cond5.preheader, %for.cond.preheader + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop5.cl b/backend/kernels/loop5.cl new file mode 100644 index 00000000..d4a3aa24 --- /dev/null +++ b/backend/kernels/loop5.cl @@ -0,0 +1,17 @@ +#include "stdlib.h" + +struct big { int x[10]; }; + +__kernel void add(__global int *dst0, __global int *dst1, unsigned int x, int y, struct big b) +{ + __global int *dst = NULL; + if (y > 0) + dst = dst0; + else + dst = dst1; + if (get_local_id(1) > 4) + for (int i = 0; i < x; ++i) dst[get_local_id(0) + i]++; + else + for (int i = 0; i < 2*x; ++i) dst[get_local_id(0) + i + x]++; +} + diff --git a/backend/kernels/loop5.cl.ll b/backend/kernels/loop5.cl.ll new file mode 100644 index 00000000..b97ad7b5 --- /dev/null +++ b/backend/kernels/loop5.cl.ll @@ -0,0 +1,131 @@ +; ModuleID = 'loop5.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline { +get_local_id.exit: + %cmp = icmp sgt i32 %y, 0 + %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1 + %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + %cmp1 = icmp ugt i32 %call3.i, 4 + br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader + +for.cond.preheader: ; preds = %get_local_id.exit + %cmp328 = icmp eq i32 %x, 0 + br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph + +get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit21 + +for.cond8.preheader: ; preds = %get_local_id.exit + %mul.mask = and i32 %x, 2147483647 + %cmp925 = icmp eq i32 %mul.mask, 0 + br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph + +get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader + %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %0 = shl i32 %x, 1 + br label %get_local_id.exit24 + +get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph + %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ] + %add = add i32 %call.i, %i.029 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add + %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc5 = add nsw i32 %i.029, 1 + %exitcond30 = icmp eq i32 %inc5, %x + br i1 %exitcond30, label %if.end19, label %get_local_id.exit21 + +get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph + %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ] + %add12 = add i32 %i7.026, %x + %add13 = add i32 %add12, %call.i22 + %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13 + %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1 + %inc15 = add nsw i32 %2, 1 + store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1 + %inc17 = add nsw i32 %i7.026, 1 + %exitcond = icmp eq i32 %inc17, %0 + br i1 %exitcond, label %if.end19, label %get_local_id.exit24 + +if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/loop5.ll b/backend/kernels/loop5.ll new file mode 100644 index 00000000..089beee5 --- /dev/null +++ b/backend/kernels/loop5.ll @@ -0,0 +1,131 @@ +; ModuleID = 'loop5.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.big = type { [10 x i32] } + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst0, i32 addrspace(1)* nocapture %dst1, i32 %x, i32 %y, %struct.big* nocapture byval %b) nounwind noinline { +get_local_id.exit: + %cmp = icmp sgt i32 %y, 0 + %dst0.dst1 = select i1 %cmp, i32 addrspace(1)* %dst0, i32 addrspace(1)* %dst1 + %call3.i = tail call ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + %cmp1 = icmp ugt i32 %call3.i, 4 + br i1 %cmp1, label %for.cond.preheader, label %for.cond8.preheader + +for.cond.preheader: ; preds = %get_local_id.exit + %cmp328 = icmp eq i32 %x, 0 + br i1 %cmp328, label %if.end19, label %get_local_id.exit21.lr.ph + +get_local_id.exit21.lr.ph: ; preds = %for.cond.preheader + %call.i = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + br label %get_local_id.exit21 + +for.cond8.preheader: ; preds = %get_local_id.exit + %mul.mask = and i32 %x, 2147483647 + %cmp925 = icmp eq i32 %mul.mask, 0 + br i1 %cmp925, label %if.end19, label %get_local_id.exit24.lr.ph + +get_local_id.exit24.lr.ph: ; preds = %for.cond8.preheader + %call.i22 = tail call ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + %0 = shl i32 %x, 1 + br label %get_local_id.exit24 + +get_local_id.exit21: ; preds = %get_local_id.exit21, %get_local_id.exit21.lr.ph + %i.029 = phi i32 [ 0, %get_local_id.exit21.lr.ph ], [ %inc5, %get_local_id.exit21 ] + %add = add i32 %call.i, %i.029 + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add + %1 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %1, 1 + store i32 %inc, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc5 = add nsw i32 %i.029, 1 + %exitcond30 = icmp eq i32 %inc5, %x + br i1 %exitcond30, label %if.end19, label %get_local_id.exit21 + +get_local_id.exit24: ; preds = %get_local_id.exit24, %get_local_id.exit24.lr.ph + %i7.026 = phi i32 [ 0, %get_local_id.exit24.lr.ph ], [ %inc17, %get_local_id.exit24 ] + %add12 = add i32 %i7.026, %x + %add13 = add i32 %add12, %call.i22 + %arrayidx14 = getelementptr inbounds i32 addrspace(1)* %dst0.dst1, i32 %add13 + %2 = load i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1 + %inc15 = add nsw i32 %2, 1 + store i32 %inc15, i32 addrspace(1)* %arrayidx14, align 4, !tbaa !1 + %inc17 = add nsw i32 %i7.026, 1 + %exitcond = icmp eq i32 %inc17, %0 + br i1 %exitcond, label %if.end19, label %get_local_id.exit24 + +if.end19: ; preds = %get_local_id.exit24, %get_local_id.exit21, %for.cond8.preheader, %for.cond.preheader + ret void +} + +declare ptx_device i32 @__gen_ocl_get_local_id0() nounwind readnone + +declare ptx_device i32 @__gen_ocl_get_local_id1() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*, i32, i32, %struct.big*)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/mad.cl b/backend/kernels/mad.cl deleted file mode 100644 index 5875a9bf..00000000 --- a/backend/kernels/mad.cl +++ /dev/null @@ -1,18 +0,0 @@ -#include "stdlib.h" -__attribute__((pure, overloadable)) int mad(int,int,int); -__attribute__((pure, overloadable)) float mad(float,float,float); -__attribute__((pure, overloadable)) float4 mad(float4,float4,float4); - -__kernel void add(__global int *dst, unsigned int x, float z) -{ - for (int i = 0; i < x; ++i) { - int y = mad(dst[i], 2, 3); - y = mad(dst[i], 2, 3); - float z = mad((float) dst[i], 2.f, 3.f); - float4 z0 = mad((float4) dst[i], (float4)(0.f,1.f,2.f,3.f), (float4)3.f); - float4 x0 = z0 * (float4) 2.f; - dst[i] = y + (int) z + x0.x + x0.y + x0.z; - } -} - - diff --git a/backend/kernels/mad.cl.ll b/backend/kernels/mad.cl.ll new file mode 100644 index 00000000..6a281450 --- /dev/null +++ b/backend/kernels/mad.cl.ll @@ -0,0 +1,113 @@ +; ModuleID = 'mad.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @add(i32 addrspace(1)* nocapture %dst, i32 %x, float %z) nounwind noinline { +entry: + %cmp16 = icmp eq i32 %x, 0 + br i1 %cmp16, label %for.end, label %for.body + +for.body: ; preds = %for.body, %entry + %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32 addrspace(1)* %dst, i32 %i.017 + %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly + %conv = sitofp i32 %0 to float + %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readnone + %call.i = tail call ptx_device float @_Z3madfff(float %conv, float 0.000000e+00, float 3.000000e+00) nounwind readnone + %vecinit.i = insertelement <4 x float> undef, float %call.i, i32 0 + %call1.i = tail call ptx_device float @_Z3madfff(float %conv, float 1.000000e+00, float 3.000000e+00) nounwind readnone + %vecinit2.i = insertelement <4 x float> %vecinit.i, float %call1.i, i32 1 + %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %call5, i32 2 + %call5.i = tail call ptx_device float @_Z3madfff(float %conv, float 3.000000e+00, float 3.000000e+00) nounwind readnone + %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %call5.i, i32 3 + %mul = fmul <4 x float> %vecinit6.i, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> + %conv9 = fptosi float %call5 to i32 + %add = add nsw i32 %conv9, %call2 + %conv10 = sitofp i32 %add to float + %1 = extractelement <4 x float> %mul, i32 0 + %add11 = fadd float %conv10, %1 + %2 = extractelement <4 x float> %mul, i32 1 + %add12 = fadd float %add11, %2 + %3 = extractelement <4 x float> %mul, i32 2 + %add13 = fadd float %add12, %3 + %conv14 = fptosi float %add13 to i32 + store i32 %conv14, i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %inc = add nsw i32 %i.017, 1 + %exitcond = icmp eq i32 %inc, %x + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32, float)* @add} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/mad.ll b/backend/kernels/mad.ll deleted file mode 100644 index 6bd19daf..00000000 --- a/backend/kernels/mad.ll +++ /dev/null @@ -1,51 +0,0 @@ -; ModuleID = 'mad.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @add(i32* nocapture %dst, i32 %x, float %z) nounwind noinline { -entry: - %cmp16 = icmp eq i32 %x, 0 - br i1 %cmp16, label %for.end, label %for.body - -for.body: ; preds = %for.body, %entry - %i.017 = phi i32 [ %inc, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i32* %dst, i32 %i.017 - %0 = load i32* %arrayidx, align 4, !tbaa !1 - %call2 = tail call ptx_device i32 @_Z3madiii(i32 %0, i32 2, i32 3) nounwind readonly - %conv = sitofp i32 %0 to float - %call5 = tail call ptx_device float @_Z3madfff(float %conv, float 2.000000e+00, float 3.000000e+00) nounwind readonly - %1 = insertelement <4 x float> undef, float %conv, i32 0 - %splat = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer - %call8 = tail call ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %splat, <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>) nounwind readonly - %mul = fmul <4 x float> %call8, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00> - %conv9 = fptosi float %call5 to i32 - %add = add nsw i32 %conv9, %call2 - %conv10 = sitofp i32 %add to float - %2 = extractelement <4 x float> %mul, i32 0 - %add11 = fadd float %conv10, %2 - %3 = extractelement <4 x float> %mul, i32 1 - %add12 = fadd float %add11, %3 - %4 = extractelement <4 x float> %mul, i32 2 - %add13 = fadd float %add12, %4 - %conv14 = fptosi float %add13 to i32 - store i32 %conv14, i32* %arrayidx, align 4, !tbaa !1 - %inc = add nsw i32 %i.017, 1 - %exitcond = icmp eq i32 %inc, %x - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -declare ptx_device i32 @_Z3madiii(i32, i32, i32) nounwind readonly - -declare ptx_device float @_Z3madfff(float, float, float) nounwind readonly - -declare ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float>, <4 x float>, <4 x float>) nounwind readonly - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*, i32, float)* @add} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/select.cl.ll b/backend/kernels/select.cl.ll new file mode 100644 index 00000000..ebf1ad0b --- /dev/null +++ b/backend/kernels/select.cl.ll @@ -0,0 +1,100 @@ +; ModuleID = 'select.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @test_select(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src0, <4 x i32> addrspace(1)* nocapture %src1) nounwind noinline { +entry: + %0 = load <4 x i32> addrspace(1)* %src0, align 16, !tbaa !1 + %arrayidx1 = getelementptr inbounds <4 x i32> addrspace(1)* %src0, i32 1 + %1 = load <4 x i32> addrspace(1)* %arrayidx1, align 16, !tbaa !1 + %2 = extractelement <4 x i32> %0, i32 0 + %3 = extractelement <4 x i32> %1, i32 0 + %4 = extractelement <4 x i32> %0, i32 1 + %5 = extractelement <4 x i32> %1, i32 1 + %6 = extractelement <4 x i32> %0, i32 2 + %7 = extractelement <4 x i32> %1, i32 2 + %8 = extractelement <4 x i32> %0, i32 3 + %9 = extractelement <4 x i32> %1, i32 3 + %tobool.i = icmp slt i32 %3, 0 + %cond1.i = select i1 %tobool.i, i32 %3, i32 %2 + %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0 + %tobool3.i = icmp slt i32 %5, 0 + %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4 + %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1 + %tobool9.i = icmp slt i32 %7, 0 + %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6 + %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2 + %tobool15.i = icmp slt i32 %9, 0 + %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8 + %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3 + store <4 x i32> %13, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*)* @test_select} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/select.ll b/backend/kernels/select.ll deleted file mode 100644 index a3d7e16f..00000000 --- a/backend/kernels/select.ll +++ /dev/null @@ -1,38 +0,0 @@ -; ModuleID = 'select.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @test_select(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src0, <4 x i32>* nocapture %src1) nounwind noinline { -entry: - %0 = load <4 x i32>* %src0, align 16, !tbaa !1 - %arrayidx1 = getelementptr inbounds <4 x i32>* %src0, i32 1 - %1 = load <4 x i32>* %arrayidx1, align 16, !tbaa !1 - %2 = extractelement <4 x i32> %0, i32 0 - %3 = extractelement <4 x i32> %1, i32 0 - %4 = extractelement <4 x i32> %0, i32 1 - %5 = extractelement <4 x i32> %1, i32 1 - %6 = extractelement <4 x i32> %0, i32 2 - %7 = extractelement <4 x i32> %1, i32 2 - %8 = extractelement <4 x i32> %0, i32 3 - %9 = extractelement <4 x i32> %1, i32 3 - %tobool.i = icmp slt i32 %3, 0 - %cond1.i = select i1 %tobool.i, i32 %3, i32 %2 - %10 = insertelement <4 x i32> undef, i32 %cond1.i, i32 0 - %tobool3.i = icmp slt i32 %5, 0 - %cond7.i = select i1 %tobool3.i, i32 %5, i32 %4 - %11 = insertelement <4 x i32> %10, i32 %cond7.i, i32 1 - %tobool9.i = icmp slt i32 %7, 0 - %cond13.i = select i1 %tobool9.i, i32 %7, i32 %6 - %12 = insertelement <4 x i32> %11, i32 %cond13.i, i32 2 - %tobool15.i = icmp slt i32 %9, 0 - %cond19.i = select i1 %tobool15.i, i32 %9, i32 %8 - %13 = insertelement <4 x i32> %12, i32 %cond19.i, i32 3 - store <4 x i32> %13, <4 x i32>* %dst, align 16, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x i32>*, <4 x i32>*, <4 x i32>*)* @test_select} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/short.cl.ll b/backend/kernels/short.cl.ll new file mode 100644 index 00000000..8ad601be --- /dev/null +++ b/backend/kernels/short.cl.ll @@ -0,0 +1,79 @@ +; ModuleID = 'short.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @short_write(i16 addrspace(1)* nocapture %dst, i16 %x, i16 %y) nounwind noinline { +entry: + %add = add i16 %y, %x + store i16 %add, i16 addrspace(1)* %dst, align 2, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i16 addrspace(1)*, i16, i16)* @short_write} +!1 = metadata !{metadata !"short", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/short.ll b/backend/kernels/short.ll deleted file mode 100644 index 6225107a..00000000 --- a/backend/kernels/short.ll +++ /dev/null @@ -1,17 +0,0 @@ -; ModuleID = 'short.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @short_write(i16* nocapture %dst, i16 %x, i16 %y) nounwind noinline { -entry: - %add = add i16 %y, %x - store i16 %add, i16* %dst, align 2, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i16*, i16, i16)* @short_write} -!1 = metadata !{metadata !"short", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/shuffle.cl.ll b/backend/kernels/shuffle.cl.ll new file mode 100644 index 00000000..d503d143 --- /dev/null +++ b/backend/kernels/shuffle.cl.ll @@ -0,0 +1,79 @@ +; ModuleID = 'shuffle.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @shuffle(<4 x i32> addrspace(1)* nocapture %dst, <4 x i32> addrspace(1)* nocapture %src, i32 %c) nounwind noinline { +entry: + %0 = load <4 x i32> addrspace(1)* %src, align 16, !tbaa !1 + %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> + store <4 x i32> %1, <4 x i32> addrspace(1)* %dst, align 16, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x i32> addrspace(1)*, <4 x i32> addrspace(1)*, i32)* @shuffle} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/shuffle.ll b/backend/kernels/shuffle.ll deleted file mode 100644 index e17a6844..00000000 --- a/backend/kernels/shuffle.ll +++ /dev/null @@ -1,17 +0,0 @@ -; ModuleID = 'shuffle.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @shuffle(<4 x i32>* nocapture %dst, <4 x i32>* nocapture %src, i32 %c) nounwind noinline { -entry: - %0 = load <4 x i32>* %src, align 16, !tbaa !1 - %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2> - store <4 x i32> %1, <4 x i32>* %dst, align 16, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x i32>*, <4 x i32>*, i32)* @shuffle} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4.cl.ll b/backend/kernels/simple_float4.cl.ll new file mode 100644 index 00000000..c11f23a1 --- /dev/null +++ b/backend/kernels/simple_float4.cl.ll @@ -0,0 +1,83 @@ +; ModuleID = 'simple_float4.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { +get_global_id.exit5: + %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i + %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 + %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4.ll b/backend/kernels/simple_float4.ll deleted file mode 100644 index 0e052084..00000000 --- a/backend/kernels/simple_float4.ll +++ /dev/null @@ -1,21 +0,0 @@ -; ModuleID = 'simple_float4.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline { -get_global_id.exit5: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i - %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1 - %arrayidx2 = getelementptr inbounds <4 x float>* %dst, i32 %call.i - store <4 x float> %0, <4 x float>* %arrayidx2, align 16, !tbaa !1 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4_2.cl.ll b/backend/kernels/simple_float4_2.cl.ll new file mode 100644 index 00000000..8d7f4fbc --- /dev/null +++ b/backend/kernels/simple_float4_2.cl.ll @@ -0,0 +1,84 @@ +; ModuleID = 'simple_float4_2.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { +get_global_id.exit10: + %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i + %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 + %mul = fmul <4 x float> %0, %0 + %arrayidx4 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + store <4 x float> %mul, <4 x float> addrspace(1)* %arrayidx4, align 16, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4_2.ll b/backend/kernels/simple_float4_2.ll deleted file mode 100644 index 4f5e1da4..00000000 --- a/backend/kernels/simple_float4_2.ll +++ /dev/null @@ -1,22 +0,0 @@ -; ModuleID = 'simple_float4_2.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src) nounwind noinline { -get_global_id.exit10: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i - %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1 - %mul = fmul <4 x float> %0, %0 - %arrayidx4 = getelementptr inbounds <4 x float>* %dst, i32 %call.i - store <4 x float> %mul, <4 x float>* %arrayidx4, align 16, !tbaa !1 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x float>*, <4 x float>*)* @simple_float4} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4_3.cl b/backend/kernels/simple_float4_3.cl index 25c8fe4f..d908433f 100644 --- a/backend/kernels/simple_float4_3.cl +++ b/backend/kernels/simple_float4_3.cl @@ -2,7 +2,7 @@ __kernel void simple_float4(__global float4 *dst, __global float4 *src, bool b) { - dst[get_global_id(0)] = select(b, src[get_global_id(0)], src[get_global_id(1)]); + dst[get_global_id(0)] = select(src[get_global_id(0)], src[get_global_id(1)], (int4)(b)); dst[get_global_id(0)] += (float4) (src[2].x, 1.f, 2.f, 3.f); } diff --git a/backend/kernels/simple_float4_3.cl.ll b/backend/kernels/simple_float4_3.cl.ll new file mode 100644 index 00000000..a41afb18 --- /dev/null +++ b/backend/kernels/simple_float4_3.cl.ll @@ -0,0 +1,92 @@ +; ModuleID = 'simple_float4_3.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src, i1 %b) nounwind noinline { +get_global_id.exit16: + %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i + %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 + %arrayidx5 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + store <4 x float> %0, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1 + %arrayidx6 = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 2 + %1 = load <4 x float> addrspace(1)* %arrayidx6, align 16 + %2 = extractelement <4 x float> %1, i32 0 + %vecinit = insertelement <4 x float> undef, float %2, i32 0 + %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1 + %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2 + %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3 + %add = fadd <4 x float> %0, %vecinit9 + store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx5, align 16, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*, i1)* @simple_float4} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/simple_float4_3.ll b/backend/kernels/simple_float4_3.ll deleted file mode 100644 index c1bdd31c..00000000 --- a/backend/kernels/simple_float4_3.ll +++ /dev/null @@ -1,36 +0,0 @@ -; ModuleID = 'simple_float4_3.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @simple_float4(<4 x float>* nocapture %dst, <4 x float>* nocapture %src, i1 %b) nounwind noinline { -get_global_id.exit16: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds <4 x float>* %src, i32 %call.i - %0 = load <4 x float>* %arrayidx, align 16, !tbaa !1 - %call3.i = tail call ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone - %arrayidx2 = getelementptr inbounds <4 x float>* %src, i32 %call3.i - %1 = load <4 x float>* %arrayidx2, align 16, !tbaa !1 - %x.y.i = select i1 %b, <4 x float> %0, <4 x float> %1 - %arrayidx5 = getelementptr inbounds <4 x float>* %dst, i32 %call.i - store <4 x float> %x.y.i, <4 x float>* %arrayidx5, align 16, !tbaa !1 - %arrayidx6 = getelementptr inbounds <4 x float>* %src, i32 2 - %2 = load <4 x float>* %arrayidx6, align 16 - %3 = extractelement <4 x float> %2, i32 0 - %vecinit = insertelement <4 x float> undef, float %3, i32 0 - %vecinit7 = insertelement <4 x float> %vecinit, float 1.000000e+00, i32 1 - %vecinit8 = insertelement <4 x float> %vecinit7, float 2.000000e+00, i32 2 - %vecinit9 = insertelement <4 x float> %vecinit8, float 3.000000e+00, i32 3 - %add = fadd <4 x float> %x.y.i, %vecinit9 - store <4 x float> %add, <4 x float>* %arrayidx5, align 16, !tbaa !1 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - -declare ptx_device i32 @__gen_ocl_get_global_id1() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (<4 x float>*, <4 x float>*, i1)* @simple_float4} -!1 = metadata !{metadata !"omnipotent char", metadata !2} -!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/stdlib.h b/backend/kernels/stdlib.h index 0701ff83..7d4e97ec 100644 --- a/backend/kernels/stdlib.h +++ b/backend/kernels/stdlib.h @@ -23,6 +23,7 @@ __attribute__((pure,const)) unsigned int __gen_ocl_get_global_id2(void); __attribute__((pure,const)) unsigned int __gen_ocl_get_local_id0(void); __attribute__((pure,const)) unsigned int __gen_ocl_get_local_id1(void); __attribute__((pure,const)) unsigned int __gen_ocl_get_local_id2(void); +__attribute__ ((pure,const,overloadable)) float mad(float a, float b, float c); inline unsigned get_global_id(unsigned int dim) { if (dim == 0) return __gen_ocl_get_global_id0(); @@ -59,30 +60,48 @@ typedef bool bool2 __attribute__((ext_vector_type(2))); typedef bool bool3 __attribute__((ext_vector_type(3))); typedef bool bool4 __attribute__((ext_vector_type(4))); -__attribute__((overloadable)) inline int4 select(int4 src0, int4 src1, int4 cond) { - int4 dst; - const int x0 = src0.x; // Fix performance issue with CLANG - const int x1 = src1.x; - const int y0 = src0.y; - const int y1 = src1.y; - const int z0 = src0.z; - const int z1 = src1.z; - const int w0 = src0.w; - const int w1 = src1.w; +// This will be optimized out by LLVM and will output LLVM select instructions +#define DECL_SELECT4(TYPE4, TYPE, COND_TYPE4, MASK) \ +__attribute__((overloadable)) \ +inline TYPE4 select(TYPE4 src0, TYPE4 src1, COND_TYPE4 cond) { \ + TYPE4 dst; \ + const TYPE x0 = src0.x; /* Fix performance issue with CLANG */ \ + const TYPE x1 = src1.x; \ + const TYPE y0 = src0.y; \ + const TYPE y1 = src1.y; \ + const TYPE z0 = src0.z; \ + const TYPE z1 = src1.z; \ + const TYPE w0 = src0.w; \ + const TYPE w1 = src1.w; \ + \ + dst.x = (cond.x & MASK) ? x1 : x0; \ + dst.y = (cond.y & MASK) ? y1 : y0; \ + dst.z = (cond.z & MASK) ? z1 : z0; \ + dst.w = (cond.w & MASK) ? w1 : w0; \ + return dst; \ +} +DECL_SELECT4(int4, int, int4, 0x80000000) +DECL_SELECT4(float4, float, int4, 0x80000000) +#undef DECL_SELECT4 - dst.x = (cond.x & 0x80000000) ? x1 : x0; - dst.y = (cond.y & 0x80000000) ? y1 : y0; - dst.z = (cond.z & 0x80000000) ? z1 : z0; - dst.w = (cond.w & 0x80000000) ? w1 : w0; - return dst; +__attribute__((overloadable)) float2 mad(float2 a, float2 b, float2 c) { + return (float2)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y)); +} +__attribute__((overloadable)) float3 mad(float3 a, float3 b, float3 c) { + return (float3)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), mad(a.z,b.z,c.z)); +} +__attribute__((overloadable)) float4 mad(float4 a, float4 b, float4 c) { + return (float4)(mad(a.x,b.x,c.x), mad(a.y,b.y,c.y), + mad(a.z,b.z,c.z), mad(a.w,b.w,c.w)); } #define __private __attribute__((address_space(0))) #define __global __attribute__((address_space(1))) #define __constant __attribute__((address_space(2))) -#define __local __attribute__((address_space(3))) +//#define __local __attribute__((address_space(3))) #define global __global -#define local __local +//#define local __local #define constant __constant #define private __private +#define NULL ((void*)0) diff --git a/backend/kernels/store.cl.ll b/backend/kernels/store.cl.ll new file mode 100644 index 00000000..b74e69a6 --- /dev/null +++ b/backend/kernels/store.cl.ll @@ -0,0 +1,78 @@ +; ModuleID = 'store.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @store(i32 addrspace(1)* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline { +entry: + store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(4)*, i32)* @store} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/store.ll b/backend/kernels/store.ll deleted file mode 100644 index c885d62e..00000000 --- a/backend/kernels/store.ll +++ /dev/null @@ -1,16 +0,0 @@ -; ModuleID = 'store.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @store(i32* nocapture %dst, i32 addrspace(4)* nocapture %dst0, i32 %x) nounwind noinline { -entry: - store i32 1, i32* %dst, align 4, !tbaa !1 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*, i32 addrspace(4)*, i32)* @store} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/struct.cl.ll b/backend/kernels/struct.cl.ll new file mode 100644 index 00000000..fc89f46d --- /dev/null +++ b/backend/kernels/struct.cl.ll @@ -0,0 +1,128 @@ +; ModuleID = 'struct.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.my_struct = type { i32, [2 x i32] } + +@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 +@struct_cl.hop = internal addrspace(4) unnamed_addr global %struct.my_struct zeroinitializer, align 4 +@struct_cl.array = internal addrspace(4) global [256 x %struct.my_struct] zeroinitializer, align 4 + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, i32 addrspace(1)* nocapture %mem, i32 %y) nounwind noinline { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %i.023 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %a = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 0 + store i32 %i.023, i32 addrspace(4)* %a, align 4, !tbaa !1 + %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %i.023, i32 1, i32 0 + %add = add nsw i32 %i.023, 1 + store i32 %add, i32 addrspace(4)* %arrayidx2, align 4, !tbaa !1 + %exitcond = icmp eq i32 %add, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + %cmp6 = icmp eq i32 %y, 0 + br i1 %cmp6, label %if.then, label %if.else + +if.then: ; preds = %for.end + tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i32 12, i32 4, i1 false) + br label %if.end + +if.else: ; preds = %for.end + %add8 = add nsw i32 %y, 1 + %arrayidx9 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add8 + %0 = bitcast %struct.my_struct addrspace(4)* %arrayidx9 to i8 addrspace(4)* + tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i8 addrspace(4)* %0, i32 12, i32 4, i1 false) + br label %if.end + +if.end: ; preds = %if.else, %if.then + tail call void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* bitcast ([256 x %struct.my_struct] addrspace(4)* @struct_cl.array to i8 addrspace(4)*), i8 addrspace(4)* bitcast (%struct.my_struct addrspace(4)* @struct_cl.hop to i8 addrspace(4)*), i32 12, i32 4, i1 false) + %a10 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 + %1 = load i32* %a10, align 4, !tbaa !1 + %a12 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %x, i32 0 + %2 = load i32 addrspace(4)* %a12, align 4, !tbaa !1 + %add14 = add nsw i32 %x, 1 + %arrayidx17 = getelementptr inbounds [256 x %struct.my_struct] addrspace(4)* @struct_cl.array, i32 0, i32 %add14, i32 1, i32 0 + %3 = load i32 addrspace(4)* %arrayidx17, align 4, !tbaa !1 + %arrayidx19 = getelementptr inbounds [4 x i32] addrspace(2)* @g, i32 0, i32 %x + %4 = load i32 addrspace(2)* %arrayidx19, align 4, !tbaa !1 + %add13 = add i32 %1, 3 + %add18 = add i32 %add13, %2 + %add20 = add i32 %add18, %3 + %add21 = add i32 %add20, %4 + store i32 %add21, i32 addrspace(1)* %mem, align 4, !tbaa !1 + ret void +} + +declare void @llvm.memcpy.p4i8.p4i8.i32(i8 addrspace(4)* nocapture, i8 addrspace(4)* nocapture, i32, i32, i1) nounwind + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.my_struct*, i32, i32 addrspace(1)*, i32)* @struct_cl} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/struct.ll b/backend/kernels/struct.ll deleted file mode 100644 index d1273bee..00000000 --- a/backend/kernels/struct.ll +++ /dev/null @@ -1,121 +0,0 @@ -; ModuleID = 'struct.o' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -%struct.my_struct = type { i32, [2 x i32] } - -@g = constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 16 -@struct_cl.hop = internal global %struct.my_struct zeroinitializer, align 4 -@struct_cl.array = internal global [256 x %struct.my_struct] zeroinitializer, align 16 - -define void @struct_cl(i64 %s.coerce0, i32 %s.coerce1, i32 %x, i32* %mem, i32 %y) nounwind uwtable { -entry: - %s = alloca %struct.my_struct, align 8 - %x.addr = alloca i32, align 4 - %mem.addr = alloca i32*, align 8 - %y.addr = alloca i32, align 4 - %i = alloca i32, align 4 - %0 = bitcast %struct.my_struct* %s to { i64, i32 }* - %1 = getelementptr { i64, i32 }* %0, i32 0, i32 0 - store i64 %s.coerce0, i64* %1 - %2 = getelementptr { i64, i32 }* %0, i32 0, i32 1 - store i32 %s.coerce1, i32* %2 - store i32 %x, i32* %x.addr, align 4 - store i32* %mem, i32** %mem.addr, align 8 - store i32 %y, i32* %y.addr, align 4 - store i32 0, i32* %i, align 4 - br label %for.cond - -for.cond: ; preds = %for.inc, %entry - %3 = load i32* %i, align 4 - %cmp = icmp slt i32 %3, 256 - br i1 %cmp, label %for.body, label %for.end - -for.body: ; preds = %for.cond - %4 = load i32* %i, align 4 - %5 = load i32* %i, align 4 - %idxprom = sext i32 %5 to i64 - %arrayidx = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom - %a = getelementptr inbounds %struct.my_struct* %arrayidx, i32 0, i32 0 - store i32 %4, i32* %a, align 4 - %6 = load i32* %i, align 4 - %7 = load i32* %i, align 4 - %idxprom1 = sext i32 %7 to i64 - %arrayidx2 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom1 - %b = getelementptr inbounds %struct.my_struct* %arrayidx2, i32 0, i32 1 - %arrayidx3 = getelementptr inbounds [2 x i32]* %b, i32 0, i64 0 - store i32 %6, i32* %arrayidx3, align 4 - %8 = load i32* %i, align 4 - %add = add nsw i32 %8, 1 - %9 = load i32* %i, align 4 - %idxprom4 = sext i32 %9 to i64 - %arrayidx5 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom4 - %b6 = getelementptr inbounds %struct.my_struct* %arrayidx5, i32 0, i32 1 - %arrayidx7 = getelementptr inbounds [2 x i32]* %b6, i32 0, i64 0 - store i32 %add, i32* %arrayidx7, align 4 - br label %for.inc - -for.inc: ; preds = %for.body - %10 = load i32* %i, align 4 - %inc = add nsw i32 %10, 1 - store i32 %inc, i32* %i, align 4 - br label %for.cond - -for.end: ; preds = %for.cond - %11 = load i32* %y.addr, align 4 - %cmp8 = icmp eq i32 %11, 0 - br i1 %cmp8, label %if.then, label %if.else - -if.then: ; preds = %for.end - %12 = load i32* %y.addr, align 4 - %idxprom9 = sext i32 %12 to i64 - %arrayidx10 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom9 - %13 = bitcast %struct.my_struct* %arrayidx10 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %13, i64 12, i32 4, i1 false) - br label %if.end - -if.else: ; preds = %for.end - %14 = load i32* %y.addr, align 4 - %add11 = add nsw i32 %14, 1 - %idxprom12 = sext i32 %add11 to i64 - %arrayidx13 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom12 - %15 = bitcast %struct.my_struct* %arrayidx13 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i8* %15, i64 12, i32 4, i1 false) - br label %if.end - -if.end: ; preds = %if.else, %if.then - call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([256 x %struct.my_struct]* @struct_cl.array to i8*), i8* bitcast (%struct.my_struct* @struct_cl.hop to i8*), i64 12, i32 4, i1 false) - %a14 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 - %16 = load i32* %a14, align 4 - %17 = load i32* %x.addr, align 4 - %idxprom15 = sext i32 %17 to i64 - %arrayidx16 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom15 - %a17 = getelementptr inbounds %struct.my_struct* %arrayidx16, i32 0, i32 0 - %18 = load i32* %a17, align 4 - %add18 = add nsw i32 %16, %18 - %19 = load i32* %x.addr, align 4 - %add19 = add nsw i32 %19, 1 - %idxprom20 = sext i32 %add19 to i64 - %arrayidx21 = getelementptr inbounds [256 x %struct.my_struct]* @struct_cl.array, i32 0, i64 %idxprom20 - %b22 = getelementptr inbounds %struct.my_struct* %arrayidx21, i32 0, i32 1 - %arrayidx23 = getelementptr inbounds [2 x i32]* %b22, i32 0, i64 0 - %20 = load i32* %arrayidx23, align 4 - %add24 = add nsw i32 %add18, %20 - %21 = load i32* %x.addr, align 4 - %idxprom25 = sext i32 %21 to i64 - %arrayidx26 = getelementptr inbounds [4 x i32]* @g, i32 0, i64 %idxprom25 - %22 = load i32* %arrayidx26, align 4 - %add27 = add nsw i32 %add24, %22 - %23 = load i32* getelementptr inbounds ([4 x i32]* @g, i32 0, i64 3), align 4 - %add28 = add nsw i32 %add27, %23 - %24 = load i32** %mem.addr, align 8 - %arrayidx29 = getelementptr inbounds i32* %24, i64 0 - store i32 %add28, i32* %arrayidx29 - ret void -} - -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i64, i32, i32, i32*, i32)* @struct_cl} diff --git a/backend/kernels/struct2.cl.ll b/backend/kernels/struct2.cl.ll new file mode 100644 index 00000000..2c5f8b16 --- /dev/null +++ b/backend/kernels/struct2.cl.ll @@ -0,0 +1,100 @@ +; ModuleID = 'struct2.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +%struct.my_struct = type { i32, [2 x i32] } + +@g = addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct addrspace(1)* nocapture %mem, i32 %y) nounwind noinline { +entry: + %cmp = icmp eq i32 %y, 0 + br i1 %cmp, label %if.end, label %if.else + +if.else: ; preds = %entry + %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 + %tmp4 = load i32* %s.0, align 4 + %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0 + %tmp5 = load i32* %s.1.0, align 4 + %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1 + %tmp6 = load i32* %s.1.1, align 4 + br label %if.end + +if.end: ; preds = %if.else, %entry + %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ] + %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ] + %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ] + %mem.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 0 + store i32 %hop.0.0, i32 addrspace(1)* %mem.0, align 4 + %mem.1.0 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 0 + store i32 %hop.1.0.0, i32 addrspace(1)* %mem.1.0, align 4 + %mem.1.1 = getelementptr inbounds %struct.my_struct addrspace(1)* %mem, i32 0, i32 1, i32 1 + store i32 %hop.1.1.0, i32 addrspace(1)* %mem.1.1, align 4 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct addrspace(1)*, i32)* @struct_cl} diff --git a/backend/kernels/struct2.ll b/backend/kernels/struct2.ll deleted file mode 100644 index ef1f3ab6..00000000 --- a/backend/kernels/struct2.ll +++ /dev/null @@ -1,38 +0,0 @@ -; ModuleID = 'struct2.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -%struct.my_struct = type { i32, [2 x i32] } - -@g = addrspace(1) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3], align 4 - -define ptx_kernel void @struct_cl(%struct.my_struct* nocapture byval %s, i32 %x, %struct.my_struct* nocapture %mem, i32 %y) nounwind noinline { -entry: - %cmp = icmp eq i32 %y, 0 - br i1 %cmp, label %if.end, label %if.else - -if.else: ; preds = %entry - %s.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 0 - %tmp4 = load i32* %s.0, align 4 - %s.1.0 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 0 - %tmp5 = load i32* %s.1.0, align 4 - %s.1.1 = getelementptr inbounds %struct.my_struct* %s, i32 0, i32 1, i32 1 - %tmp6 = load i32* %s.1.1, align 4 - br label %if.end - -if.end: ; preds = %if.else, %entry - %hop.1.1.0 = phi i32 [ %tmp6, %if.else ], [ 2, %entry ] - %hop.1.0.0 = phi i32 [ %tmp5, %if.else ], [ 2, %entry ] - %hop.0.0 = phi i32 [ %tmp4, %if.else ], [ 1, %entry ] - %mem.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 0 - store i32 %hop.0.0, i32* %mem.0, align 4 - %mem.1.0 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 0 - store i32 %hop.1.0.0, i32* %mem.1.0, align 4 - %mem.1.1 = getelementptr inbounds %struct.my_struct* %mem, i32 0, i32 1, i32 1 - store i32 %hop.1.1.0, i32* %mem.1.1, align 4 - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (%struct.my_struct*, i32, %struct.my_struct*, i32)* @struct_cl} diff --git a/backend/kernels/test_select.cl.ll b/backend/kernels/test_select.cl.ll new file mode 100644 index 00000000..478dcbbf --- /dev/null +++ b/backend/kernels/test_select.cl.ll @@ -0,0 +1,86 @@ +; ModuleID = 'test_select.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @test_select(i32 addrspace(1)* nocapture %dst, i32 addrspace(1)* nocapture %src) nounwind noinline { +get_global_id.exit7: + %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds i32 addrspace(1)* %src, i32 %call.i + %0 = load i32 addrspace(1)* %arrayidx, align 4, !tbaa !1 + %cmp = icmp sgt i32 %0, 1 + %arrayidx2 = getelementptr inbounds i32 addrspace(1)* %dst, i32 %call.i + %. = select i1 %cmp, i32 1, i32 2 + store i32 %., i32 addrspace(1)* %arrayidx2, align 4 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*, i32 addrspace(1)*)* @test_select} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/test_select.ll b/backend/kernels/test_select.ll deleted file mode 100644 index 302251ca..00000000 --- a/backend/kernels/test_select.ll +++ /dev/null @@ -1,24 +0,0 @@ -; ModuleID = 'test_select.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @test_select(i32* nocapture %dst, i32* nocapture %src) nounwind noinline { -get_global_id.exit7: - %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - %arrayidx = getelementptr inbounds i32* %src, i32 %call.i - %0 = load i32* %arrayidx, align 4, !tbaa !1 - %cmp = icmp sgt i32 %0, 1 - %arrayidx2 = getelementptr inbounds i32* %dst, i32 %call.i - %. = select i1 %cmp, i32 1, i32 2 - store i32 %., i32* %arrayidx2, align 4 - ret void -} - -declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*, i32*)* @test_select} -!1 = metadata !{metadata !"int", metadata !2} -!2 = metadata !{metadata !"omnipotent char", metadata !3} -!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/undefined.cl.ll b/backend/kernels/undefined.cl.ll new file mode 100644 index 00000000..f6446426 --- /dev/null +++ b/backend/kernels/undefined.cl.ll @@ -0,0 +1,78 @@ +; ModuleID = 'undefined.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @undefined(i32 addrspace(1)* nocapture %dst) nounwind noinline { +entry: + store i32 1, i32 addrspace(1)* %dst, align 4, !tbaa !1 + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void (i32 addrspace(1)*)* @undefined} +!1 = metadata !{metadata !"int", metadata !2} +!2 = metadata !{metadata !"omnipotent char", metadata !3} +!3 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/undefined.ll b/backend/kernels/undefined.ll deleted file mode 100644 index a706e7ba..00000000 --- a/backend/kernels/undefined.ll +++ /dev/null @@ -1,32 +0,0 @@ -; ModuleID = 'undefined.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @undefined(i32* %dst) nounwind noinline { -entry: - %dst.addr = alloca i32*, align 4 - %x = alloca i32, align 4 - store i32* %dst, i32** %dst.addr, align 4 - %0 = load i32* %x, align 4 - %cmp = icmp eq i32 %0, 0 - br i1 %cmp, label %if.then, label %if.else - -if.then: ; preds = %entry - %1 = load i32** %dst.addr, align 4 - %arrayidx = getelementptr inbounds i32* %1, i32 0 - store i32 0, i32* %arrayidx - br label %if.end - -if.else: ; preds = %entry - %2 = load i32** %dst.addr, align 4 - %arrayidx1 = getelementptr inbounds i32* %2, i32 0 - store i32 1, i32* %arrayidx1 - br label %if.end - -if.end: ; preds = %if.else, %if.then - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void (i32*)* @undefined} diff --git a/backend/kernels/vector_constant.cl b/backend/kernels/vector_constant.cl new file mode 100644 index 00000000..fc9d308f --- /dev/null +++ b/backend/kernels/vector_constant.cl @@ -0,0 +1,7 @@ +#include "stdlib.h" + +__kernel void simple_float4(__global float4 *dst, __global float4 *src) +{ + dst[get_global_id(0)] = src[get_global_id(0)] + (float4)(0.f,1.f,2.f,3.f); +} + diff --git a/backend/kernels/vector_constant.cl.ll b/backend/kernels/vector_constant.cl.ll new file mode 100644 index 00000000..de644cd3 --- /dev/null +++ b/backend/kernels/vector_constant.cl.ll @@ -0,0 +1,84 @@ +; ModuleID = 'vector_constant.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @simple_float4(<4 x float> addrspace(1)* nocapture %dst, <4 x float> addrspace(1)* nocapture %src) nounwind noinline { +get_global_id.exit5: + %call.i = tail call ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + %arrayidx = getelementptr inbounds <4 x float> addrspace(1)* %src, i32 %call.i + %0 = load <4 x float> addrspace(1)* %arrayidx, align 16, !tbaa !1 + %add = fadd <4 x float> %0, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00> + %arrayidx2 = getelementptr inbounds <4 x float> addrspace(1)* %dst, i32 %call.i + store <4 x float> %add, <4 x float> addrspace(1)* %arrayidx2, align 16, !tbaa !1 + ret void +} + +declare ptx_device i32 @__gen_ocl_get_global_id0() nounwind readnone + +!opencl.kernels = !{!0} + +!0 = metadata !{void (<4 x float> addrspace(1)*, <4 x float> addrspace(1)*)* @simple_float4} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/backend/kernels/void.cl.ll b/backend/kernels/void.cl.ll new file mode 100644 index 00000000..151fc1cb --- /dev/null +++ b/backend/kernels/void.cl.ll @@ -0,0 +1,74 @@ +; ModuleID = 'void.cl.o' +target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" +target triple = "ptx32--" + +define ptx_device <2 x float> @_Z3madDv2_fS_S_(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone { +entry: + %0 = extractelement <2 x float> %a, i32 0 + %1 = extractelement <2 x float> %b, i32 0 + %2 = extractelement <2 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <2 x float> undef, float %call, i32 0 + %3 = extractelement <2 x float> %a, i32 1 + %4 = extractelement <2 x float> %b, i32 1 + %5 = extractelement <2 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <2 x float> %vecinit, float %call1, i32 1 + ret <2 x float> %vecinit2 +} + +declare ptx_device float @_Z3madfff(float, float, float) nounwind readnone + +define ptx_device <3 x float> @_Z3madDv3_fS_S_(<3 x float> %a, <3 x float> %b, <3 x float> %c) nounwind readnone { +entry: + %0 = extractelement <3 x float> %a, i32 0 + %1 = extractelement <3 x float> %b, i32 0 + %2 = extractelement <3 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <3 x float> undef, float %call, i32 0 + %3 = extractelement <3 x float> %a, i32 1 + %4 = extractelement <3 x float> %b, i32 1 + %5 = extractelement <3 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <3 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <3 x float> %a, i32 2 + %7 = extractelement <3 x float> %b, i32 2 + %8 = extractelement <3 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <3 x float> %vecinit2, float %call3, i32 2 + ret <3 x float> %vecinit4 +} + +define ptx_device <4 x float> @_Z3madDv4_fS_S_(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone { +entry: + %0 = extractelement <4 x float> %a, i32 0 + %1 = extractelement <4 x float> %b, i32 0 + %2 = extractelement <4 x float> %c, i32 0 + %call = tail call ptx_device float @_Z3madfff(float %0, float %1, float %2) nounwind readnone + %vecinit = insertelement <4 x float> undef, float %call, i32 0 + %3 = extractelement <4 x float> %a, i32 1 + %4 = extractelement <4 x float> %b, i32 1 + %5 = extractelement <4 x float> %c, i32 1 + %call1 = tail call ptx_device float @_Z3madfff(float %3, float %4, float %5) nounwind readnone + %vecinit2 = insertelement <4 x float> %vecinit, float %call1, i32 1 + %6 = extractelement <4 x float> %a, i32 2 + %7 = extractelement <4 x float> %b, i32 2 + %8 = extractelement <4 x float> %c, i32 2 + %call3 = tail call ptx_device float @_Z3madfff(float %6, float %7, float %8) nounwind readnone + %vecinit4 = insertelement <4 x float> %vecinit2, float %call3, i32 2 + %9 = extractelement <4 x float> %a, i32 3 + %10 = extractelement <4 x float> %b, i32 3 + %11 = extractelement <4 x float> %c, i32 3 + %call5 = tail call ptx_device float @_Z3madfff(float %9, float %10, float %11) nounwind readnone + %vecinit6 = insertelement <4 x float> %vecinit4, float %call5, i32 3 + ret <4 x float> %vecinit6 +} + +define ptx_kernel void @hop() nounwind readnone noinline { +entry: + ret void +} + +!opencl.kernels = !{!0} + +!0 = metadata !{void ()* @hop} diff --git a/backend/kernels/void.ll b/backend/kernels/void.ll deleted file mode 100644 index 3c6c269b..00000000 --- a/backend/kernels/void.ll +++ /dev/null @@ -1,12 +0,0 @@ -; ModuleID = 'void.o' -target datalayout = "e-p:32:32-i64:64:64-f64:64:64-n1:8:16:32:64" -target triple = "ptx32--" - -define ptx_kernel void @hop() nounwind readnone noinline { -entry: - ret void -} - -!opencl.kernels = !{!0} - -!0 = metadata !{void ()* @hop} |