diff options
-rw-r--r-- | generic/lib/shared/vload_if.ll | 36 | ||||
-rw-r--r-- | generic/lib/shared/vload_impl.ll | 51 |
2 files changed, 21 insertions, 66 deletions
diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll index a3575e0..2634d37 100644 --- a/generic/lib/shared/vload_if.ll +++ b/generic/lib/shared/vload_if.ll @@ -1,64 +1,60 @@ ;Start int global vload -declare <2 x i32> @__clc_vload2_impl_int__global(i32 %x, i32 %y) +declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) +declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) +declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) +declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) +declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) + define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> @__clc_vload2_impl_int__global(i32 %x, i32 %y) + %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) ret <2 x i32> %call } -declare <3 x i32> @__clc_vload3_impl_int__global(i32 %x, i32 %y) define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_int__global(i32 %x, i32 %y) + %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) ret <3 x i32> %call } -declare <4 x i32> @__clc_vload4_impl_int__global(i32 %x, i32 %y) define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_int__global(i32 %x, i32 %y) + %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) ret <4 x i32> %call } -declare <8 x i32> @__clc_vload8_impl_int__global(i32 %x, i32 %y) define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_int__global(i32 %x, i32 %y) + %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) ret <8 x i32> %call } -declare <16 x i32> @__clc_vload16_impl_int__global(i32 %x, i32 %y) define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_int__global(i32 %x, i32 %y) + %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) ret <16 x i32> %call } ;Start uint global vload -declare <2 x i32> @__clc_vload2_impl_uint__global(i32 %x, i32 %y) define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> @__clc_vload2_impl_uint__global(i32 %x, i32 %y) + %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) ret <2 x i32> %call } -declare <3 x i32> @__clc_vload3_impl_uint__global(i32 %x, i32 %y) define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_uint__global(i32 %x, i32 %y) + %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) ret <3 x i32> %call } -declare <4 x i32> @__clc_vload4_impl_uint__global(i32 %x, i32 %y) define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_uint__global(i32 %x, i32 %y) + %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) ret <4 x i32> %call } -declare <8 x i32> @__clc_vload8_impl_uint__global(i32 %x, i32 %y) define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_uint__global(i32 %x, i32 %y) + %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) ret <8 x i32> %call } -declare <16 x i32> @__clc_vload16_impl_uint__global(i32 %x, i32 %y) define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_uint__global(i32 %x, i32 %y) + %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) ret <16 x i32> %call } diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll index 1d293ed..ae719e0 100644 --- a/generic/lib/shared/vload_impl.ll +++ b/generic/lib/shared/vload_impl.ll @@ -1,6 +1,6 @@ ; This provides optimized implementations of vload4/8/16 for 32-bit int/uint -define <2 x i32> @__clc_vload2_impl_int__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { +define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { %1 = ptrtoint i32 addrspace(1)* %addr to i32 %2 = add i32 %1, %offset %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* @@ -8,7 +8,7 @@ define <2 x i32> @__clc_vload2_impl_int__global(i32 %offset, i32 addrspace(1)* ret <2 x i32> %4 } -define <3 x i32> @__clc_vload3_impl_int__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { +define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { %1 = ptrtoint i32 addrspace(1)* %addr to i32 %2 = add i32 %1, %offset %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)* @@ -16,7 +16,7 @@ define <3 x i32> @__clc_vload3_impl_int__global(i32 %offset, i32 addrspace(1)* ret <3 x i32> %4 } -define <4 x i32> @__clc_vload4_impl_int__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { +define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { %1 = ptrtoint i32 addrspace(1)* %addr to i32 %2 = add i32 %1, %offset %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* @@ -24,7 +24,7 @@ define <4 x i32> @__clc_vload4_impl_int__global(i32 %offset, i32 addrspace(1)* ret <4 x i32> %4 } -define <8 x i32> @__clc_vload8_impl_int__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { +define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { %1 = ptrtoint i32 addrspace(1)* %addr to i32 %2 = add i32 %1, %offset %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* @@ -32,7 +32,7 @@ define <8 x i32> @__clc_vload8_impl_int__global(i32 %offset, i32 addrspace(1)* ret <8 x i32> %4 } -define <16 x i32> @__clc_vload16_impl_int__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { +define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { %1 = ptrtoint i32 addrspace(1)* %addr to i32 %2 = add i32 %1, %offset %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* @@ -40,47 +40,6 @@ define <16 x i32> @__clc_vload16_impl_int__global(i32 %offset, i32 addrspace(1) ret <16 x i32> %4 } -define <2 x i32> @__clc_vload2_impl_uint__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <2 x i32> %4 -} - -define <2 x i32> @__clc_vload3_impl_uint__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <2 x i32> %4 -} - -define <4 x i32> @__clc_vload4_impl_uint__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* - %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <4 x i32> %4 -} - -define <8 x i32> @__clc_vload8_impl_uint__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* - %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <8 x i32> %4 -} - -define <16 x i32> @__clc_vload16_impl_uint__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* - %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <16 x i32> %4 -} - - !1 = metadata !{metadata !"char", metadata !5} !2 = metadata !{metadata !"short", metadata !5} !3 = metadata !{metadata !"int", metadata !5} |