diff options
Diffstat (limited to 'generic')
-rw-r--r-- | generic/include/clc/clc.h | 4 | ||||
-rw-r--r-- | generic/include/clc/integer/integer-gentype.inc | 39 | ||||
-rw-r--r-- | generic/include/clc/integer/mad24.h | 3 | ||||
-rw-r--r-- | generic/include/clc/integer/mad24.inc | 1 | ||||
-rw-r--r-- | generic/include/clc/integer/mul24.h | 3 | ||||
-rw-r--r-- | generic/include/clc/integer/mul24.inc | 1 | ||||
-rw-r--r-- | generic/include/clc/integer/upsample.h | 25 | ||||
-rw-r--r-- | generic/include/clc/relational/bitselect.h | 1 | ||||
-rw-r--r-- | generic/lib/SOURCES | 5 | ||||
-rw-r--r-- | generic/lib/integer/mad24.cl | 4 | ||||
-rw-r--r-- | generic/lib/integer/mad24.inc | 3 | ||||
-rw-r--r-- | generic/lib/integer/mul24.cl | 4 | ||||
-rw-r--r-- | generic/lib/integer/mul24.inc | 11 | ||||
-rw-r--r-- | generic/lib/integer/upsample.cl | 34 | ||||
-rw-r--r-- | generic/lib/shared/vload.cl | 64 | ||||
-rw-r--r-- | generic/lib/shared/vload_if.ll | 60 | ||||
-rw-r--r-- | generic/lib/shared/vload_impl.ll | 50 | ||||
-rw-r--r-- | generic/lib/shared/vstore.cl | 80 | ||||
-rw-r--r-- | generic/lib/shared/vstore_if.ll | 59 | ||||
-rw-r--r-- | generic/lib/shared/vstore_impl.ll | 41 |
20 files changed, 191 insertions, 301 deletions
diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h index 10d30e0..9a2f443 100644 --- a/generic/include/clc/clc.h +++ b/generic/include/clc/clc.h @@ -64,8 +64,11 @@ #include <clc/integer/abs_diff.h> #include <clc/integer/add_sat.h> #include <clc/integer/clz.h> +#include <clc/integer/mad24.h> +#include <clc/integer/mul24.h> #include <clc/integer/rotate.h> #include <clc/integer/sub_sat.h> +#include <clc/integer/upsample.h> /* 6.11.2 and 6.11.3 Shared Integer/Math Functions */ #include <clc/shared/clamp.h> @@ -82,6 +85,7 @@ /* 6.11.6 Relational Functions */ #include <clc/relational/any.h> +#include <clc/relational/bitselect.h> #include <clc/relational/select.h> /* 6.11.8 Synchronization Functions */ diff --git a/generic/include/clc/integer/integer-gentype.inc b/generic/include/clc/integer/integer-gentype.inc new file mode 100644 index 0000000..6470eb3 --- /dev/null +++ b/generic/include/clc/integer/integer-gentype.inc @@ -0,0 +1,39 @@ +#define __CLC_GENTYPE int +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int2 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int4 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int8 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE int16 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint2 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint4 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint8 +#include __CLC_BODY +#undef __CLC_GENTYPE + +#define __CLC_GENTYPE uint16 +#include __CLC_BODY +#undef __CLC_GENTYPE diff --git a/generic/include/clc/integer/mad24.h b/generic/include/clc/integer/mad24.h new file mode 100644 index 0000000..0c120fa --- /dev/null +++ b/generic/include/clc/integer/mad24.h @@ -0,0 +1,3 @@ +#define __CLC_BODY <clc/integer/mad24.inc> +#include <clc/integer/integer-gentype.inc> +#undef 
__CLC_BODY diff --git a/generic/include/clc/integer/mad24.inc b/generic/include/clc/integer/mad24.inc new file mode 100644 index 0000000..81fe0c2 --- /dev/null +++ b/generic/include/clc/integer/mad24.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z); diff --git a/generic/include/clc/integer/mul24.h b/generic/include/clc/integer/mul24.h new file mode 100644 index 0000000..4f97098 --- /dev/null +++ b/generic/include/clc/integer/mul24.h @@ -0,0 +1,3 @@ +#define __CLC_BODY <clc/integer/mul24.inc> +#include <clc/integer/integer-gentype.inc> +#undef __CLC_BODY diff --git a/generic/include/clc/integer/mul24.inc b/generic/include/clc/integer/mul24.inc new file mode 100644 index 0000000..8cbf7c1 --- /dev/null +++ b/generic/include/clc/integer/mul24.inc @@ -0,0 +1 @@ +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y); diff --git a/generic/include/clc/integer/upsample.h b/generic/include/clc/integer/upsample.h new file mode 100644 index 0000000..127debf --- /dev/null +++ b/generic/include/clc/integer/upsample.h @@ -0,0 +1,25 @@ +#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE) \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo); + +#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE) \ + __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8); \ + __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16); \ + +#define __CLC_UPSAMPLE_TYPES() \ + __CLC_UPSAMPLE_VEC(short, char, uchar) \ + __CLC_UPSAMPLE_VEC(ushort, uchar, uchar) \ + __CLC_UPSAMPLE_VEC(int, short, ushort) \ + __CLC_UPSAMPLE_VEC(uint, ushort, ushort) \ + __CLC_UPSAMPLE_VEC(long, int, uint) \ + __CLC_UPSAMPLE_VEC(ulong, uint, uint) \ + 
+__CLC_UPSAMPLE_TYPES() + +#undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_DECL +#undef __CLC_UPSAMPLE_VEC + diff --git a/generic/include/clc/relational/bitselect.h b/generic/include/clc/relational/bitselect.h new file mode 100644 index 0000000..e91cbfd --- /dev/null +++ b/generic/include/clc/relational/bitselect.h @@ -0,0 +1 @@ +#define bitselect(x, y, z) ((x) ^ ((z) & ((y) ^ (x)))) diff --git a/generic/lib/SOURCES b/generic/lib/SOURCES index 8cda14a..9ac08bd 100644 --- a/generic/lib/SOURCES +++ b/generic/lib/SOURCES @@ -11,10 +11,13 @@ integer/add_sat_impl.ll integer/clz.cl integer/clz_if.ll integer/clz_impl.ll +integer/mad24.cl +integer/mul24.cl integer/rotate.cl integer/sub_sat.cl integer/sub_sat_if.ll integer/sub_sat_impl.ll +integer/upsample.cl math/fmax.cl math/fmin.cl math/hypot.cl @@ -24,10 +27,8 @@ shared/clamp.cl shared/max.cl shared/min.cl shared/vload.cl -shared/vload_if.ll shared/vload_impl.ll shared/vstore.cl -shared/vstore_if.ll shared/vstore_impl.ll workitem/get_global_id.cl workitem/get_global_size.cl diff --git a/generic/lib/integer/mad24.cl b/generic/lib/integer/mad24.cl new file mode 100644 index 0000000..e29e99f --- /dev/null +++ b/generic/lib/integer/mad24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mad24.inc> +#include <clc/integer/integer-gentype.inc> diff --git a/generic/lib/integer/mad24.inc b/generic/lib/integer/mad24.inc new file mode 100644 index 0000000..902b0aa --- /dev/null +++ b/generic/lib/integer/mad24.inc @@ -0,0 +1,3 @@ +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){ + return mul24(x, y) + z; +} diff --git a/generic/lib/integer/mul24.cl b/generic/lib/integer/mul24.cl new file mode 100644 index 0000000..8aedca6 --- /dev/null +++ b/generic/lib/integer/mul24.cl @@ -0,0 +1,4 @@ +#include <clc/clc.h> + +#define __CLC_BODY <mul24.inc> +#include <clc/integer/integer-gentype.inc> diff --git a/generic/lib/integer/mul24.inc b/generic/lib/integer/mul24.inc new file 
mode 100644 index 0000000..95a2f1d --- /dev/null +++ b/generic/lib/integer/mul24.inc @@ -0,0 +1,11 @@ + +// We need to use shifts here in order to maintain the sign bit for signed +// integers. The compiler should optimize this to (x & 0x00FFFFFF) for +// unsigned integers. +#define CONVERT_TO_24BIT(x) (((x) << 8) >> 8) + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){ + return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y); +} + +#undef CONVERT_TO_24BIT diff --git a/generic/lib/integer/upsample.cl b/generic/lib/integer/upsample.cl new file mode 100644 index 0000000..7301cc3 --- /dev/null +++ b/generic/lib/integer/upsample.cl @@ -0,0 +1,34 @@ +#include <clc/clc.h> + +#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ \ + return ((BGENTYPE)hi << GENSIZE) | lo; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##2 upsample(GENTYPE##2 hi, UGENTYPE##2 lo){ \ + return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ \ + return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ \ + return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ \ + return (BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + _CLC_OVERLOAD _CLC_DECL BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ \ + return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \ + } \ + +#define __CLC_UPSAMPLE_TYPES() \ + __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \ + __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \ + __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \ + 
__CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \ + +__CLC_UPSAMPLE_TYPES() + +#undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_IMPL diff --git a/generic/lib/shared/vload.cl b/generic/lib/shared/vload.cl index 4dd7918..6793072 100644 --- a/generic/lib/shared/vload.cl +++ b/generic/lib/shared/vload.cl @@ -2,23 +2,23 @@ #define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 vload2(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##2)(x[offset] , x[offset+1]); \ + return (PRIM_TYPE##2)(x[2*offset] , x[2*offset+1]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##3 vload3(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##3)(x[offset] , x[offset+1], x[offset+2]); \ + return (PRIM_TYPE##3)(x[3*offset] , x[3*offset+1], x[3*offset+2]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##4 vload4(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##4)(x[offset], x[offset+1], x[offset+2], x[offset+3]); \ + return (PRIM_TYPE##4)(x[4*offset], x[4*offset+1], x[4*offset+2], x[4*offset+3]); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##8 vload8(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##8)(vload4(offset, x), vload4(offset+4, x)); \ + return (PRIM_TYPE##8)(vload4(0, &x[8*offset]), vload4(1, &x[8*offset])); \ } \ \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##16 vload16(size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ - return (PRIM_TYPE##16)(vload8(offset, x), vload8(offset+8, x)); \ + return (PRIM_TYPE##16)(vload8(0, &x[16*offset]), vload8(1, &x[16*offset])); \ } \ #define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ @@ -27,12 +27,13 @@ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ -//int/uint are special... 
see below #define VLOAD_TYPES() \ VLOAD_ADDR_SPACES(char) \ VLOAD_ADDR_SPACES(uchar) \ VLOAD_ADDR_SPACES(short) \ VLOAD_ADDR_SPACES(ushort) \ + VLOAD_ADDR_SPACES(int) \ + VLOAD_ADDR_SPACES(uint) \ VLOAD_ADDR_SPACES(long) \ VLOAD_ADDR_SPACES(ulong) \ VLOAD_ADDR_SPACES(float) \ @@ -43,54 +44,3 @@ VLOAD_TYPES() #pragma OPENCL EXTENSION cl_khr_fp64 : enable VLOAD_ADDR_SPACES(double) #endif - -VLOAD_VECTORIZE(int, __private) -VLOAD_VECTORIZE(int, __local) -VLOAD_VECTORIZE(int, __constant) -VLOAD_VECTORIZE(uint, __private) -VLOAD_VECTORIZE(uint, __local) -VLOAD_VECTORIZE(uint, __constant) - -_CLC_OVERLOAD _CLC_DEF int2 vload2(size_t offset, const global int *x) { - return (int2)(x[offset] , x[offset+1]); -} -_CLC_OVERLOAD _CLC_DEF int3 vload3(size_t offset, const global int *x) { - return (int3)(vload2(offset, x), x[offset+2]); -} -_CLC_OVERLOAD _CLC_DEF uint2 vload2(size_t offset, const global uint *x) { - return (uint2)(x[offset] , x[offset+1]); -} -_CLC_OVERLOAD _CLC_DEF uint3 vload3(size_t offset, const global uint *x) { - return (uint3)(vload2(offset, x), x[offset+2]); -} - -/*Note: It is known that R600 doesn't support load <2 x ?> and <3 x ?>... 
so - * they aren't actually overridden here - */ -_CLC_DECL int4 __clc_vload4_int__global(size_t offset, const __global int *); -_CLC_DECL int8 __clc_vload8_int__global(size_t offset, const __global int *); -_CLC_DECL int16 __clc_vload16_int__global(size_t offset, const __global int *); - -_CLC_OVERLOAD _CLC_DEF int4 vload4(size_t offset, const global int *x) { - return __clc_vload4_int__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF int8 vload8(size_t offset, const global int *x) { - return __clc_vload8_int__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF int16 vload16(size_t offset, const global int *x) { - return __clc_vload16_int__global(offset, x); -} - -_CLC_DECL uint4 __clc_vload4_uint__global(size_t offset, const __global uint *); -_CLC_DECL uint8 __clc_vload8_uint__global(size_t offset, const __global uint *); -_CLC_DECL uint16 __clc_vload16_uint__global(size_t offset, const __global uint *); - -_CLC_OVERLOAD _CLC_DEF uint4 vload4(size_t offset, const global uint *x) { - return __clc_vload4_uint__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF uint8 vload8(size_t offset, const global uint *x) { - return __clc_vload8_uint__global(offset, x); -} -_CLC_OVERLOAD _CLC_DEF uint16 vload16(size_t offset, const global uint *x) { - return __clc_vload16_uint__global(offset, x); -} diff --git a/generic/lib/shared/vload_if.ll b/generic/lib/shared/vload_if.ll deleted file mode 100644 index 2634d37..0000000 --- a/generic/lib/shared/vload_if.ll +++ /dev/null @@ -1,60 +0,0 @@ -;Start int global vload - -declare <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) -declare <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) -declare <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) -declare <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) -declare <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - -define <2 x i32> @__clc_vload2_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> 
@__clc_vload2_impl_i32__global(i32 %x, i32 %y) - ret <2 x i32> %call -} - -define <3 x i32> @__clc_vload3_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) - ret <3 x i32> %call -} - -define <4 x i32> @__clc_vload4_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) - ret <4 x i32> %call -} - -define <8 x i32> @__clc_vload8_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) - ret <8 x i32> %call -} - -define <16 x i32> @__clc_vload16_int__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - ret <16 x i32> %call -} - - -;Start uint global vload - -define <2 x i32> @__clc_vload2_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <2 x i32> @__clc_vload2_impl_i32__global(i32 %x, i32 %y) - ret <2 x i32> %call -} - -define <3 x i32> @__clc_vload3_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <3 x i32> @__clc_vload3_impl_i32__global(i32 %x, i32 %y) - ret <3 x i32> %call -} - -define <4 x i32> @__clc_vload4_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <4 x i32> @__clc_vload4_impl_i32__global(i32 %x, i32 %y) - ret <4 x i32> %call -} - -define <8 x i32> @__clc_vload8_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <8 x i32> @__clc_vload8_impl_i32__global(i32 %x, i32 %y) - ret <8 x i32> %call -} - -define <16 x i32> @__clc_vload16_uint__global(i32 %x, i32 %y) nounwind readonly alwaysinline { - %call = call <16 x i32> @__clc_vload16_impl_i32__global(i32 %x, i32 %y) - ret <16 x i32> %call -} diff --git a/generic/lib/shared/vload_impl.ll b/generic/lib/shared/vload_impl.ll index ae719e0..2e70e5f 100644 --- a/generic/lib/shared/vload_impl.ll +++ 
b/generic/lib/shared/vload_impl.ll @@ -1,43 +1,33 @@ ; This provides optimized implementations of vload4/8/16 for 32-bit int/uint -define <2 x i32> @__clc_vload2_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - %4 = load <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <2 x i32> %4 +define <2 x i32> @__clc_vload2_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)* + %2 = load <2 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <2 x i32> %2 } -define <3 x i32> @__clc_vload3_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)* - %4 = load <3 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <3 x i32> %4 +define <3 x i32> @__clc_vload3_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)* + %2 = load <3 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <3 x i32> %2 } -define <4 x i32> @__clc_vload4_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* - %4 = load <4 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <4 x i32> %4 +define <4 x i32> @__clc_vload4_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)* + %2 = load <4 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <4 x i32> %2 } -define <8 x i32> @__clc_vload8_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind 
readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* - %4 = load <8 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <8 x i32> %4 +define <8 x i32> @__clc_vload8_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)* + %2 = load <8 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <8 x i32> %2 } -define <16 x i32> @__clc_vload16_impl_i32__global(i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* - %4 = load <16 x i32> addrspace(1)* %3, align 4, !tbaa !3 - ret <16 x i32> %4 +define <16 x i32> @__clc_vload16_i32__addr1(i32 addrspace(1)* nocapture %addr) nounwind readonly alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)* + %2 = load <16 x i32> addrspace(1)* %1, align 4, !tbaa !3 + ret <16 x i32> %2 } !1 = metadata !{metadata !"char", metadata !5} diff --git a/generic/lib/shared/vstore.cl b/generic/lib/shared/vstore.cl index 17c2c4c..f6d360e 100644 --- a/generic/lib/shared/vstore.cl +++ b/generic/lib/shared/vstore.cl @@ -4,29 +4,29 @@ #define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ _CLC_OVERLOAD _CLC_DEF void vstore2(PRIM_TYPE##2 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - mem[offset] = vec.s0; \ - mem[offset+1] = vec.s1; \ + mem[2*offset] = vec.s0; \ + mem[2*offset+1] = vec.s1; \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore3(PRIM_TYPE##3 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - mem[offset] = vec.s0; \ - mem[offset+1] = vec.s1; \ - mem[offset+2] = vec.s2; \ + mem[3*offset] = vec.s0; \ + mem[3*offset+1] = vec.s1; \ + mem[3*offset+2] = vec.s2; \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore4(PRIM_TYPE##4 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore2(vec.lo, offset, mem); \ - 
vstore2(vec.hi, offset+2, mem); \ + vstore2(vec.lo, 0, &mem[offset*4]); \ + vstore2(vec.hi, 1, &mem[offset*4]); \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore8(PRIM_TYPE##8 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore4(vec.lo, offset, mem); \ - vstore4(vec.hi, offset+4, mem); \ + vstore4(vec.lo, 0, &mem[offset*8]); \ + vstore4(vec.hi, 1, &mem[offset*8]); \ } \ \ _CLC_OVERLOAD _CLC_DEF void vstore16(PRIM_TYPE##16 vec, size_t offset, ADDR_SPACE PRIM_TYPE *mem) { \ - vstore8(vec.lo, offset, mem); \ - vstore8(vec.hi, offset+8, mem); \ + vstore8(vec.lo, 0, &mem[offset*16]); \ + vstore8(vec.hi, 1, &mem[offset*16]); \ } \ #define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ @@ -34,12 +34,13 @@ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ -//int/uint are special... see below #define VSTORE_TYPES() \ VSTORE_ADDR_SPACES(char) \ VSTORE_ADDR_SPACES(uchar) \ VSTORE_ADDR_SPACES(short) \ VSTORE_ADDR_SPACES(ushort) \ + VSTORE_ADDR_SPACES(int) \ + VSTORE_ADDR_SPACES(uint) \ VSTORE_ADDR_SPACES(long) \ VSTORE_ADDR_SPACES(ulong) \ VSTORE_ADDR_SPACES(float) \ @@ -50,58 +51,3 @@ VSTORE_TYPES() #pragma OPENCL EXTENSION cl_khr_fp64 : enable VSTORE_ADDR_SPACES(double) #endif - -VSTORE_VECTORIZE(int, __private) -VSTORE_VECTORIZE(int, __local) -VSTORE_VECTORIZE(uint, __private) -VSTORE_VECTORIZE(uint, __local) - -_CLC_OVERLOAD _CLC_DEF void vstore2(int2 vec, size_t offset, global int *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; -} -_CLC_OVERLOAD _CLC_DEF void vstore3(int3 vec, size_t offset, global int *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; - mem[offset+2] = vec.s2; -} -_CLC_OVERLOAD _CLC_DEF void vstore2(uint2 vec, size_t offset, global uint *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; -} -_CLC_OVERLOAD _CLC_DEF void vstore3(uint3 vec, size_t offset, global uint *mem) { - mem[offset] = vec.s0; - mem[offset+1] = vec.s1; - mem[offset+2] = vec.s2; -} - -/*Note: R600 
probably doesn't support store <2 x ?> and <3 x ?>... so - * they aren't actually overridden here... lowest-common-denominator - */ -_CLC_DECL void __clc_vstore4_int__global(int4 vec, size_t offset, __global int *); -_CLC_DECL void __clc_vstore8_int__global(int8 vec, size_t offset, __global int *); -_CLC_DECL void __clc_vstore16_int__global(int16 vec, size_t offset, __global int *); - -_CLC_OVERLOAD _CLC_DEF void vstore4(int4 vec, size_t offset, global int *x) { - __clc_vstore4_int__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore8(int8 vec, size_t offset, global int *x) { - __clc_vstore8_int__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore16(int16 vec, size_t offset, global int *x) { - __clc_vstore16_int__global(vec, offset, x); -} - -_CLC_DECL void __clc_vstore4_uint__global(uint4 vec, size_t offset, __global uint *); -_CLC_DECL void __clc_vstore8_uint__global(uint8 vec, size_t offset, __global uint *); -_CLC_DECL void __clc_vstore16_uint__global(uint16 vec, size_t offset, __global uint *); - -_CLC_OVERLOAD _CLC_DEF void vstore4(uint4 vec, size_t offset, global uint *x) { - __clc_vstore4_uint__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore8(uint8 vec, size_t offset, global uint *x) { - __clc_vstore8_uint__global(vec, offset, x); -} -_CLC_OVERLOAD _CLC_DEF void vstore16(uint16 vec, size_t offset, global uint *x) { - __clc_vstore16_uint__global(vec, offset, x); -} diff --git a/generic/lib/shared/vstore_if.ll b/generic/lib/shared/vstore_if.ll deleted file mode 100644 index 30eb552..0000000 --- a/generic/lib/shared/vstore_if.ll +++ /dev/null @@ -1,59 +0,0 @@ -;Start int global vstore - -declare void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) -declare void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) -declare void 
@__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - -define void @__clc_vstore2_int__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore3_int__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore4_int__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore8_int__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore16_int__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - ret void -} - - -;Start uint global vstore -define void @__clc_vstore2_uint__global(<2 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore3_uint__global(<3 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore4_uint__global(<4 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore8_uint__global(<8 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %x, i32 %y) - ret void -} - -define void @__clc_vstore16_uint__global(<16 x i32> %vec, i32 %x, i32 %y) nounwind alwaysinline { - call void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %x, i32 %y) - ret void -}
\ No newline at end of file diff --git a/generic/lib/shared/vstore_impl.ll b/generic/lib/shared/vstore_impl.ll index 3baab5e..388bce2 100644 --- a/generic/lib/shared/vstore_impl.ll +++ b/generic/lib/shared/vstore_impl.ll @@ -1,46 +1,35 @@ ; This provides optimized implementations of vstore4/8/16 for 32-bit int/uint -define void @__clc_vstore2_impl_i32__global(<2 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <2 x i32> addrspace(1)* - store <2 x i32> %vec, <2 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore2_i32__addr1(<2 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <2 x i32> addrspace(1)* + store <2 x i32> %vec, <2 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore3_impl_i32__global(<3 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <3 x i32> addrspace(1)* - store <3 x i32> %vec, <3 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore3_i32__addr1(<3 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <3 x i32> addrspace(1)* + store <3 x i32> %vec, <3 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore4_impl_i32__global(<4 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <4 x i32> addrspace(1)* - store <4 x i32> %vec, <4 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore4_i32__addr1(<4 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <4 x i32> addrspace(1)* 
+ store <4 x i32> %vec, <4 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore8_impl_i32__global(<8 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <8 x i32> addrspace(1)* - store <8 x i32> %vec, <8 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore8_i32__addr1(<8 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <8 x i32> addrspace(1)* + store <8 x i32> %vec, <8 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } -define void @__clc_vstore16_impl_i32__global(<16 x i32> %vec, i32 %offset, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { - %1 = ptrtoint i32 addrspace(1)* %addr to i32 - %2 = add i32 %1, %offset - %3 = inttoptr i32 %2 to <16 x i32> addrspace(1)* - store <16 x i32> %vec, <16 x i32> addrspace(1)* %3, align 4, !tbaa !3 +define void @__clc_vstore16_i32__addr1(<16 x i32> %vec, i32 addrspace(1)* nocapture %addr) nounwind alwaysinline { + %1 = bitcast i32 addrspace(1)* %addr to <16 x i32> addrspace(1)* + store <16 x i32> %vec, <16 x i32> addrspace(1)* %1, align 4, !tbaa !3 ret void } - !1 = metadata !{metadata !"char", metadata !5} !2 = metadata !{metadata !"short", metadata !5} !3 = metadata !{metadata !"int", metadata !5} |