author     Ruiling Song <ruiling.song@intel.com>    2014-03-19 11:41:54 +0800
committer  Zhigang Gong <zhigang.gong@intel.com>    2014-03-25 13:20:47 +0800
commit     eeefb77c77920d66834bbced01c002604e5d4f66 (patch)
tree       76d5ed7d2cc5de1046cd07edec96ebe5b2eeb6f1 /backend
parent     c8830424f2ae811a1fbc490c4752e156928b02c5 (diff)
GBE: make byte/short vload/vstore process one element at a time.
Per the OpenCL spec, the computed address (p + offset*n) in vloadn and vstoren is only 8-bit aligned for char and 16-bit aligned for short. That is, we cannot assume that vload4 with a char pointer is 4-byte aligned. The previous implementation made Clang generate a load or store with alignment 4 that is in fact only alignment 1. We need to find another way to optimize vloadn, but until then, let's keep vloadn and vstoren working correctly. This fixes the regression caused by the byte/short optimization.

Signed-off-by: Ruiling Song <ruiling.song@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
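To make the alignment hazard concrete, here is a minimal kernel sketch (the kernel name and arguments are illustrative, not part of this patch): src + 1 is only 1-byte aligned, yet the spec requires vload4/vstore4 to accept it, so the library must not emit 4-byte-aligned accesses for it.

  __kernel void copy4_unaligned(__global const char *src, __global char *dst)
  {
      /* p + offset*4 is src + 1 here: only 1-byte aligned */
      char4 v = vload4(0, src + 1);
      vstore4(v, 0, dst + 1);
  }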
Diffstat (limited to 'backend')
-rwxr-xr-x  backend/src/ocl_stdlib.tmpl.h  60
1 file changed, 56 insertions(+), 4 deletions(-)
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index e3ac6324..25f2ff74 100755
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -3882,10 +3882,59 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
-DECL_UNTYPED_RW_ALL(char)
-DECL_UNTYPED_RW_ALL(uchar)
-DECL_UNTYPED_RW_ALL(short)
-DECL_UNTYPED_RW_ALL(ushort)
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
+} \
+INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
+} \
+INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
+} \
+INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+ return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 2 * offset) = v.s0; \
+ *(p + 2 * offset + 1) = v.s1; \
+} \
+INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
+ *(p + 3 * offset) = v.s0; \
+ *(p + 3 * offset + 1) = v.s1; \
+ *(p + 3 * offset + 2) = v.s2; \
+} \
+INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+ vstore2(v.lo, 2*offset, p); \
+ vstore2(v.hi, 2*offset, p+2); \
+} \
+INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+ vstore4(v.lo, 2*offset, p); \
+ vstore4(v.hi, 2*offset, p+4); \
+} \
+INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+ vstore8(v.lo, 2*offset, p); \
+ vstore8(v.hi, 2*offset, p+8); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+ DECL_BYTE_RD_SPACE(TYPE, __global) \
+ DECL_BYTE_RD_SPACE(TYPE, __local) \
+ DECL_BYTE_RD_SPACE(TYPE, __private) \
+ DECL_BYTE_RD_SPACE(TYPE, __constant) \
+ DECL_BYTE_WR_SPACE(TYPE, __global) \
+ DECL_BYTE_WR_SPACE(TYPE, __local) \
+ DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
DECL_UNTYPED_RW_ALL(int)
DECL_UNTYPED_RW_ALL(uint)
DECL_UNTYPED_RW_ALL(long)
@@ -3900,6 +3949,9 @@ DECL_UNTYPED_RW_ALL(double)
#undef DECL_UNTYPED_RD_SPACE_N
#undef DECL_UNTYPED_V3_SPACE
#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
PURE CONST float __gen_ocl_f16to32(short h);
PURE CONST short __gen_ocl_f32to16(float f);
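For reference, with this patch vload4 on a __global char pointer unfolds, via the vload2 calls above, into four independent 1-byte reads, so no 4-byte-aligned access is ever emitted. A rough sketch of the fully expanded form (illustrative only, not verbatim preprocessor output):

  /* vload4(offset, p): reads p[4*offset] .. p[4*offset + 3], one byte at a time */
  char4 v = (char4)((char2)(*(p + 4*offset),     *(p + 4*offset + 1)),
                    (char2)(*(p + 4*offset + 2), *(p + 4*offset + 3)));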