WIP st/mesa: memcpy drop-ins for readpixelsreadpixels

author: Nicolai Hähnle <nicolai.haehnle@amd.com> 2016-06-15 11:06:38 +0200
committer: Nicolai Hähnle <nicolai.haehnle@amd.com> 2016-06-16 13:28:09 +0200
commit: a96d83464c64048d2ec5ee810b331d01b5838903 (patch)
tree: be4076bf17730886908a0af4e9822347bd7d309c
parent: 9c85441397bfecefa9d91f4540fdbc6784567519 (diff)
6 files changed, 843 insertions, 2 deletions
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 6d7a3cc948..26618ab607 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -154,6 +154,8 @@ libmesagallium_la_LIBADD = \
 	$(top_builddir)/src/compiler/glsl/libglsl.la \
 	$(ARCH_LIBS)
 
+libmesagallium_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS) -mavx -mavx2
+
 libmesa_sse41_la_SOURCES = \
 	$(X86_SSE41_FILES)
 
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index a49ad953c0..8e9c65584b 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -497,6 +497,8 @@ STATETRACKER_FILES = \
 	state_tracker/st_glsl_types.h \
 	state_tracker/st_manager.c \
 	state_tracker/st_manager.h \
+	state_tracker/st_memcpy_read.c \
+	state_tracker/st_memcpy_read.h \
 	state_tracker/st_mesa_to_tgsi.c \
 	state_tracker/st_mesa_to_tgsi.h \
 	state_tracker/st_nir.h \
diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c
index 4c2f2ce768..cbecc787ab 100644
--- a/src/mesa/state_tracker/st_cb_readpixels.c
+++ b/src/mesa/state_tracker/st_cb_readpixels.c
@@ -41,6 +41,7 @@
 #include "st_context.h"
 #include "st_cb_bitmap.h"
 #include "st_cb_readpixels.h"
+#include "st_memcpy_read.h"
 #include "state_tracker/st_cb_texture.h"
 #include "state_tracker/st_format.h"
 #include "state_tracker/st_pbo.h"
@@ -519,6 +520,9 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
    if (!window)
       map += y * tex_xfer->stride + x * util_format_get_blocksize(dst_format);
 
+//    static ubyte dummy[4 * 1024 * 1024];
+//    map = dummy; //HACK
+
    /* memcpy data into a user buffer */
    {
       const uint bytesPerRow = width * util_format_get_blocksize(dst_format);
@@ -528,12 +532,12 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y,
                                          type, 0, 0);
 
       if (tex_xfer->stride == bytesPerRow && destStride == bytesPerRow) {
-         memcpy(dest, map, bytesPerRow * height);
+         (*st_memcpy_read)(dest, map, bytesPerRow * height);
       } else {
          GLuint row;
 
          for (row = 0; row < (unsigned) height; row++) {
-            memcpy(dest, map, bytesPerRow);
+            (*st_memcpy_read)(dest, map, bytesPerRow);
             map += tex_xfer->stride;
             dest += destStride;
          }
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 997d428449..26b22ccd05 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -46,6 +46,7 @@
 #include "st_cb_fbo.h"
 #include "st_cb_flush.h"
 #include "st_manager.h"
+#include "st_memcpy_read.h"
 
 #include "state_tracker/st_gl_api.h"
 
@@ -969,5 +970,6 @@ static const struct st_api st_gl_api = {
 struct st_api *
 st_gl_api_create(void)
 {
+   st_init_memcpy_read();
    return (struct st_api *) &st_gl_api;
 }
diff --git a/src/mesa/state_tracker/st_memcpy_read.c b/src/mesa/state_tracker/st_memcpy_read.c
new file mode 100644
index 0000000000..4f0ac112d9
--- /dev/null
+++ b/src/mesa/state_tracker/st_memcpy_read.c
@@ -0,0 +1,797 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/*
+ * memcpy implementations used for copying from temporary staging textures to
+ * user memory.
+ */
+
+#include "util/u_debug.h"
+#include "util/u_math.h"
+
+#include "st_memcpy_read.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#define ALIGN_SRC(d, s, len, align) \
+   do { \
+      if ((uintptr_t)s & ((align) - 1)) { \
+         uintptr_t missing = align - ((uintptr_t)s & (align - 1)); \
+         uintptr_t copy = MIN2(missing, len); \
+         memcpy(d, s, copy); \
+         d += copy; \
+         s += copy; \
+         len -= copy; \
+      } \
+   } while (0)
+
+#define ALIGN_DST(d, s, len, align) \
+   do { \
+      if ((uintptr_t)d & ((align) - 1)) { \
+         uintptr_t missing = align - ((uintptr_t)d & (align - 1)); \
+         uintptr_t copy = MIN2(missing, len); \
+         memcpy(d, s, copy); \
+         d += copy; \
+         s += copy; \
+         len -= copy; \
+      } \
+   } while (0)
+
+#define SSE2_DQU_DQA(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\
+\
+      _mm_store_si128(dst_cacheline + 0, temp1); \
+      _mm_store_si128(dst_cacheline + 1, temp2); \
+      _mm_store_si128(dst_cacheline + 2, temp3); \
+      _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE2_DQA_DQA(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+      _mm_store_si128(dst_cacheline + 0, temp1); \
+      _mm_store_si128(dst_cacheline + 1, temp2); \
+      _mm_store_si128(dst_cacheline + 2, temp3); \
+      _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE2_DQA_DQU(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+      _mm_storeu_si128(dst_cacheline + 0, temp1); \
+      _mm_storeu_si128(dst_cacheline + 1, temp2); \
+      _mm_storeu_si128(dst_cacheline + 2, temp3); \
+      _mm_storeu_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE41_NTDQA_DQA(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+      _mm_store_si128(dst_cacheline + 0, temp1); \
+      _mm_store_si128(dst_cacheline + 1, temp2); \
+      _mm_store_si128(dst_cacheline + 2, temp3); \
+      _mm_store_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE41_NTDQA_DQU(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+      _mm_storeu_si128(dst_cacheline + 0, temp1); \
+      _mm_storeu_si128(dst_cacheline + 1, temp2); \
+      _mm_storeu_si128(dst_cacheline + 2, temp3); \
+      _mm_storeu_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE41_DQA_NTDQ(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_load_si128(src_cacheline + 3);\
+\
+      _mm_stream_si128(dst_cacheline + 0, temp1); \
+      _mm_stream_si128(dst_cacheline + 1, temp2); \
+      _mm_stream_si128(dst_cacheline + 2, temp3); \
+      _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE41_DQU_NTDQ(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\
+\
+      _mm_stream_si128(dst_cacheline + 0, temp1); \
+      _mm_stream_si128(dst_cacheline + 1, temp2); \
+      _mm_stream_si128(dst_cacheline + 2, temp3); \
+      _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+#define SSE41_NTDQA_NTDQ(d, s, len) \
+   do { \
+      __m128i *dst_cacheline = (__m128i *)d; \
+      __m128i *src_cacheline = (__m128i *)s; \
+\
+      __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \
+      __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \
+      __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \
+      __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\
+\
+      _mm_stream_si128(dst_cacheline + 0, temp1); \
+      _mm_stream_si128(dst_cacheline + 1, temp2); \
+      _mm_stream_si128(dst_cacheline + 2, temp3); \
+      _mm_stream_si128(dst_cacheline + 3, temp4); \
+\
+      d += 64; \
+      s += 64; \
+      len -= 64; \
+   } while (len >= 64)
+
+static void
+sse2_dqa_dqx(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_SRC(d, s, len, 16);
+
+   if (len >= 64) {
+      if ((uintptr_t)d & 15)
+         SSE2_DQA_DQU(d, s, len);
+      else
+         SSE2_DQA_DQA(d, s, len);
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse2_dqx_dqa(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 16);
+
+   if (len >= 64) {
+      if ((uintptr_t)s & 15)
+         SSE2_DQU_DQA(d, s, len);
+      else
+         SSE2_DQA_DQA(d, s, len);
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse41_ntdqa_dqu(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_SRC(d, s, len, 16);
+
+   if (len >= 64) {
+      _mm_mfence();
+
+      SSE41_NTDQA_DQU(d, s, len);
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse41_ntdqa_dqx(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_SRC(d, s, len, 16);
+
+   if (len >= 64) {
+      _mm_mfence();
+
+      if ((uintptr_t)d & 15)
+         SSE41_NTDQA_DQU(d, s, len);
+      else
+         SSE41_NTDQA_DQA(d, s, len);
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse41_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 16);
+
+   if (len >= 64) {
+      SSE41_DQU_NTDQ(d, s, len);
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse41_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 16);
+
+   if (len >= 64) {
+      if ((uintptr_t)s & 15)
+         SSE41_DQU_NTDQ(d, s, len);
+      else
+         SSE41_DQA_NTDQ(d, s, len);
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+sse41_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 16);
+
+   if (len >= 64) {
+      if ((uintptr_t)s & 15)
+         SSE41_DQU_NTDQ(d, s, len);
+      else {
+         _mm_mfence();
+
+         SSE41_NTDQA_NTDQ(d, s, len);
+      }
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+#define AVX_DQA_NTDQ(d, s, len) \
+   do { \
+      __m256i *dst_cacheline = (__m256i *)d; \
+      __m256i *src_cacheline = (__m256i *)s; \
+\
+      __m256i temp0 = _mm256_load_si256(src_cacheline + 0); \
+      __m256i temp1 = _mm256_load_si256(src_cacheline + 1); \
+      __m256i temp2 = _mm256_load_si256(src_cacheline + 2); \
+      __m256i temp3 = _mm256_load_si256(src_cacheline + 3); \
+\
+      _mm256_stream_si256(dst_cacheline + 0, temp0); \
+      _mm256_stream_si256(dst_cacheline + 1, temp1); \
+      _mm256_stream_si256(dst_cacheline + 2, temp2); \
+      _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+      d += 128; \
+      s += 128; \
+      len -= 128; \
+   } while (len >= 128);
+
+#define AVX_DQU_NTDQ(d, s, len) \
+   do { \
+      __m256i *dst_cacheline = (__m256i *)d; \
+      __m256i *src_cacheline = (__m256i *)s; \
+\
+      __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); \
+      __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); \
+      __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); \
+      __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); \
+\
+      _mm256_stream_si256(dst_cacheline + 0, temp0); \
+      _mm256_stream_si256(dst_cacheline + 1, temp1); \
+      _mm256_stream_si256(dst_cacheline + 2, temp2); \
+      _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+      d += 128; \
+      s += 128; \
+      len -= 128; \
+   } while (len >= 128)
+
+#define AVX_NTDQA_NTDQ(d, s, len) \
+   do { \
+      __m256i *dst_cacheline = (__m256i *)d; \
+      __m256i *src_cacheline = (__m256i *)s; \
+\
+      __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); \
+      __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); \
+      __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); \
+      __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); \
+\
+      _mm256_stream_si256(dst_cacheline + 0, temp0); \
+      _mm256_stream_si256(dst_cacheline + 1, temp1); \
+      _mm256_stream_si256(dst_cacheline + 2, temp2); \
+      _mm256_stream_si256(dst_cacheline + 3, temp3); \
+\
+      d += 128; \
+      s += 128; \
+      len -= 128; \
+   } while (len >= 128)
+
+static void
+avx_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 32);
+
+   if (len >= 64) {
+      AVX_DQU_NTDQ(d, s, len);
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+avx_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 32);
+
+   if (len >= 64) {
+      if ((uintptr_t)s & 31)
+         AVX_DQU_NTDQ(d, s, len);
+      else
+         AVX_DQA_NTDQ(d, s, len);
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+avx_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   const char *restrict s = src;
+
+   ALIGN_DST(d, s, len, 32);
+
+   if (len >= 64) {
+      if ((uintptr_t)s & 31)
+         AVX_DQU_NTDQ(d, s, len);
+      else {
+         _mm_mfence();
+
+         AVX_NTDQA_NTDQ(d, s, len);
+      }
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+#if 0
+static void
+memcpy_avx(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   char *restrict s = src;
+
+   /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
+    * are aligned to a 16-byte boundary or <len> == 0.
+    */
+   if ((uintptr_t)s & 31) {
+      uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)s & 31);
+      assert(bytes_before_alignment_boundary < 32);
+
+	  uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len);
+      memcpy(d, s, copy_bytes);
+
+      d += copy_bytes;
+      s += copy_bytes;
+      len -= copy_bytes;
+   }
+
+   if (len >= 128) {
+      _mm_mfence();
+
+      while (len >= 128) {
+         __m256i *dst_cacheline = (__m256i *)d;
+         __m256i *src_cacheline = (__m256i *)s;
+
+#if 0
+         __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+         *(dst_cacheline + 0) = temp0;
+         *(dst_cacheline + 1) = temp1;
+         *(dst_cacheline + 2) = temp2;
+         *(dst_cacheline + 3) = temp3;
+#elif 0
+         __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+         _mm256_storeu_si256(dst_cacheline + 0, temp0);
+         _mm256_storeu_si256(dst_cacheline + 1, temp1);
+         _mm256_storeu_si256(dst_cacheline + 2, temp2);
+         _mm256_storeu_si256(dst_cacheline + 3, temp3);
+#elif 1
+         __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3);
+
+         _mm256_stream_si256(dst_cacheline + 0, temp0);
+         _mm256_stream_si256(dst_cacheline + 1, temp1);
+         _mm256_stream_si256(dst_cacheline + 2, temp2);
+         _mm256_stream_si256(dst_cacheline + 3, temp3);
+#else
+         _mm256_stream_si256(dst_cacheline + 0, *(src_cacheline + 0));
+         _mm256_stream_si256(dst_cacheline + 1, *(src_cacheline + 1));
+         _mm256_stream_si256(dst_cacheline + 2, *(src_cacheline + 2));
+         _mm256_stream_si256(dst_cacheline + 3, *(src_cacheline + 3));
+#endif
+
+         d += 128;
+         s += 128;
+         len -= 128;
+      }
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+
+static void
+memcpy_avx_source_unaligned(void *restrict dst, const void *restrict src, size_t len)
+{
+   char *restrict d = dst;
+   char *restrict s = src;
+
+   /* memcpy() the misaligned header. At the end of this if block, <d> and <s>
+    * are aligned to a 16-byte boundary or <len> == 0.
+    */
+   if ((uintptr_t)d & 31) {
+      uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)d & 31);
+      assert(bytes_before_alignment_boundary < 32);
+
+	  uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len);
+      memcpy(d, s, copy_bytes);
+
+      d += copy_bytes;
+      s += copy_bytes;
+      len -= copy_bytes;
+   }
+
+   if (len >= 256) {
+      _mm_mfence();
+
+#if 0
+      __asm__(
+         ".p2align 4\n"
+         "1:\n"
+         "vmovdqu (%1), %%ymm0\n"
+         "addq $64, %0\n"
+         "vmovdqu 32(%1), %%ymm1\n"
+         "addq $64, %1\n"
+         "subq $64, %2\n"
+         "vmovntdq %%ymm0, -64(%0)\n"
+         "vmovntdq %%ymm1, -32(%0)\n"
+         "cmpq $127, %2\n"
+         "ja 1b\n"
+         : "=r" (d), "=r" (s), "=r" (len)
+         : "0" (d), "1" (s), "2" (len)
+         : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory");
+#elif 1
+      __asm__(
+         ".p2align 4\n"
+         "1:\n"
+         "vmovntdqa (%1), %%ymm0\n"
+         "subq $-128, %0\n"
+         "vmovntdqa 32(%1), %%ymm1\n"
+         "vmovntdqa 64(%1), %%ymm2\n"
+         "vmovntdqa 96(%1), %%ymm3\n"
+         "subq $-128, %1\n"
+         "addq $-128, %2\n"
+         "vmovdqa %%ymm0, -128(%0)\n"
+         "vmovdqa %%ymm1, -96(%0)\n"
+         "vmovdqa %%ymm2, -64(%0)\n"
+         "vmovdqa %%ymm3, -32(%0)\n"
+         "cmpq $127, %2\n"
+         "ja 1b\n"
+         ".p2align 4\n"
+         "vzeroupper\n"
+         : "=r" (d), "=r" (s), "=r" (len)
+         : "0" (d), "1" (s), "2" (len)
+         : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory");
+#elif 0
+      __asm__(
+         ".p2align 4\n"
+         "1:\n"
+         "vmovdqu (%1), %%ymm0\n"
+         "addq $256, %0\n"
+         "vmovdqu 32(%1), %%ymm1\n"
+         "vmovdqu 64(%1), %%ymm2\n"
+         "vmovdqu 96(%1), %%ymm3\n"
+         "vmovdqu 128(%1), %%ymm4\n"
+         "vmovdqu 160(%1), %%ymm5\n"
+         "vmovdqu 192(%1), %%ymm6\n"
+         "vmovdqu 224(%1), %%ymm6\n"
+         "addq $256, %1\n"
+         "subq $256, %2\n"
+         "vmovntdq %%ymm0, -256(%0)\n"
+         "vmovntdq %%ymm1, -224(%0)\n"
+         "vmovntdq %%ymm2, -192(%0)\n"
+         "vmovntdq %%ymm3, -160(%0)\n"
+         "vmovntdq %%ymm4, -128(%0)\n"
+         "vmovntdq %%ymm5, -96(%0)\n"
+         "vmovntdq %%ymm6, -64(%0)\n"
+         "vmovntdq %%ymm7, -32(%0)\n"
+         "cmpq $255, %2\n"
+         "ja 1b\n"
+         : "=r" (d), "=r" (s), "=r" (len)
+         : "0" (d), "1" (s), "2" (len)
+         : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory");
+#else
+      while (len >= 128) {
+         __m256i *dst_cacheline = (__m256i *)d;
+         __m256i *src_cacheline = (__m256i *)s;
+
+//          _mm_prefetch(s + 0x1c0, _MM_HINT_NTA);
+//          _mm_prefetch(s + 0x280, _MM_HINT_NTA);
+
+#if 0
+         __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+
+         _mm256_stream_si256(dst_cacheline + 0, temp0);
+         _mm256_stream_si256(dst_cacheline + 1, temp1);
+         _mm256_stream_si256(dst_cacheline + 2, temp2);
+         _mm256_stream_si256(dst_cacheline + 3, temp3);
+#elif 0
+         __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+         __m256i temp4 = _mm256_loadu_si256(src_cacheline + 4);
+         __m256i temp5 = _mm256_loadu_si256(src_cacheline + 5);
+         __m256i temp6 = _mm256_loadu_si256(src_cacheline + 6);
+         __m256i temp7 = _mm256_loadu_si256(src_cacheline + 7);
+
+         _mm256_stream_si256(dst_cacheline + 0, temp0);
+         _mm256_stream_si256(dst_cacheline + 1, temp1);
+         _mm256_stream_si256(dst_cacheline + 2, temp2);
+         _mm256_stream_si256(dst_cacheline + 3, temp3);
+         _mm256_stream_si256(dst_cacheline + 4, temp4);
+         _mm256_stream_si256(dst_cacheline + 5, temp5);
+         _mm256_stream_si256(dst_cacheline + 6, temp6);
+         _mm256_stream_si256(dst_cacheline + 7, temp7);
+
+         d += 128;
+         s += 128;
+         len -= 128;
+#else
+         __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0);
+         __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1);
+         __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2);
+         __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3);
+
+         _mm256_store_si256(dst_cacheline + 0, temp0);
+         _mm256_store_si256(dst_cacheline + 1, temp1);
+         _mm256_store_si256(dst_cacheline + 2, temp2);
+         _mm256_store_si256(dst_cacheline + 3, temp3);
+#endif
+
+         d += 128;
+         s += 128;
+         len -= 128;
+      }
+#endif
+
+      _mm_sfence();
+   }
+
+   /* memcpy() the tail. */
+   if (len) {
+      memcpy(d, s, len);
+   }
+}
+#endif
+
+static void
+memcpy_wrapper(void *restrict dst, const void *restrict src, size_t len)
+{
+   memcpy(dst, src, len);
+}
+
+static const struct {
+   const char *name;
+   void (*ptr)(void *restrict dst, const void *restrict src, size_t len);
+} memcpy_table[] = {
+   { "sse2_dqx_dqa", sse2_dqx_dqa },
+   { "sse2_dqa_dqx", sse2_dqa_dqx },
+
+   { "sse41_ntdqa_dqu", sse41_ntdqa_dqu },
+   { "sse41_ntdqa_dqx", sse41_ntdqa_dqx },
+   { "sse41_dqu_ntdq", sse41_dqu_ntdq },
+   { "sse41_dqx_ntdq", sse41_dqx_ntdq },
+   { "sse41_ntdqx_ntdq", sse41_ntdqx_ntdq },
+
+   { "avx_dqu_ntdq", avx_dqu_ntdq },
+   { "avx_dqx_ntdq", avx_dqx_ntdq },
+
+   { "avx_ntdqx_ntdq", avx_ntdqx_ntdq },
+
+   { "memcpy", memcpy_wrapper }
+};
+
+void
+(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len) = NULL;
+
+void
+st_init_memcpy_read()
+{
+   static const char envvar[] = "ST_MEMCPY_READ";
+   const char *name = debug_get_option(envvar, NULL);
+
+   if (name) {
+      for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) {
+         if (strcmp(name, memcpy_table[i].name) == 0) {
+            st_memcpy_read = memcpy_table[i].ptr;
+            return;
+         }
+      }
+
+      fprintf(stderr, "Unknown value of %s. Available options:\n", envvar);
+      for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) {
+         fprintf(stderr, "%s\n", memcpy_table[i].name);
+      }
+   }
+
+   st_memcpy_read = memcpy_wrapper;
+}
diff --git a/src/mesa/state_tracker/st_memcpy_read.h b/src/mesa/state_tracker/st_memcpy_read.h
new file mode 100644
index 0000000000..8014c5d5f8
--- /dev/null
+++ b/src/mesa/state_tracker/st_memcpy_read.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef ST_MEMCPY_READ_H
+#define ST_MEMCPY_READ_H
+
+extern void
+(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len);
+
+void
+st_init_memcpy_read(void);
+
+#endif /* ST_MEMCPY_READ_H */
author	Nicolai Hähnle <nicolai.haehnle@amd.com>	2016-06-15 11:06:38 +0200
committer	Nicolai Hähnle <nicolai.haehnle@amd.com>	2016-06-16 13:28:09 +0200
commit	a96d83464c64048d2ec5ee810b331d01b5838903 (patch)
tree	be4076bf17730886908a0af4e9822347bd7d309c
parent	9c85441397bfecefa9d91f4540fdbc6784567519 (diff)