diff options
author | Nicolai Hähnle <nicolai.haehnle@amd.com> | 2016-06-15 11:06:38 +0200 |
---|---|---|
committer | Nicolai Hähnle <nicolai.haehnle@amd.com> | 2016-06-16 13:28:09 +0200 |
commit | a96d83464c64048d2ec5ee810b331d01b5838903 (patch) | |
tree | be4076bf17730886908a0af4e9822347bd7d309c | |
parent | 9c85441397bfecefa9d91f4540fdbc6784567519 (diff) |
WIP st/mesa: memcpy drop-ins for readpixelsreadpixels
-rw-r--r-- | src/mesa/Makefile.am | 2 | ||||
-rw-r--r-- | src/mesa/Makefile.sources | 2 | ||||
-rw-r--r-- | src/mesa/state_tracker/st_cb_readpixels.c | 8 | ||||
-rw-r--r-- | src/mesa/state_tracker/st_manager.c | 2 | ||||
-rw-r--r-- | src/mesa/state_tracker/st_memcpy_read.c | 797 | ||||
-rw-r--r-- | src/mesa/state_tracker/st_memcpy_read.h | 34 |
6 files changed, 843 insertions, 2 deletions
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am index 6d7a3cc948..26618ab607 100644 --- a/src/mesa/Makefile.am +++ b/src/mesa/Makefile.am @@ -154,6 +154,8 @@ libmesagallium_la_LIBADD = \ $(top_builddir)/src/compiler/glsl/libglsl.la \ $(ARCH_LIBS) +libmesagallium_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS) -mavx -mavx2 + libmesa_sse41_la_SOURCES = \ $(X86_SSE41_FILES) diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources index a49ad953c0..8e9c65584b 100644 --- a/src/mesa/Makefile.sources +++ b/src/mesa/Makefile.sources @@ -497,6 +497,8 @@ STATETRACKER_FILES = \ state_tracker/st_glsl_types.h \ state_tracker/st_manager.c \ state_tracker/st_manager.h \ + state_tracker/st_memcpy_read.c \ + state_tracker/st_memcpy_read.h \ state_tracker/st_mesa_to_tgsi.c \ state_tracker/st_mesa_to_tgsi.h \ state_tracker/st_nir.h \ diff --git a/src/mesa/state_tracker/st_cb_readpixels.c b/src/mesa/state_tracker/st_cb_readpixels.c index 4c2f2ce768..cbecc787ab 100644 --- a/src/mesa/state_tracker/st_cb_readpixels.c +++ b/src/mesa/state_tracker/st_cb_readpixels.c @@ -41,6 +41,7 @@ #include "st_context.h" #include "st_cb_bitmap.h" #include "st_cb_readpixels.h" +#include "st_memcpy_read.h" #include "state_tracker/st_cb_texture.h" #include "state_tracker/st_format.h" #include "state_tracker/st_pbo.h" @@ -519,6 +520,9 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y, if (!window) map += y * tex_xfer->stride + x * util_format_get_blocksize(dst_format); +// static ubyte dummy[4 * 1024 * 1024]; +// map = dummy; //HACK + /* memcpy data into a user buffer */ { const uint bytesPerRow = width * util_format_get_blocksize(dst_format); @@ -528,12 +532,12 @@ st_ReadPixels(struct gl_context *ctx, GLint x, GLint y, type, 0, 0); if (tex_xfer->stride == bytesPerRow && destStride == bytesPerRow) { - memcpy(dest, map, bytesPerRow * height); + (*st_memcpy_read)(dest, map, bytesPerRow * height); } else { GLuint row; for (row = 0; row < (unsigned) height; row++) { - memcpy(dest, map, bytesPerRow); + (*st_memcpy_read)(dest, map, bytesPerRow); map += tex_xfer->stride; dest += destStride; } diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 997d428449..26b22ccd05 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -46,6 +46,7 @@ #include "st_cb_fbo.h" #include "st_cb_flush.h" #include "st_manager.h" +#include "st_memcpy_read.h" #include "state_tracker/st_gl_api.h" @@ -969,5 +970,6 @@ static const struct st_api st_gl_api = { struct st_api * st_gl_api_create(void) { + st_init_memcpy_read(); return (struct st_api *) &st_gl_api; } diff --git a/src/mesa/state_tracker/st_memcpy_read.c b/src/mesa/state_tracker/st_memcpy_read.c new file mode 100644 index 0000000000..4f0ac112d9 --- /dev/null +++ b/src/mesa/state_tracker/st_memcpy_read.c @@ -0,0 +1,797 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +/* + * memcpy implementations used for copying from temporary staging textures to + * user memory. + */ + +#include "util/u_debug.h" +#include "util/u_math.h" + +#include "st_memcpy_read.h" + +#include <stdio.h> +#include <string.h> + +#include <immintrin.h> +#include <smmintrin.h> + +#define ALIGN_SRC(d, s, len, align) \ + do { \ + if ((uintptr_t)s & ((align) - 1)) { \ + uintptr_t missing = align - ((uintptr_t)s & (align - 1)); \ + uintptr_t copy = MIN2(missing, len); \ + memcpy(d, s, copy); \ + d += copy; \ + s += copy; \ + len -= copy; \ + } \ + } while (0) + +#define ALIGN_DST(d, s, len, align) \ + do { \ + if ((uintptr_t)d & ((align) - 1)) { \ + uintptr_t missing = align - ((uintptr_t)d & (align - 1)); \ + uintptr_t copy = MIN2(missing, len); \ + memcpy(d, s, copy); \ + d += copy; \ + s += copy; \ + len -= copy; \ + } \ + } while (0) + +#define SSE2_DQU_DQA(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\ +\ + _mm_store_si128(dst_cacheline + 0, temp1); \ + _mm_store_si128(dst_cacheline + 1, temp2); \ + _mm_store_si128(dst_cacheline + 2, temp3); \ + _mm_store_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE2_DQA_DQA(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_load_si128(src_cacheline + 3);\ +\ + _mm_store_si128(dst_cacheline + 0, temp1); \ + _mm_store_si128(dst_cacheline + 1, temp2); \ + _mm_store_si128(dst_cacheline + 2, temp3); \ + _mm_store_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE2_DQA_DQU(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_load_si128(src_cacheline + 3);\ +\ + _mm_storeu_si128(dst_cacheline + 0, temp1); \ + _mm_storeu_si128(dst_cacheline + 1, temp2); \ + _mm_storeu_si128(dst_cacheline + 2, temp3); \ + _mm_storeu_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE41_NTDQA_DQA(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\ +\ + _mm_store_si128(dst_cacheline + 0, temp1); \ + _mm_store_si128(dst_cacheline + 1, temp2); \ + _mm_store_si128(dst_cacheline + 2, temp3); \ + _mm_store_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE41_NTDQA_DQU(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\ +\ + _mm_storeu_si128(dst_cacheline + 0, temp1); \ + _mm_storeu_si128(dst_cacheline + 1, temp2); \ + _mm_storeu_si128(dst_cacheline + 2, temp3); \ + _mm_storeu_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE41_DQA_NTDQ(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_load_si128(src_cacheline + 3);\ +\ + _mm_stream_si128(dst_cacheline + 0, temp1); \ + _mm_stream_si128(dst_cacheline + 1, temp2); \ + _mm_stream_si128(dst_cacheline + 2, temp3); \ + _mm_stream_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE41_DQU_NTDQ(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_loadu_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_loadu_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_loadu_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_loadu_si128(src_cacheline + 3);\ +\ + _mm_stream_si128(dst_cacheline + 0, temp1); \ + _mm_stream_si128(dst_cacheline + 1, temp2); \ + _mm_stream_si128(dst_cacheline + 2, temp3); \ + _mm_stream_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +#define SSE41_NTDQA_NTDQ(d, s, len) \ + do { \ + __m128i *dst_cacheline = (__m128i *)d; \ + __m128i *src_cacheline = (__m128i *)s; \ +\ + __m128i temp1 = _mm_stream_load_si128(src_cacheline + 0); \ + __m128i temp2 = _mm_stream_load_si128(src_cacheline + 1); \ + __m128i temp3 = _mm_stream_load_si128(src_cacheline + 2); \ + __m128i temp4 = _mm_stream_load_si128(src_cacheline + 3);\ +\ + _mm_stream_si128(dst_cacheline + 0, temp1); \ + _mm_stream_si128(dst_cacheline + 1, temp2); \ + _mm_stream_si128(dst_cacheline + 2, temp3); \ + _mm_stream_si128(dst_cacheline + 3, temp4); \ +\ + d += 64; \ + s += 64; \ + len -= 64; \ + } while (len >= 64) + +static void +sse2_dqa_dqx(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_SRC(d, s, len, 16); + + if (len >= 64) { + if ((uintptr_t)d & 15) + SSE2_DQA_DQU(d, s, len); + else + SSE2_DQA_DQA(d, s, len); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse2_dqx_dqa(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 16); + + if (len >= 64) { + if ((uintptr_t)s & 15) + SSE2_DQU_DQA(d, s, len); + else + SSE2_DQA_DQA(d, s, len); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse41_ntdqa_dqu(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_SRC(d, s, len, 16); + + if (len >= 64) { + _mm_mfence(); + + SSE41_NTDQA_DQU(d, s, len); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse41_ntdqa_dqx(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_SRC(d, s, len, 16); + + if (len >= 64) { + _mm_mfence(); + + if ((uintptr_t)d & 15) + SSE41_NTDQA_DQU(d, s, len); + else + SSE41_NTDQA_DQA(d, s, len); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse41_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 16); + + if (len >= 64) { + SSE41_DQU_NTDQ(d, s, len); + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse41_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 16); + + if (len >= 64) { + if ((uintptr_t)s & 15) + SSE41_DQU_NTDQ(d, s, len); + else + SSE41_DQA_NTDQ(d, s, len); + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +sse41_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 16); + + if (len >= 64) { + if ((uintptr_t)s & 15) + SSE41_DQU_NTDQ(d, s, len); + else { + _mm_mfence(); + + SSE41_NTDQA_NTDQ(d, s, len); + } + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +#define AVX_DQA_NTDQ(d, s, len) \ + do { \ + __m256i *dst_cacheline = (__m256i *)d; \ + __m256i *src_cacheline = (__m256i *)s; \ +\ + __m256i temp0 = _mm256_load_si256(src_cacheline + 0); \ + __m256i temp1 = _mm256_load_si256(src_cacheline + 1); \ + __m256i temp2 = _mm256_load_si256(src_cacheline + 2); \ + __m256i temp3 = _mm256_load_si256(src_cacheline + 3); \ +\ + _mm256_stream_si256(dst_cacheline + 0, temp0); \ + _mm256_stream_si256(dst_cacheline + 1, temp1); \ + _mm256_stream_si256(dst_cacheline + 2, temp2); \ + _mm256_stream_si256(dst_cacheline + 3, temp3); \ +\ + d += 128; \ + s += 128; \ + len -= 128; \ + } while (len >= 128); + +#define AVX_DQU_NTDQ(d, s, len) \ + do { \ + __m256i *dst_cacheline = (__m256i *)d; \ + __m256i *src_cacheline = (__m256i *)s; \ +\ + __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); \ + __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); \ + __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); \ + __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); \ +\ + _mm256_stream_si256(dst_cacheline + 0, temp0); \ + _mm256_stream_si256(dst_cacheline + 1, temp1); \ + _mm256_stream_si256(dst_cacheline + 2, temp2); \ + _mm256_stream_si256(dst_cacheline + 3, temp3); \ +\ + d += 128; \ + s += 128; \ + len -= 128; \ + } while (len >= 128) + +#define AVX_NTDQA_NTDQ(d, s, len) \ + do { \ + __m256i *dst_cacheline = (__m256i *)d; \ + __m256i *src_cacheline = (__m256i *)s; \ +\ + __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); \ + __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); \ + __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); \ + __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); \ +\ + _mm256_stream_si256(dst_cacheline + 0, temp0); \ + _mm256_stream_si256(dst_cacheline + 1, temp1); \ + _mm256_stream_si256(dst_cacheline + 2, temp2); \ + _mm256_stream_si256(dst_cacheline + 3, temp3); \ +\ + d += 128; \ + s += 128; \ + len -= 128; \ + } while (len >= 128) + +static void +avx_dqu_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 32); + + if (len >= 64) { + AVX_DQU_NTDQ(d, s, len); + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +avx_dqx_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 32); + + if (len >= 64) { + if ((uintptr_t)s & 31) + AVX_DQU_NTDQ(d, s, len); + else + AVX_DQA_NTDQ(d, s, len); + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +avx_ntdqx_ntdq(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + const char *restrict s = src; + + ALIGN_DST(d, s, len, 32); + + if (len >= 64) { + if ((uintptr_t)s & 31) + AVX_DQU_NTDQ(d, s, len); + else { + _mm_mfence(); + + AVX_NTDQA_NTDQ(d, s, len); + } + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +#if 0 +static void +memcpy_avx(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + char *restrict s = src; + + /* memcpy() the misaligned header. At the end of this if block, <d> and <s> + * are aligned to a 16-byte boundary or <len> == 0. + */ + if ((uintptr_t)s & 31) { + uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)s & 31); + assert(bytes_before_alignment_boundary < 32); + + uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len); + memcpy(d, s, copy_bytes); + + d += copy_bytes; + s += copy_bytes; + len -= copy_bytes; + } + + if (len >= 128) { + _mm_mfence(); + + while (len >= 128) { + __m256i *dst_cacheline = (__m256i *)d; + __m256i *src_cacheline = (__m256i *)s; + +#if 0 + __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); + __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); + __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); + __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); + + *(dst_cacheline + 0) = temp0; + *(dst_cacheline + 1) = temp1; + *(dst_cacheline + 2) = temp2; + *(dst_cacheline + 3) = temp3; +#elif 0 + __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); + __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); + __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); + __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); + + _mm256_storeu_si256(dst_cacheline + 0, temp0); + _mm256_storeu_si256(dst_cacheline + 1, temp1); + _mm256_storeu_si256(dst_cacheline + 2, temp2); + _mm256_storeu_si256(dst_cacheline + 3, temp3); +#elif 1 + __m256i temp0 = _mm256_stream_load_si256(src_cacheline + 0); + __m256i temp1 = _mm256_stream_load_si256(src_cacheline + 1); + __m256i temp2 = _mm256_stream_load_si256(src_cacheline + 2); + __m256i temp3 = _mm256_stream_load_si256(src_cacheline + 3); + + _mm256_stream_si256(dst_cacheline + 0, temp0); + _mm256_stream_si256(dst_cacheline + 1, temp1); + _mm256_stream_si256(dst_cacheline + 2, temp2); + _mm256_stream_si256(dst_cacheline + 3, temp3); +#else + _mm256_stream_si256(dst_cacheline + 0, *(src_cacheline + 0)); + _mm256_stream_si256(dst_cacheline + 1, *(src_cacheline + 1)); + _mm256_stream_si256(dst_cacheline + 2, *(src_cacheline + 2)); + _mm256_stream_si256(dst_cacheline + 3, *(src_cacheline + 3)); +#endif + + d += 128; + s += 128; + len -= 128; + } + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} + +static void +memcpy_avx_source_unaligned(void *restrict dst, const void *restrict src, size_t len) +{ + char *restrict d = dst; + char *restrict s = src; + + /* memcpy() the misaligned header. At the end of this if block, <d> and <s> + * are aligned to a 16-byte boundary or <len> == 0. + */ + if ((uintptr_t)d & 31) { + uintptr_t bytes_before_alignment_boundary = 32 - ((uintptr_t)d & 31); + assert(bytes_before_alignment_boundary < 32); + + uintptr_t copy_bytes = MIN2(bytes_before_alignment_boundary, len); + memcpy(d, s, copy_bytes); + + d += copy_bytes; + s += copy_bytes; + len -= copy_bytes; + } + + if (len >= 256) { + _mm_mfence(); + +#if 0 + __asm__( + ".p2align 4\n" + "1:\n" + "vmovdqu (%1), %%ymm0\n" + "addq $64, %0\n" + "vmovdqu 32(%1), %%ymm1\n" + "addq $64, %1\n" + "subq $64, %2\n" + "vmovntdq %%ymm0, -64(%0)\n" + "vmovntdq %%ymm1, -32(%0)\n" + "cmpq $127, %2\n" + "ja 1b\n" + : "=r" (d), "=r" (s), "=r" (len) + : "0" (d), "1" (s), "2" (len) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); +#elif 1 + __asm__( + ".p2align 4\n" + "1:\n" + "vmovntdqa (%1), %%ymm0\n" + "subq $-128, %0\n" + "vmovntdqa 32(%1), %%ymm1\n" + "vmovntdqa 64(%1), %%ymm2\n" + "vmovntdqa 96(%1), %%ymm3\n" + "subq $-128, %1\n" + "addq $-128, %2\n" + "vmovdqa %%ymm0, -128(%0)\n" + "vmovdqa %%ymm1, -96(%0)\n" + "vmovdqa %%ymm2, -64(%0)\n" + "vmovdqa %%ymm3, -32(%0)\n" + "cmpq $127, %2\n" + "ja 1b\n" + ".p2align 4\n" + "vzeroupper\n" + : "=r" (d), "=r" (s), "=r" (len) + : "0" (d), "1" (s), "2" (len) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); +#elif 0 + __asm__( + ".p2align 4\n" + "1:\n" + "vmovdqu (%1), %%ymm0\n" + "addq $256, %0\n" + "vmovdqu 32(%1), %%ymm1\n" + "vmovdqu 64(%1), %%ymm2\n" + "vmovdqu 96(%1), %%ymm3\n" + "vmovdqu 128(%1), %%ymm4\n" + "vmovdqu 160(%1), %%ymm5\n" + "vmovdqu 192(%1), %%ymm6\n" + "vmovdqu 224(%1), %%ymm6\n" + "addq $256, %1\n" + "subq $256, %2\n" + "vmovntdq %%ymm0, -256(%0)\n" + "vmovntdq %%ymm1, -224(%0)\n" + "vmovntdq %%ymm2, -192(%0)\n" + "vmovntdq %%ymm3, -160(%0)\n" + "vmovntdq %%ymm4, -128(%0)\n" + "vmovntdq %%ymm5, -96(%0)\n" + "vmovntdq %%ymm6, -64(%0)\n" + "vmovntdq %%ymm7, -32(%0)\n" + "cmpq $255, %2\n" + "ja 1b\n" + : "=r" (d), "=r" (s), "=r" (len) + : "0" (d), "1" (s), "2" (len) + : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory"); +#else + while (len >= 128) { + __m256i *dst_cacheline = (__m256i *)d; + __m256i *src_cacheline = (__m256i *)s; + +// _mm_prefetch(s + 0x1c0, _MM_HINT_NTA); +// _mm_prefetch(s + 0x280, _MM_HINT_NTA); + +#if 0 + __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); + __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); + __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); + __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); + + _mm256_stream_si256(dst_cacheline + 0, temp0); + _mm256_stream_si256(dst_cacheline + 1, temp1); + _mm256_stream_si256(dst_cacheline + 2, temp2); + _mm256_stream_si256(dst_cacheline + 3, temp3); +#elif 0 + __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); + __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); + __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); + __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); + __m256i temp4 = _mm256_loadu_si256(src_cacheline + 4); + __m256i temp5 = _mm256_loadu_si256(src_cacheline + 5); + __m256i temp6 = _mm256_loadu_si256(src_cacheline + 6); + __m256i temp7 = _mm256_loadu_si256(src_cacheline + 7); + + _mm256_stream_si256(dst_cacheline + 0, temp0); + _mm256_stream_si256(dst_cacheline + 1, temp1); + _mm256_stream_si256(dst_cacheline + 2, temp2); + _mm256_stream_si256(dst_cacheline + 3, temp3); + _mm256_stream_si256(dst_cacheline + 4, temp4); + _mm256_stream_si256(dst_cacheline + 5, temp5); + _mm256_stream_si256(dst_cacheline + 6, temp6); + _mm256_stream_si256(dst_cacheline + 7, temp7); + + d += 128; + s += 128; + len -= 128; +#else + __m256i temp0 = _mm256_loadu_si256(src_cacheline + 0); + __m256i temp1 = _mm256_loadu_si256(src_cacheline + 1); + __m256i temp2 = _mm256_loadu_si256(src_cacheline + 2); + __m256i temp3 = _mm256_loadu_si256(src_cacheline + 3); + + _mm256_store_si256(dst_cacheline + 0, temp0); + _mm256_store_si256(dst_cacheline + 1, temp1); + _mm256_store_si256(dst_cacheline + 2, temp2); + _mm256_store_si256(dst_cacheline + 3, temp3); +#endif + + d += 128; + s += 128; + len -= 128; + } +#endif + + _mm_sfence(); + } + + /* memcpy() the tail. */ + if (len) { + memcpy(d, s, len); + } +} +#endif + +static void +memcpy_wrapper(void *restrict dst, const void *restrict src, size_t len) +{ + memcpy(dst, src, len); +} + +static const struct { + const char *name; + void (*ptr)(void *restrict dst, const void *restrict src, size_t len); +} memcpy_table[] = { + { "sse2_dqx_dqa", sse2_dqx_dqa }, + { "sse2_dqa_dqx", sse2_dqa_dqx }, + + { "sse41_ntdqa_dqu", sse41_ntdqa_dqu }, + { "sse41_ntdqa_dqx", sse41_ntdqa_dqx }, + { "sse41_dqu_ntdq", sse41_dqu_ntdq }, + { "sse41_dqx_ntdq", sse41_dqx_ntdq }, + { "sse41_ntdqx_ntdq", sse41_ntdqx_ntdq }, + + { "avx_dqu_ntdq", avx_dqu_ntdq }, + { "avx_dqx_ntdq", avx_dqx_ntdq }, + + { "avx_ntdqx_ntdq", avx_ntdqx_ntdq }, + + { "memcpy", memcpy_wrapper } +}; + +void +(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len) = NULL; + +void +st_init_memcpy_read() +{ + static const char envvar[] = "ST_MEMCPY_READ"; + const char *name = debug_get_option(envvar, NULL); + + if (name) { + for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) { + if (strcmp(name, memcpy_table[i].name) == 0) { + st_memcpy_read = memcpy_table[i].ptr; + return; + } + } + + fprintf(stderr, "Unknown value of %s. Available options:\n", envvar); + for (unsigned i = 0; i < ARRAY_SIZE(memcpy_table); ++i) { + fprintf(stderr, "%s\n", memcpy_table[i].name); + } + } + + st_memcpy_read = memcpy_wrapper; +} diff --git a/src/mesa/state_tracker/st_memcpy_read.h b/src/mesa/state_tracker/st_memcpy_read.h new file mode 100644 index 0000000000..8014c5d5f8 --- /dev/null +++ b/src/mesa/state_tracker/st_memcpy_read.h @@ -0,0 +1,34 @@ +/* + * Copyright 2016 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * on the rights to use, copy, modify, merge, publish, distribute, sub + * license, and/or sell copies of the Software, and to permit persons to whom + * the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef ST_MEMCPY_READ_H +#define ST_MEMCPY_READ_H + +extern void +(*st_memcpy_read)(void *restrict dst, const void* restrict src, size_t len); + +void +st_init_memcpy_read(void); + +#endif /* ST_MEMCPY_READ_H */ |