summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorIzik Eidus <ieidus@redhat.com>2010-02-16 21:35:57 +0200
committerAlexander Larsson <alexl@redhat.com>2010-02-17 11:14:38 +0100
commit43d86f9e38436d07066cf667ee4ed0b94d54a28c (patch)
treeac84ca278f5c20bc741355b548473c0defd48700
parentad5433a6b33ec59954601f0b7dbacc4609a4f754 (diff)
do guest driver to host memcpys using sse.
Just copy chunks of data in sse2 instrcutions with prefeching of cache bytes. - 35%~ cpu improvment when playing video in windows. The first code that I wrote for this patch actuly handled all the algiment requirment in the spice qxl win driver, and did all the copys in 16 bytes algiment, however due requiments from libspice server the cost of maitaing 16byte algiment for each "line of copy" is too high and therefore whenever we better use unalgiment copys whenever the bitmaps arent already 16byte algiment. There is alot of space for improvment in this patch, but due to timelines and the fact that we have another hurry tasks I will just push this patch for now - the results are very good right now Thanks. Signed-off-by: Izik Eidus <ieidus@redhat.com>
-rw-r--r--display/driver.c1
-rw-r--r--display/qxldd.h5
-rw-r--r--display/res.c321
-rw-r--r--display/res.h1
4 files changed, 313 insertions, 15 deletions
diff --git a/display/driver.c b/display/driver.c
index a8ef9f0..b723bf5 100644
--- a/display/driver.c
+++ b/display/driver.c
@@ -240,6 +240,7 @@ BOOL DrvEnableDriver(ULONG engine_version, ULONG enable_data_size, PDRVENABLEDAT
mspace_set_abort_func(mspace_abort);
mspace_set_print_func(mspace_print);
ResInitGlobals();
+ CheckAndSetSSE2();
InitGlobalRes();
DEBUG_PRINT((NULL, 1, "%s: end\n", __FUNCTION__));
return TRUE;
diff --git a/display/qxldd.h b/display/qxldd.h
index bff84d1..80dd30a 100644
--- a/display/qxldd.h
+++ b/display/qxldd.h
@@ -184,6 +184,9 @@ typedef struct DevRes {
#endif
} DevRes;
+#define SSE_MASK 15
+#define SSE_ALIGN 16
+
typedef struct PDev {
HANDLE driver;
HDEV eng;
@@ -268,6 +271,8 @@ typedef struct PDev {
Ring update_trace;
UpdateTrace update_trace_items[NUM_UPDATE_TRACE_ITEMS];
+
+ UINT8 FPUSave[16 * 4 + 15];
} PDev;
diff --git a/display/res.c b/display/res.c
index 2b0735a..a358351 100644
--- a/display/res.c
+++ b/display/res.c
@@ -90,6 +90,8 @@ typedef struct QXLOutput {
UINT8 data[0];
} QXLOutput;
+static int have_sse2 = FALSE;
+
UINT64 ReleaseOutput(PDev *pdev, UINT64 output_id)
{
@@ -798,26 +800,229 @@ static BOOL SetClip(PDev *pdev, CLIPOBJ *clip, QXLDrawable *drawable)
return TRUE;
}
+static _inline void fast_memcpy_aligment(void *dest, const void *src, size_t len)
+{
+ _asm
+ {
+ mov ecx, len
+ mov esi, src
+ mov edi, dest
+
+ cmp ecx, 128
+ jb try_to_copy64
+
+ prefetchnta [esi]
+ copy_128:
+ prefetchnta [esi + 64]
+
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + 32]
+ movdqa xmm3, [esi + 48]
+
+ prefetchnta [esi + 128]
+
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+ movntdq [edi + 32], xmm2
+ movntdq [edi + 48], xmm3
+
+ movdqa xmm0, [esi + 64]
+ movdqa xmm1, [esi + 80]
+ movdqa xmm2, [esi + 96]
+ movdqa xmm3, [esi + 112]
+
+ movntdq [edi + 64], xmm0
+ movntdq [edi + 80], xmm1
+ movntdq [edi + 96], xmm2
+ movntdq [edi + 112], xmm3
+
+ add edi, 128
+ add esi, 128
+ sub ecx, 128
+ cmp ecx, 128
+ jae copy_128
+
+ try_to_copy64:
+ cmp ecx, 64
+ jb try_to_copy32
+
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + 32]
+ movdqa xmm3, [esi + 48]
+
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+ movntdq [edi + 32], xmm2
+ movntdq [edi + 48], xmm3
+
+ add edi, 64
+ add esi, 64
+ sub ecx, 64
+ prefetchnta [esi]
+
+ try_to_copy32:
+ cmp ecx, 32
+ jb try_to_copy16
+
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+
+ add edi, 32
+ add esi, 32
+ sub ecx, 32
+
+ try_to_copy16:
+ cmp ecx, 16
+ jb try_to_copy4
+
+ movdqa xmm0, [esi]
+ movntdq [edi], xmm0
+
+ add edi, 16
+ add esi, 16
+ sub ecx, 16
+
+
+ try_to_copy4:
+ cmp ecx, 4
+ jb try_to_copy_1
+ movsd
+ sub ecx, 4
+ jmp try_to_copy4
+
+ try_to_copy_1:
+ rep movsb
+
+ sfence
+ }
+}
+
+static _inline void fast_memcpy_unaligment(void *dest, const void *src, size_t len)
+{
+ _asm
+ {
+ mov ecx, len
+ mov esi, src
+ mov edi, dest
+
+ cmp ecx, 128
+ jb try_to_copy64
+
+ prefetchnta [esi]
+ copy_128:
+ prefetchnta [esi + 64]
+
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + 16]
+ movdqu xmm2, [esi + 32]
+ movdqu xmm3, [esi + 48]
+
+ prefetchnta [esi + 128]
+
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+ movntdq [edi + 32], xmm2
+ movntdq [edi + 48], xmm3
+
+ movdqu xmm0, [esi + 64]
+ movdqu xmm1, [esi + 80]
+ movdqu xmm2, [esi + 96]
+ movdqu xmm3, [esi + 112]
+
+ movntdq [edi + 64], xmm0
+ movntdq [edi + 80], xmm1
+ movntdq [edi + 96], xmm2
+ movntdq [edi + 112], xmm3
+
+ add edi, 128
+ add esi, 128
+ sub ecx, 128
+ cmp ecx, 128
+ jae copy_128
+
+ try_to_copy64:
+ cmp ecx, 64
+ jb try_to_copy32
+
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + 16]
+ movdqu xmm2, [esi + 32]
+ movdqu xmm3, [esi + 48]
+
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+ movntdq [edi + 32], xmm2
+ movntdq [edi + 48], xmm3
+
+ add edi, 64
+ add esi, 64
+ sub ecx, 64
+ prefetchnta [esi]
+
+ try_to_copy32:
+ cmp ecx, 32
+ jb try_to_copy16
+
+ movdqu xmm0, [esi]
+ movdqu xmm1, [esi + 16]
+ movntdq [edi], xmm0
+ movntdq [edi + 16], xmm1
+
+ add edi, 32
+ add esi, 32
+ sub ecx, 32
+
+ try_to_copy16:
+ cmp ecx, 16
+ jb try_to_copy4
+
+ movdqu xmm0, [esi]
+ movntdq [edi], xmm0
+
+ add edi, 16
+ add esi, 16
+ sub ecx, 16
+
+
+ try_to_copy4:
+ cmp ecx, 4
+ jb try_to_copy_1
+ movsd
+ sub ecx, 4
+ jmp try_to_copy4
+
+ try_to_copy_1:
+ rep movsb
+
+ sfence
+ }
+}
+
#ifdef DBG
#define PutBytesAlign __PutBytesAlign
- #define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size)\
- __PutBytesAlign(pdev, chunk, now, end, src, size, page_counter, alloc_size, 1)
+#define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size, use_sse)\
+ __PutBytesAlign(pdev, chunk, now, end, src, size, page_counter, alloc_size, 1, use_sse)
#else
- #define PutBytesAlign(pdev, chunk, now, end, src, size, page_counter, alloc_size, alignment)\
- __PutBytesAlign(pdev, chunk, now, end, src, size, NULL, alloc_size, alignment)
- #define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size)\
- __PutBytesAlign(pdev, chunk, now, end, src, size, NULL, alloc_size, 1)
+#define PutBytesAlign(pdev, chunk, now, end, src, size, page_counter, alloc_size, alignment, use_sse)\
+ __PutBytesAlign(pdev, chunk, now, end, src, size, NULL, alloc_size, alignment, use_sse)
+#define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size, use_sse)\
+ __PutBytesAlign(pdev, chunk, now, end, src, size, NULL, alloc_size, 1, use_sse)
#endif
#define BITS_BUF_MAX (64 * 1024)
static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_ptr,
UINT8 **end_ptr, UINT8 *src, int size, int *page_counter,
- size_t alloc_size, uint32_t alignment)
+ size_t alloc_size, uint32_t alignment, BOOL use_sse)
{
QXLDataChunk *chunk = *chunk_ptr;
UINT8 *now = *now_ptr;
UINT8 *end = *end_ptr;
+ int offset;
DEBUG_PRINT((pdev, 12, "%s\n", __FUNCTION__));
while (size) {
@@ -831,7 +1036,35 @@ static void __PutBytesAlign(PDev *pdev, QXLDataChunk **chunk_ptr, UINT8 **now_pt
NEW_DATA_CHUNK(page_counter, aligned_size);
cp_size = MIN(end - now, size);
}
- RtlCopyMemory(now, src, cp_size);
+
+ if (use_sse) {
+ offset = (size_t)now & SSE_MASK;
+ if (offset) {
+ offset = SSE_ALIGN - offset;
+ if (offset >= cp_size) {
+ RtlCopyMemory(now, src, cp_size);
+ src += cp_size;
+ now += cp_size;
+ chunk->data_size += cp_size;
+ size -= cp_size;
+ continue;
+ }
+ RtlCopyMemory(now, src, offset);
+ now += offset;
+ src += offset;
+ size -= offset;
+ cp_size -= offset;
+ chunk->data_size += offset;
+ }
+
+ if (((size_t)src & SSE_MASK) == 0) {
+ fast_memcpy_aligment(now, src, cp_size);
+ } else {
+ fast_memcpy_unaligment(now, src, cp_size);
+ }
+ } else {
+ RtlCopyMemory(now, src, cp_size);
+ }
src += cp_size;
now += cp_size;
chunk->data_size += cp_size;
@@ -1270,6 +1503,36 @@ static void FreeBitmapImage(PDev *pdev, Resource *res) // todo: defer
DEBUG_PRINT((pdev, 13, "%s: done\n", __FUNCTION__));
}
+static _inline void RestoreFPU(PDev *pdev)
+{
+ void *align_addr = (void *)ALIGN((size_t)(&pdev->FPUSave), SSE_ALIGN);
+
+ _asm
+ {
+ mov esi, align_addr
+
+ movdqa xmm0, [esi]
+ movdqa xmm1, [esi + 16]
+ movdqa xmm2, [esi + 32]
+ movdqa xmm3, [esi + 48]
+ }
+}
+
+static _inline void SaveFPU(PDev *pdev)
+{
+ void *align_addr = (void *)ALIGN((size_t)(&pdev->FPUSave), SSE_ALIGN);
+
+ _asm
+ {
+ mov edi, align_addr
+
+ movdqa [edi], xmm0
+ movdqa [edi + 16], xmm1
+ movdqa [edi + 32], xmm2
+ movdqa [edi + 48], xmm3
+ }
+}
+
#define BITMAP_ALLOC_BASE (sizeof(Resource) + sizeof(InternalImage) + sizeof(QXLDataChunk))
static _inline Resource *GetBitmapImage(PDev *pdev, SURFOBJ *surf, XLATEOBJ *color_trans,
@@ -1283,6 +1546,7 @@ static _inline Resource *GetBitmapImage(PDev *pdev, SURFOBJ *surf, XLATEOBJ *col
UINT8 *src_end;
UINT8 *dest;
UINT8 *dest_end;
+ BOOL use_sse = FALSE;
DEBUG_PRINT((pdev, 12, "%s\n", __FUNCTION__));
ASSERT(pdev, width > 0 && height > 0);
@@ -1314,9 +1578,19 @@ static _inline Resource *GetBitmapImage(PDev *pdev, SURFOBJ *surf, XLATEOBJ *col
dest = chunk->data;
dest_end = (UINT8 *)image_res + alloc_size;
alloc_size = height * line_size;
+
+ if (have_sse2 && alloc_size >= 1024) {
+ use_sse = TRUE;
+ SaveFPU(pdev);
+ }
+
for (; src != src_end; src -= surf->lDelta, alloc_size -= line_size) {
PutBytesAlign(pdev, &chunk, &dest, &dest_end, src, line_size,
- &pdev->Res.num_bits_pages, alloc_size, line_size);
+ &pdev->Res.num_bits_pages, alloc_size, line_size, TRUE);
+ }
+
+ if (use_sse) {
+ RestoreFPU(pdev);
}
GetPallette(pdev, &internal->image.bitmap, color_trans);
@@ -2357,7 +2631,7 @@ static BOOL GetCursorCommon(PDev *pdev, QXLCursorCmd *cmd, LONG hot_x, LONG hot_
src_end = src + (surf->lDelta * surf->sizlBitmap.cy);
for (; src != src_end; src += surf->lDelta) {
PutBytes(pdev, &info->chunk, &info->now, &info->end, src, line_size,
- &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
}
CursorCacheAdd(pdev, internal);
@@ -2454,14 +2728,14 @@ BOOL GetColorCursor(PDev *pdev, QXLCursorCmd *cmd, LONG hot_x, LONG hot_y, SURFO
if (pdev->bitmap_format == BMF_32BPP) {
PutBytes(pdev, &info.chunk, &info.now, &info.end, (UINT8 *)color_trans->pulXlate,
- 256 << 2, &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ 256 << 2, &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
} else {
int i;
for (i = 0; i < 256; i++) {
UINT32 ent = _16bppTo32bpp(color_trans->pulXlate[i]);
PutBytes(pdev, &info.chunk, &info.now, &info.end, (UINT8 *)&ent,
- 4, &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ 4, &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
}
}
info.cursor->data_size += 256 << 2;
@@ -2474,14 +2748,14 @@ BOOL GetColorCursor(PDev *pdev, QXLCursorCmd *cmd, LONG hot_x, LONG hot_y, SURFO
if (pdev->bitmap_format == BMF_32BPP) {
PutBytes(pdev, &info.chunk, &info.now, &info.end, (UINT8 *)color_trans->pulXlate,
- 16 << 2, &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ 16 << 2, &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
} else {
int i;
for (i = 0; i < 16; i++) {
UINT32 ent = _16bppTo32bpp(color_trans->pulXlate[i]);
PutBytes(pdev, &info.chunk, &info.now, &info.end, (UINT8 *)&ent,
- 4, &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ 4, &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
}
}
info.cursor->data_size += 16 << 2;
@@ -2493,7 +2767,7 @@ BOOL GetColorCursor(PDev *pdev, QXLCursorCmd *cmd, LONG hot_x, LONG hot_y, SURFO
for (; src != src_end; src += mask->lDelta) {
PutBytes(pdev, &info.chunk, &info.now, &info.end, src, line_size,
- &pdev->Res.num_cursor_pages, PAGE_SIZE);
+ &pdev->Res.num_cursor_pages, PAGE_SIZE, FALSE);
}
}
@@ -2614,3 +2888,20 @@ void ResDestroyGlobals()
image_id_sem = NULL;
}
+void CheckAndSetSSE2()
+{
+ _asm
+ {
+ mov eax, 0x0000001
+ cpuid
+ and edx, 0x4000000
+ mov have_sse2, edx
+ }
+
+ if (have_sse2) {
+ have_sse2 = TRUE;
+ }
+}
+
+
+
diff --git a/display/res.h b/display/res.h
index e181434..5af8a26 100644
--- a/display/res.h
+++ b/display/res.h
@@ -56,4 +56,5 @@ BOOL ResInit(PDev *pdev);
void ResDestroy(PDev *pdev);
void ResInitGlobals();
void ResDestroyGlobals();
+void CheckAndSetSSE2();
#endif