#define RUN_ME /* nasm -g -f elf64 unpremultiply-sse2.S CFLAGS="-W -Wall -Wextra -std=c99 -O2 -g" CFLAGS="$CFLAGS -O3 -fomit-frame-pointer -funroll-all-loops" gcc $CFLAGS -c *.c gcc $CFLAGS -o `basename $0 .c` *.o exit $? */ /* * Copyright (c) 2009 M Joonas Pihlaja * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ /* Test driver for unpremultipliers. */ #include #include #include #include #include #include #include /* gettimeofday */ /* Pixel format config for a 32 bit pixel with 8 bit components. Only * the location of alpha matters. */ #ifndef ASHIFT # define ASHIFT 24 #endif #define RSHIFT ((24 + ASHIFT) % 32) #define GSHIFT ((16 + ASHIFT) % 32) #define BSHIFT (( 8 + ASHIFT) % 32) #define AMASK (255 << ASHIFT) #define RMASK (255 << RSHIFT) #define GMASK (255 << GSHIFT) #define BMASK (255 << BSHIFT) /* The methods we have available. 
*/
/*
 * Prototypes for the unpremultipliers exercised by this driver.  Only the
 * declarations are visible here; the definitions are not part of this view
 * of the file (presumably they live in the other objects named in the
 * RUN_ME build recipe -- TODO confirm).  Each takes a destination buffer,
 * a source buffer and an element count n.
 */
void unpremultiply_with_sse2(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_inv(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_invb(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_lut(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_lutb(uint32_t *dst, uint32_t const *src, size_t n);
void unpremultiply_with_div(uint32_t *dst, uint32_t const *src, size_t n);

/*
 * Some non-functional methods to gauge the computation overhead.
 * They share the signature of the real unpremultipliers but do no
 * unpremultiplication at all.
 */

/*
 * Timing reference: copy n 32-bit elements from src to dst.
 * Marked noinline so the call is not folded into its caller.
 * NOTE(review): the usage text further down describes an aliased mode in
 * which source and destination are the same buffer; memcpy and the restrict
 * qualifiers both require non-overlapping buffers -- confirm the two are
 * never combined.
 */
static void __attribute__((noinline))
unpremultiply_with_memcpy(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
{
    /* The length is in bytes: 4 per uint32_t element. */
    memcpy(dst, src, 4*n);
}

/*
 * Timing reference: zero-fill n 32-bit elements of dst.
 * src is accepted only to keep the common signature and is ignored.
 */
static void __attribute__((noinline))
unpremultiply_with_memset(uint32_t * restrict dst, uint32_t const * restrict src, size_t n)
{
    (void)src; /* deliberately unused */
    memset(dst, 0, 4*n);
}

/*
 * Global volatile accumulator.  Presumably unpremultiply_with_read stores
 * its sum here so that the reads cannot be optimised away; the store itself
 * is not visible in this copy of the file -- TODO confirm.
 */
volatile uint32_t read_sum;

/*
 * NOTE(review): from this point on the text is damaged.  Everything that
 * stood between a less-than sign and the following greater-than sign has
 * been lost, so the loop of unpremultiply_with_read runs straight into the
 * tail of a different function.  What remains of that function extracts the
 * a, r, g, b components, limits r, g and b to at most a, and writes the
 * repacked pixel back one byte at a time.  The fragment is kept verbatim;
 * restore the missing text from a pristine copy before changing it.
 */
static void __attribute__((noinline)) unpremultiply_with_read(uint32_t * restrict dst, uint32_t const * restrict src, size_t n) { size_t i; uint32_t sum = 0; (void)src; (void)dst; for (i=0; i> ASHIFT) & 0xFF; uint32_t r = (rgba >> RSHIFT) & 0xFF; uint32_t g = (rgba >> GSHIFT) & 0xFF; uint32_t b = (rgba >> BSHIFT) & 0xFF; r = r < a ? r : a; g = g < a ? g : a; b = b < a ? b : a; pix.u32 = (a << ASHIFT) | (r << RSHIFT) | (g << GSHIFT) | (b << BSHIFT); p[i*4 + 0] = pix.u8[0]; p[i*4 + 1] = pix.u8[1]; p[i*4 + 2] = pix.u8[2]; p[i*4 + 3] = pix.u8[3]; } }
/*
 * Methods to create pixel patterns to test.
 *
 * NOTE(review): fill_random is cut off inside its loop condition by the
 * same damage, and the text then continues in the middle of the usage
 * message of what is presumably main.  The opening quote of that message is
 * among the lost text, so nothing may be inserted after this comment
 * without altering string data.
 */
static void fill_random(uint32_t *buf, size_t n) { uint32_t x = 123456789; size_t i; for (i=0; i [solid] [method] [verify]\n" "\n" "Prints out the time in ms taken to unpremultiply a data buffer\n" "using the given method. 
Optionally verifies the result.\n" "If 'aliased' is given on the command line then the source\n" "and destination buffers are the same.\n" "\n" "Available methods:\n" " div The reference unpremultiplier using three\n" " divisions per pixel.\n" " lut A division table based unpremultiplier.\n" " inv A reciprocal table based unpremultiplier.\n" " lutb 'lut' with special casings for constant and\n" " solid pixel runs.\n" " invb 'inv' with special casings for constant and\n" " solid pixel runs.\n" " sse2 An AMD64/SSE2 unpremultiplier.\n" "\n" "Methods suitable as timing references:\n" " copy memcpy(dst, src, pixels);\n" " write memset(dst, 0, pixels);\n" " read Sums the source pixels.\n" "\n" "Available data generators:\n" " random Random pixel values.\n" " gradient A smoothly varying alpha gradient.\n" " Usually gives the same results as\n" " the random data.\n" " clear All zeros.\n" "\n" "If the 'solid' suffix is given after the data, then the\n" "data is forced to be completely opaque by setting the alpha\n" "of every pixel to 255. If 'verify' is given on the command\n" "line then the final result is verified against a reference\n" "unpremultiplier. 
Verification using aliased data buffers\n" "works correctly when loops=1.\n" "\n" "Environment variables affecting the run:\n" " loops [500] Number of times to do the\n" " unpremultiplication\n" " pixels [2097152] Size of the data buffers in pixels.\n" " offset [0] Byte offset added to the data buffer\n" " pointers to misalign the pixel data.\n" " This option is liable to segfault the\n" " program.\n" ); return 1; } udst.u8 = calloc(n*4+offset, 1) + offset; usrc.u8 = calloc(n*4+offset, 1) + offset; #define dst udst.u32 #define src usrc.u32 for (i=1; i 0) { unpremultiply_with_div(dst, src, n); } } else if (0 == strcmp(method, "lut")) { while (nloops-- > 0) { unpremultiply_with_lut(dst, src, n); } } else if (0 == strcmp(method, "lutb")) { while (nloops-- > 0) { unpremultiply_with_lutb(dst, src, n); } } else if (0 == strcmp(method, "inv")) { while (nloops-- > 0) { unpremultiply_with_inv(dst, src, n); } } else if (0 == strcmp(method, "invb")) { while (nloops-- > 0) { unpremultiply_with_invb(dst, src, n); } } else if (0 == strcmp(method, "sse2")) { while (nloops-- > 0) { unpremultiply_with_sse2(dst, src, n); } } else if (0 == strcmp(method, "copy")) { while (nloops-- > 0) { unpremultiply_with_memcpy(dst, src, n); } } else if (0 == strcmp(method, "write")) { while (nloops-- > 0) { unpremultiply_with_memset(dst, src, n); } } else if (0 == strcmp(method, "read")) { while (nloops-- > 0) { unpremultiply_with_read(dst, src, n); } } else if (0 == strcmp(method, "noop")) { /* do nothing. */ } else { fprintf(stderr, "unknown method %s\n", method); return 1; } elapsed_ms = now_ms() - elapsed_ms; printf("%f\n", elapsed_ms); if (verify) { unsigned i; int maxdiff = 0; for (i=0; i maxdiff) { printf("maxdiff now %d: " "src[%u]=%08x dst[%u]=%08x ref[%u]=%08x " "(component %d)\n", diff, i, src[i], i, dst[i], i, ref[i], j); maxdiff = diff; } x >>= 8; y >>= 8; } } } return 0; }