diff options
author | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-08-20 14:32:33 +0200 |
---|---|---|
committer | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-08-20 14:32:33 +0200 |
commit | 97917f6c1898a6eeb68c54ed640263bb435fc0ec (patch) | |
tree | 7aee7beced995227b0c5051ba0d153637c5bbb54 | |
parent | 4c887fde686489e3e8cfee11f7e366146674627e (diff) |
Reimplement read_image* in native C++ instead of OpenCL.
This new implementation is smaller and more readable. It is also
architecure-independent and not tied to SSE.
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/core/cpu/builtins.cpp | 47 | ||||
-rw-r--r-- | src/core/cpu/kernel.h | 6 | ||||
-rw-r--r-- | src/core/cpu/sampler.cpp | 301 | ||||
-rw-r--r-- | src/core/sampler.cpp | 2 | ||||
-rw-r--r-- | src/runtime/stdlib.c | 266 |
6 files changed, 347 insertions, 276 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 232e964..348d4bc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,6 +41,7 @@ set(COAL_SRC_FILES core/cpu/program.cpp core/cpu/worker.cpp core/cpu/builtins.cpp + core/cpu/sampler.cpp ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.h.embed.h ${CMAKE_CURRENT_BINARY_DIR}/runtime/stdlib.c.bc.embed.h diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp index aacd649..081717c 100644 --- a/src/core/cpu/builtins.cpp +++ b/src/core/cpu/builtins.cpp @@ -239,18 +239,6 @@ void CPUKernelWorkGroup::barrier(unsigned int flags) // a barrier and that we returned to this one. We can continue. } -void *CPUKernelWorkGroup::getImageData(Image2D *image, int x, int y, int z) const -{ - CPUBuffer *buffer = - (CPUBuffer *)image->deviceBuffer((DeviceInterface *)p_kernel->device()); - - return imageData((unsigned char *)buffer->data(), - x, y, z, - image->row_pitch(), - image->slice_pitch(), - image->pixel_size()); -} - void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const { std::cout << "OpenCL: Non-existant builtin function " << name @@ -309,17 +297,17 @@ static void barrier(unsigned int flags) // Images -int get_image_width(Image2D *image) +static int get_image_width(Image2D *image) { return image->width(); } -int get_image_height(Image2D *image) +static int get_image_height(Image2D *image) { return image->height(); } -int get_image_depth(Image3D *image) +static int get_image_depth(Image3D *image) { if (image->type() != MemObject::Image3D) return 1; @@ -327,17 +315,17 @@ int get_image_depth(Image3D *image) return image->depth(); } -int get_image_channel_data_type(Image2D *image) +static int get_image_channel_data_type(Image2D *image) { return image->format().image_channel_data_type; } -int get_image_channel_order(Image2D *image) +static int get_image_channel_order(Image2D *image) { return image->format().image_channel_order; } -void *image_data(Image2D *image, int x, int y, int z, int *order, int *type) +static void *image_data(Image2D *image, int x, int y, int z, int *order, int *type) { *order = image->format().image_channel_order; *type = image->format().image_channel_data_type; @@ -345,11 +333,26 @@ void *image_data(Image2D *image, int x, int y, int z, int *order, int *type) return g_work_group->getImageData(image, x, y, z); } -bool is_image_3d(Image3D *image) +static bool is_image_3d(Image3D *image) { return (image->type() == MemObject::Image3D ? 1 : 0); } +static void write_imagef(Image2D *image, int x, int y, int z, float *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + +static void write_imagei(Image2D *image, int x, int y, int z, int32_t *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + +static void write_imageui(Image2D *image, int x, int y, int z, uint32_t *color) +{ + g_work_group->writeImage(image, x, y, z, color); +} + /* * Bridge between LLVM and us */ @@ -392,6 +395,12 @@ void *getBuiltin(const std::string &name) return (void *)&image_data; else if (name == "__cpu_is_image_3d") return (void *)&is_image_3d; + else if (name == "__cpu_write_imagef") + return (void *)&write_imagef; + else if (name == "__cpu_write_imagei") + return (void *)&write_imagei; + else if (name == "__cpu_write_imageui") + return (void *)&write_imageui; else if (name == "debug") return (void *)&printf; diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index b26e072..dec42e6 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -37,6 +37,7 @@ #include <ucontext.h> #include <pthread.h> +#include <stdint.h> namespace llvm { @@ -100,8 +101,13 @@ class CPUKernelWorkGroup size_t getNumGroups(cl_uint dimindx) const; size_t getGroupID(cl_uint dimindx) const; size_t getGlobalOffset(cl_uint dimindx) const; + void barrier(unsigned int flags); + void *getImageData(Image2D *image, int x, int y, int z) const; + void writeImage(Image2D *image, int x, int y, int z, float *color) const; + void writeImage(Image2D *image, int x, int y, int z, int32_t *color) const; + void writeImage(Image2D *image, int x, int y, int z, uint32_t *color) const; void builtinNotFound(const std::string &name) const; diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp new file mode 100644 index 0000000..c173004 --- /dev/null +++ b/src/core/cpu/sampler.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2011, Denis Steckelmacher <steckdenis@yahoo.fr> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the copyright holder nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file cpu/sampler.cpp + * \brief OpenCL C image access functions + * + * It is recommended to compile this file using Clang as it supports the + * \c __builtin_shufflevector() built-in function, providing SSE or + * NEON-accelerated code. + */ + +#include "../memobject.h" +#include "kernel.h" +#include "buffer.h" +#include "builtins.h" + +#include <cstdlib> +#include <immintrin.h> + +using namespace Coal; + +/* + * Macros or functions used to accelerate the functions + */ +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b, + int x, int y, int z, int w) +{ + rs[0] = (x < 4 ? a[x] : b[x - 4]); + rs[1] = (y < 4 ? a[y] : b[y - 4]); + rs[2] = (z < 4 ? a[z] : b[z - 4]); + rs[3] = (w < 4 ? a[w] : b[w - 4]); +} + +static void slow_convert_to_format4f(float *data, cl_channel_type type) +{ + // Convert always the four components of source to target + if (type == CL_FLOAT) + return; + + // NOTE: We can read and write at the same time in data because + // we always begin wy reading 4 bytes (float) and never write + // more than 4 bytes, so no data is corrupted + for (unsigned int i=0; i<3; ++i) + { + switch (type) + { + case CL_SNORM_INT8: + ((int8_t *)data)[i] = data[i] * 128.0f; + break; + case CL_SNORM_INT16: + ((int16_t *)data)[i] = data[i] * 32767.0f; + break; + case CL_UNORM_INT8: + ((uint8_t *)data)[i] = data[i] * 256.0f; + break; + case CL_UNORM_INT16: + ((uint16_t *)data)[i] = data[i] * 65535.0f; + break; + } + } +} + +static void slow_convert_to_format4i(int *data, cl_channel_type type) +{ + // Convert always the four components of source to target + if (type == CL_SIGNED_INT32) + return; + + for (unsigned int i=0; i<3; ++i) + { + switch (type) + { + case CL_SIGNED_INT8: + ((int8_t *)data)[i] = data[i]; + break; + case CL_SIGNED_INT16: + ((int16_t *)data)[i] = data[i]; + break; + } + } +} + +static void slow_convert_to_format4ui(uint32_t *data, cl_channel_type type) +{ + // Convert always the four components of source to target + if (type == CL_UNSIGNED_INT32) + return; + + for (unsigned int i=0; i<3; ++i) + { + switch (type) + { + case CL_UNSIGNED_INT8: + ((uint8_t *)data)[i] = data[i]; + break; + case CL_UNSIGNED_INT16: + ((uint16_t *)data)[i] = data[i]; + break; + } + } +} + +#if __has_builtin(__builtin_shufflevector) + #define shuffle4(rs, a, b, x, y, z, w) \ + *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \ + x, y, z, w) +#else + #define shuffle4(rs, a, b, x, y, z, w) \ + slow_shuffle4(rs, a, b, x, y, z, w) +#endif + + #define convert_to_format4f(data, type) \ + slow_convert_to_format4f(data, type) + + #define convert_to_format4i(data, type) \ + slow_convert_to_format4i(data, type) + + #define convert_to_format4ui(data, type) \ + slow_convert_to_format4ui(data, type) + +static void swizzle(uint32_t *target, uint32_t *source, + cl_channel_order order, bool reading, uint32_t t_max) +{ + uint32_t special[4] = {0, t_max, 0, 0 }; + + if (reading) + { + switch (order) + { + case CL_R: + case CL_Rx: + // target = {source->x, 0, 0, t_max} + shuffle4(target, source, special, 0, 4, 4, 5); + break; + case CL_A: + // target = {0, 0, 0, source->x} + shuffle4(target, source, special, 4, 4, 4, 0); + break; + case CL_INTENSITY: + // target = {source->x, source->x, source->x, source->x} + shuffle4(target, source, source, 0, 0, 0, 0); + break; + case CL_LUMINANCE: + // target = {source->x, source->x, source->x, t_max} + shuffle4(target, source, special, 0, 0, 0, 5); + break; + case CL_RG: + case CL_RGx: + // target = {source->x, source->y, 0, t_max} + shuffle4(target, source, special, 0, 1, 4, 5); + break; + case CL_RA: + // target = {source->x, 0, 0, source->y} + shuffle4(target, source, special, 0, 4, 4, 1); + break; + case CL_RGB: + case CL_RGBx: + case CL_RGBA: + // Nothing to do, already the good order + std::memcpy(target, source, 16); + break; + case CL_ARGB: + // target = {source->y, source->z, source->w, source->x} + shuffle4(target, source, source, 1, 2, 3, 0); + break; + case CL_BGRA: + // target = {source->z, source->y, source->x, source->w} + shuffle4(target, source, source, 2, 1, 0, 3); + break; + } + } + else + { + switch (order) + { + case CL_A: + // target = {source->w, undef, undef, undef} + shuffle4(target, source, source, 3, 3, 3, 3); + break; + case CL_RA: + // target = {source->x, source->w, undef, undef} + shuffle4(target, source, source, 0, 3, 3, 3); + break; + case CL_ARGB: + // target = {source->w, source->x, source->y, source->z} + shuffle4(target, source, source, 3, 0, 1, 2); + break; + case CL_BGRA: + // target = {source->z, source->y, source->x, source->w} + shuffle4(target, source, source, 2, 1, 0, 3); + break; + default: + std::memcpy(target, source, 16); + } + } +} + +/* + * Actual implementation of the built-ins + */ + +void *CPUKernelWorkGroup::getImageData(Image2D *image, int x, int y, int z) const +{ + CPUBuffer *buffer = + (CPUBuffer *)image->deviceBuffer((DeviceInterface *)p_kernel->device()); + + return imageData((unsigned char *)buffer->data(), + x, y, z, + image->row_pitch(), + image->slice_pitch(), + image->pixel_size()); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + float *color) const +{ + float converted[4]; + + // Swizzle to the correct order (float, int and uint are 32-bit, so the + // type has no importance + swizzle((uint32_t *)converted, (uint32_t *)color, + image->format().image_channel_order, false, 0); + + // Convert color to the correct format + convert_to_format4f(converted, image->format().image_channel_data_type); + + // Get a pointer in the image where to write the data + void *target = getImageData(image, x, y, z); + + // Copy the converted data to the image + std::memcpy(target, converted, image->pixel_size()); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + int32_t *color) const +{ + int32_t converted[4]; + + // Swizzle to the correct order (float, int and uint are 32-bit, so the + // type has no importance + swizzle((uint32_t *)converted, (uint32_t *)color, + image->format().image_channel_order, false, 0); + + // Convert color to the correct format + convert_to_format4i(converted, image->format().image_channel_data_type); + + // Get a pointer in the image where to write the data + void *target = getImageData(image, x, y, z); + + // Copy the converted data to the image + std::memcpy(target, converted, image->pixel_size()); +} + +void CPUKernelWorkGroup::writeImage(Image2D *image, int x, int y, int z, + uint32_t *color) const +{ + uint32_t converted[4]; + + // Swizzle to the correct order (float, int and uint are 32-bit, so the + // type has no importance + swizzle((uint32_t *)converted, (uint32_t *)color, + image->format().image_channel_order, false, 0); + + // Convert color to the correct format + convert_to_format4ui(converted, image->format().image_channel_data_type); + + // Get a pointer in the image where to write the data + void *target = getImageData(image, x, y, z); + + // Copy the converted data to the image + std::memcpy(target, converted, image->pixel_size()); +}
\ No newline at end of file diff --git a/src/core/sampler.cpp b/src/core/sampler.cpp index 8895bd0..558b84a 100644 --- a/src/core/sampler.cpp +++ b/src/core/sampler.cpp @@ -26,7 +26,7 @@ */ /** - * \file sampler.cpp + * \file core/sampler.cpp * \brief Sampler */ diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c index 2610a98..cbb3ec8 100644 --- a/src/runtime/stdlib.c +++ b/src/runtime/stdlib.c @@ -50,6 +50,10 @@ int __cpu_get_image_channel_order(void *image); int __cpu_is_image_3d(void *image); void *__cpu_image_data(void *image, int x, int y, int z, int *order, int *type); +void __cpu_write_imagef(void *image, int x, int y, int z, float4 *color); +void __cpu_write_imagei(void *image, int x, int y, int z, int4 *color); +void __cpu_write_imageui(void *image, int x, int y, int z, uint4 *color); + int4 handle_address_mode(image3d_t image, int4 coord, sampler_t sampler) { coord.w = 0; @@ -743,284 +747,34 @@ uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, float4 coord) void OVERLOAD write_imagef(image2d_t image, int2 coord, float4 color) { - int4 c; - c.xy = coord; - c.zw = 0; - - write_imagef((image3d_t)image, c, color); + __cpu_write_imagef(image, coord.x, coord.y, 0, &color); } void OVERLOAD write_imagef(image3d_t image, int4 coord, float4 color) { - int order, type; - void *v_target = __cpu_image_data(image, coord.x, coord.y, coord.z, &order, &type); - -#define SWIZZLE(order, target, data) \ - switch (order) \ - { \ - case CLK_R: \ - case CLK_Rx: \ - (*target).x = data.x; \ - break; \ - case CLK_A: \ - (*target).x = data.w; \ - break; \ - case CLK_RG: \ - case CLK_RGx: \ - (*target).xy = data.xy; \ - break; \ - case CLK_RA: \ - (*target).xy = data.xw; \ - break; \ - case CLK_RGBA: \ - *target = data; \ - break; \ - case CLK_BGRA: \ - (*target).zyxw = data.xyzw; \ - break; \ - case CLK_ARGB: \ - (*target).wxyz = data.xyzw; \ - break; \ - case CLK_INTENSITY: \ - case CLK_LUMINANCE: \ - (*target).x = data.x; \ - break; \ - } - - switch (type) - { - case CLK_SNORM_INT8: - { - char4 *target = v_target; - char4 data; - - // Denormalize - data.x = (color.x * 127.0f); - data.y = (color.y * 127.0f); - data.z = (color.z * 127.0f); - data.w = (color.w * 127.0f); - - SWIZZLE(order, target, data) - break; - } - case CLK_UNORM_INT8: - { - uchar4 *target = v_target; - uchar4 data; - - // Denormalize - data.x = (color.x * 255.0f); - data.y = (color.y * 255.0f); - data.z = (color.z * 255.0f); - data.w = (color.w * 255.0f); - - SWIZZLE(order, target, data) - break; - } - case CLK_SNORM_INT16: - { - short4 *target = v_target; - short4 data; - - // Denormalize - data.x = (color.x * 32767.0f); - data.y = (color.y * 32767.0f); - data.z = (color.z * 32767.0f); - data.w = (color.w * 32767.0f); - - SWIZZLE(order, target, data) - break; - } - case CLK_UNORM_INT16: - { - ushort4 *target = v_target; - ushort4 data; - - data.x = (color.x * 65535.0f); - data.y = (color.y * 65535.0f); - data.z = (color.z * 65535.0f); - data.w = (color.w * 65535.0f); - - SWIZZLE(order, target, data) - break; - } - case CLK_FLOAT: - { - float4 *target = v_target; - - SWIZZLE(order, target, color) - break; - } - } - -#undef SWIZZLE + __cpu_write_imagef(image, coord.x, coord.y, coord.z, &color); } -#define SWIZZLE_8(target, data) \ - case CLK_ARGB: \ - (*target).wxyz = data.xyzw; \ - break; \ - case CLK_BGRA: \ - (*target).zyxw = data.xyzw; \ - break; - -#define SWIZZLE_16(target, data) \ - case CLK_LUMINANCE: \ - case CLK_INTENSITY: \ - (*target).x = data.x; \ - break; - -#define SWIZZLE_32(target, data) \ - case CLK_R: \ - case CLK_Rx: \ - (*target).x = data.x; \ - break; \ - case CLK_A: \ - (*target).x = data.w; \ - break; \ - case CLK_RG: \ - case CLK_RGx: \ - (*target).xy = data.xy; \ - break; \ - case CLK_RA: \ - (*target).xy = data.xw; \ - break; \ - case CLK_RGBA: \ - *target = data; \ - break; - void OVERLOAD write_imagei(image2d_t image, int2 coord, int4 color) { - int4 c; - c.xy = coord; - c.zw = 0; - - write_imagei((image3d_t)image, c, color); + __cpu_write_imagei(image, coord.x, coord.y, 0, &color); } void OVERLOAD write_imagei(image3d_t image, int4 coord, int4 color) { - int order, type; - void *v_target = __cpu_image_data(image, coord.x, coord.y, coord.z, &order, &type); - - switch (type) - { - case CLK_SIGNED_INT8: - { - char4 *target = v_target; - char4 data; - - data.x = color.x; - data.y = color.y; - data.z = color.z; - data.w = color.w; - - switch (order) - { - SWIZZLE_8(target, data) - SWIZZLE_16(target, data) - SWIZZLE_32(target, data) - } - - break; - } - case CLK_SIGNED_INT16: - { - short4 *target = v_target; - short4 data; - - data.x = color.x; - data.y = color.y; - data.z = color.z; - data.w = color.w; - - switch (order) - { - SWIZZLE_16(target, data) - SWIZZLE_32(target, data) - } - - break; - } - case CLK_SIGNED_INT32: - { - int4 *target = v_target; - - switch (order) - { - SWIZZLE_32(target, color) - } - - break; - } - } + __cpu_write_imagei(image, coord.x, coord.y, coord.z, &color); } void OVERLOAD write_imageui(image2d_t image, int2 coord, uint4 color) { - int4 c; - c.xy = coord; - c.zw = 0; - - write_imageui((image3d_t)image, c, color); + __cpu_write_imageui(image, coord.x, coord.y, 0, &color); } void OVERLOAD write_imageui(image3d_t image, int4 coord, uint4 color) { - int order, type; - void *v_target = __cpu_image_data(image, coord.x, coord.y, coord.z, &order, &type); - - switch (type) - { - case CLK_UNSIGNED_INT8: - { - uchar4 *target = v_target; - uchar4 data; - - data.x = color.x; - data.y = color.y; - data.z = color.z; - data.w = color.w; - - switch (order) - { - SWIZZLE_8(target, data) - SWIZZLE_16(target, data) - SWIZZLE_32(target, data) - } - } - case CLK_UNSIGNED_INT16: - { - ushort4 *target = v_target; - ushort4 data; - - data.x = color.x; - data.y = color.y; - data.z = color.z; - data.w = color.w; - - switch (order) - { - SWIZZLE_16(target, data) - SWIZZLE_32(target, data) - } - } - case CLK_UNSIGNED_INT32: - { - uint4 *target = v_target; - - switch (order) - { - SWIZZLE_32(target, color) - } - } - } + __cpu_write_imageui(image, coord.x, coord.y, coord.z, &color); } -#undef SWIZZLE_8 -#undef SWIZZLE_16 -#undef SWIZZLE_32 - int2 OVERLOAD get_image_dim(image2d_t image) { int2 result; |