diff options
-rw-r--r-- | src/core/cpu/builtins.cpp | 24 | ||||
-rw-r--r-- | src/core/cpu/kernel.h | 18 | ||||
-rw-r--r-- | src/core/cpu/sampler.cpp | 330 | ||||
-rw-r--r-- | src/runtime/stdlib.c | 319 | ||||
-rw-r--r-- | tests/test_builtins.cpp | 2 |
5 files changed, 390 insertions, 303 deletions
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp index 265ba1f..217e55d 100644 --- a/src/core/cpu/builtins.cpp +++ b/src/core/cpu/builtins.cpp @@ -371,6 +371,24 @@ static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z, g_work_group->readImage(result, image, x, y, z, sampler); } +static void read_imageff(float *result, Image2D *image, float x, float y, + float z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageif(int32_t *result, Image2D *image, float x, float y, + float z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + +static void read_imageuif(uint32_t *result, Image2D *image, float x, float y, + float z, int32_t sampler) +{ + g_work_group->readImage(result, image, x, y, z, sampler); +} + /* * Bridge between LLVM and us */ @@ -425,6 +443,12 @@ void *getBuiltin(const std::string &name) return (void *)&read_imageii; else if (name == "__cpu_read_imageuii") return (void *)&read_imageuii; + else if (name == "__cpu_read_imageff") + return (void *)&read_imageff; + else if (name == "__cpu_read_imageif") + return (void *)&read_imageif; + else if (name == "__cpu_read_imageuif") + return (void *)&read_imageuif; else if (name == "debug") return (void *)&printf; diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index e9d7124..70e348c 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -51,6 +51,7 @@ class CPUDevice; class Kernel; class KernelEvent; class Image2D; +class Image3D; class CPUKernel : public DeviceKernel { @@ -117,6 +118,13 @@ class CPUKernelWorkGroup void readImage(uint32_t *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; + void readImage(float *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(int32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void readImage(uint32_t *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + void builtinNotFound(const std::string &name) const; private: @@ -125,6 +133,16 @@ class CPUKernelWorkGroup template<typename T> void readImageImplI(T *result, Image2D *image, int x, int y, int z, uint32_t sampler) const; + template<typename T> + void readImageImplF(T *result, Image2D *image, float x, float y, float z, + uint32_t sampler) const; + template<typename T> + void linear3D(T *result, float a, float b, float c, + int i0, int j0, int k0, int i1, int j1, int k1, + Image3D *image) const; + template<typename T> + void linear2D(T *result, float a, float b, float c, int i0, int j0, + int i1, int j1, Image2D *image) const; private: CPUKernel *p_kernel; diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp index c215cf8..43d9d3b 100644 --- a/src/core/cpu/sampler.cpp +++ b/src/core/cpu/sampler.cpp @@ -41,6 +41,7 @@ #include "builtins.h" #include <cstdlib> +#include <cmath> #include <immintrin.h> using namespace Coal; @@ -54,6 +55,26 @@ static int clamp(int a, int b, int c) return (a < b) ? b : ((a > c) ? c : a); } +static int min(int a, int b) +{ + return (a < b ? a : b); +} + +static int max(int a, int b) +{ + return (a > b ? a : b); +} + +static float frac(float x) +{ + return x - std::floor(x); +} + +static float round(float x) +{ + return (float)(int)x; +} + static bool handle_address_mode(Image2D *image, int &x, int &y, int &z, uint32_t sampler) { @@ -94,7 +115,7 @@ static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b, rs[3] = (w < 4 ? a[w] : b[w - 4]); } -static void slow_convert_to_format(void *dest, float *data, +static void convert_to_format(void *dest, float *data, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -121,7 +142,7 @@ static void slow_convert_to_format(void *dest, float *data, } } -static void slow_convert_from_format(float *data, void *source, +static void convert_from_format(float *data, void *source, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -148,7 +169,7 @@ static void slow_convert_from_format(float *data, void *source, } } -static void slow_convert_to_format(void *dest, int *data, +static void convert_to_format(void *dest, int *data, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -169,7 +190,7 @@ static void slow_convert_to_format(void *dest, int *data, } } -static void slow_convert_from_format(int32_t *data, void *source, +static void convert_from_format(int32_t *data, void *source, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -190,7 +211,7 @@ static void slow_convert_from_format(int32_t *data, void *source, } } -static void slow_convert_to_format(void *dest, uint32_t *data, +static void convert_to_format(void *dest, uint32_t *data, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -211,7 +232,7 @@ static void slow_convert_to_format(void *dest, uint32_t *data, } } -static void slow_convert_from_format(uint32_t *data, void *source, +static void convert_from_format(uint32_t *data, void *source, cl_channel_type type, unsigned int channels) { // Convert always the four components of source to target @@ -232,6 +253,81 @@ static void slow_convert_from_format(uint32_t *data, void *source, } } +template<typename T> +static void vec4_scalar_mul(T *vec, float val) +{ + for (unsigned int i=0; i<4; ++i) + vec[i] *= val; +} + +template<typename T> +static void vec4_add(T *vec1, T *vec2) +{ + for (unsigned int i=0; i<4; ++i) + vec1[i] += vec2[i]; +} + +template<typename T> +void CPUKernelWorkGroup::linear3D(T *result, float a, float b, float c, + int i0, int j0, int k0, int i1, int j1, int k1, + Image3D *image) const +{ + T accum[4]; + + readImageImplI<T>(result, image, i0, j0, k0, 0); + vec4_scalar_mul(result, (1.0f - a) * (1.0f - b) * (1.0f - c )); + + readImageImplI<T>(accum, image, i1, j0, k0, 0); + vec4_scalar_mul(accum, a * (1.0f - b) * (1.0f - c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, k0, 0); + vec4_scalar_mul(accum, (1.0f - a) * b * (1.0f - c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, k0, 0); + vec4_scalar_mul(accum, a * b * (1.0f -c )); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j0, k1, 0); + vec4_scalar_mul(accum, (1.0f - a) * (1.0f - b) * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j0, k1, 0); + vec4_scalar_mul(accum, a * (1.0f - b) * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, k1, 0); + vec4_scalar_mul(accum, (1.0f - a) * b * c); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, k1, 0); + vec4_scalar_mul(accum, a * b * c); + vec4_add(result, accum); +} + +template<typename T> +void CPUKernelWorkGroup::linear2D(T *result, float a, float b, float c, int i0, int j0, + int i1, int j1, Image2D *image) const +{ + T accum[4]; + + readImageImplI<T>(result, image, i0, j0, 0, 0); + vec4_scalar_mul(result, (1.0f - a) * (1.0f - b)); + + readImageImplI<T>(accum, image, i1, j0, 0, 0); + vec4_scalar_mul(accum, a * (1.0f - b)); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i0, j1, 0, 0); + vec4_scalar_mul(accum, (1.0f - a) * b); + vec4_add(result, accum); + + readImageImplI<T>(accum, image, i1, j1, 0, 0); + vec4_scalar_mul(accum, a * b); + vec4_add(result, accum); +} + #if __has_builtin(__builtin_shufflevector) #define shuffle4(rs, a, b, x, y, z, w) \ *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \ @@ -241,12 +337,6 @@ static void slow_convert_from_format(uint32_t *data, void *source, slow_shuffle4(rs, a, b, x, y, z, w) #endif - #define convert_to_format(dest, data, type, channels) \ - slow_convert_to_format(dest, data, type, channels) - - #define convert_from_format(data, source, type, channels) \ - slow_convert_from_format(data, source, type, channels) - static void swizzle(uint32_t *target, uint32_t *source, cl_channel_order order, bool reading, uint32_t t_max) { @@ -461,3 +551,219 @@ void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, int x, int { readImageImplI<uint32_t>(result, image, x, y, z, sampler); } + +template<typename T> +void CPUKernelWorkGroup::readImageImplF(T *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + bool is_3d = (image->type() == MemObject::Image3D); + Image3D *image3d = (Image3D *)image; + + int w = image->width(), + h = image->height(), + d = (is_3d ? image3d->depth() : 1); + + switch (sampler & 0xf0) + { + case CLK_ADDRESS_NONE: + case CLK_ADDRESS_CLAMP: + case CLK_ADDRESS_CLAMP_TO_EDGE: + /* De-normalize coordinates */ + if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE) + { + x *= (float)w; + y *= (float)h; + if (is_3d) z *= (float)d; + } + + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + readImageImplI<T>(result, image, std::floor(x), + std::floor(y), std::floor(z), sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + + a = frac(x - 0.5f); + b = frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, + std::floor(x - 0.5f), + std::floor(y - 0.5f), + std::floor(z - 0.5f), + std::floor(x - 0.5f) + 1, + std::floor(y - 0.5f) + 1, + std::floor(z - 0.5f) + 1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, + std::floor(x - 0.5f), + std::floor(y - 0.5f), + std::floor(x - 0.5f) + 1, + std::floor(y - 0.5f) + 1, + image); + } + } + } + break; + case CLK_ADDRESS_REPEAT: + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + int i, j, k; + + x = (x - std::floor(x)) * (float)w; + i = std::floor(x); + if (i > w - 1) + i = i - w; + + y = (y - std::floor(y)) * (float)h; + j = std::floor(y); + if (j > h - 1) + j = j - h; + + if (is_3d) + { + z = (z - std::floor(z)) * (float)d; + k = std::floor(z); + if (k > d - 1) + k = k - d; + } + + readImageImplI<T>(result, image, i, j, k, sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + int i0, i1, j0, j1, k0, k1; + + x = (x - std::floor(x)) * (float)w; + i0 = std::floor(x - 0.5f); + i1 = i0 + 1; + if (i0 < 0) + i0 = w + i0; + if (i1 > w - 1) + i1 = i1 - w; + + y = (y - std::floor(y)) * (float)h; + j0 = std::floor(y - 0.5f); + j1 = j0 + 1; + if (j0 < 0) + j0 = h + j0; + if (j1 > h - 1) + j1 = j1 - h; + + if (is_3d) + { + z = (z - std::floor(z)) * (float)d; + k0 = std::floor(z - 0.5f); + k1 = k0 + 1; + if (k0 < 0) + k0 = d + k0; + if (k1 > d - 1) + k1 = k1 - d; + } + + a = frac(x - 0.5f); + b = frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, i0, j0, i1, j1, image); + } + } + } + break; + case CLK_ADDRESS_MIRRORED_REPEAT: + switch (sampler & 0xf00) + { + case CLK_FILTER_NEAREST: + { + x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w; + y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h; + if (is_3d) + z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d; + + readImageImplI<T>(result, image, + min(std::floor(x), w - 1), + min(std::floor(y), h - 1), + min(std::floor(z), d - 1), + sampler); + } + case CLK_FILTER_LINEAR: + { + float a, b, c; + int i0, i1, j0, j1, k0, k1; + + x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w; + i0 = std::floor(x - 0.5f); + i1 = i0 + 1; + i0 = max(i0, 0); + i1 = min(i1, w - 1); + + y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h; + j0 = std::floor(y - 0.5f); + j1 = j0 + 1; + j0 = max(j0, 0); + j1 = min(j1, h - 1); + + if (is_3d) + { + z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d; + k0 = std::floor(z - 0.5f); + k1 = k0 + 1; + k0 = max(k0, 0); + k1 = min(k1, d - 1); + } + + a = frac(x - 0.5f); + b = frac(y - 0.5f); + c = frac(z - 0.5f); + + if (is_3d) + { + linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1, + image3d); + } + else + { + linear2D<T>(result, a, b, c, i0, j0, i1, j1, image); + } + } + } + break; + } +} + +void CPUKernelWorkGroup::readImage(float *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<float>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<int32_t>(result, image, x, y, z, sampler); +} + +void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, float x, + float y, float z, uint32_t sampler) const +{ + readImageImplF<uint32_t>(result, image, x, y, z, sampler); +} diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c index 4b6a0c2..0f10f0d 100644 --- a/src/runtime/stdlib.c +++ b/src/runtime/stdlib.c @@ -37,11 +37,6 @@ int debug(const char *format, ...); * Image functions */ -int clamp(int a, int b, int c) -{ - return (a < b) ? b : ((a > c) ? c : a); -} - int __cpu_get_image_width(void *image); int __cpu_get_image_height(void *image); int __cpu_get_image_depth(void *image); @@ -60,37 +55,12 @@ void __cpu_read_imageii(int4 *result, void *image, int x, int y, int z, sampler_t sampler); void __cpu_read_imageuii(uint4 *result, void *image, int x, int y, int z, sampler_t sampler); - -int4 handle_address_mode(image3d_t image, int4 coord, sampler_t sampler) -{ - coord.w = 0; - - int w = get_image_width(image), - h = get_image_height(image), - d = get_image_depth(image); - - if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP_TO_EDGE) - { - coord.x = clamp(coord.x, 0, w - 1); - coord.y = clamp(coord.y, 0, h - 1); - coord.z = clamp(coord.z, 0, d - 1); - } - else if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP) - { - coord.x = clamp(coord.x, 0, w); - coord.y = clamp(coord.y, 0, h); - coord.z = clamp(coord.z, 0, d); - } - - if (coord.x == w || - coord.y == h || - coord.z == d) - { - coord.w = 1; - } - - return coord; -} +void __cpu_read_imageff(float4 *result, void *image, float x, float y, float z, + sampler_t sampler); +void __cpu_read_imageif(int4 *result, void *image, float x, float y, float z, + sampler_t sampler); +void __cpu_read_imageuif(uint4 *result, void *image, float x, float y, float z, + sampler_t sampler); float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, int2 coord) { @@ -112,257 +82,20 @@ float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, int4 coord) float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, float2 coord) { - float4 c; - - c.xy = coord; - c.zw = 0; + float4 rs; - return read_imagef((image3d_t)image, sampler, c); -} + __cpu_read_imageff(&rs, image, coord.x, coord.y, 0.0f, sampler); -int4 f2i_floor(float4 value) -{ - int4 result = __builtin_ia32_cvtps2dq(value); - value = __builtin_ia32_psrldi128((int4)value, 31); - result -= (int4)value; - return result; + return rs; } -float4 f2f_floor(float4 value) +float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord) { - return __builtin_ia32_cvtdq2ps(f2i_floor(value)); -} + float4 rs; -#define LINEAR_3D(t_max, suf) \ - (t_max - a) * (t_max - b) - (t_max - c) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 1, 2, 3)) + \ - a * (t_max - b) * (t_max - c) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 1, 2, 3)) + \ - (t_max - a) * b * (t_max - c) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 5, 2, 3)) + \ - a * b * (t_max - c) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 5, 2, 3)) + \ - (t_max - a) * (t_max - b) * c * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 1, 6, 3)) + \ - a * (t_max - b) * c * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 1, 6, 3)) + \ - (t_max - a) * b * c * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 5, 6, 3)) + \ - a * b * c * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 5, 6, 3)) - -#define LINEAR_2D(t_max, suf) \ - (t_max - a) * (t_max - b) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 1, 2, 2)) + \ - a * (t_max - b) * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 1, 2, 2)) + \ - (t_max - a) * b * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 0, 5, 2, 2)) + \ - a * b * \ - read_image##suf(image, sampler, \ - __builtin_shufflevector(V0, V1, 4, 5, 2, 2)); - -#define READ_IMAGE(type, suf, type_max) \ - type##4 result; \ - \ - switch (sampler & 0xf0) \ - { \ - case CLK_ADDRESS_NONE: \ - case CLK_ADDRESS_CLAMP: \ - case CLK_ADDRESS_CLAMP_TO_EDGE: \ - /* Denormalize coords */ \ - if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE) \ - coord *= __builtin_ia32_cvtdq2ps(get_image_dim(image)); \ - \ - switch (sampler & 0xf00) \ - { \ - case CLK_FILTER_NEAREST: \ - { \ - int4 c = f2i_floor(coord); \ - \ - return read_image##suf(image, sampler, c); \ - } \ - case CLK_FILTER_LINEAR: \ - { \ - type a, b, c; \ - \ - coord -= 0.5f; \ - \ - int4 V0, V1; \ - \ - V0 = f2i_floor(coord); \ - V1 = f2i_floor(coord) + 1; \ - \ - coord -= f2f_floor(coord); \ - \ - a = (type)(coord.x * type_max); \ - b = (type)(coord.y * type_max); \ - c = (type)(coord.z * type_max); \ - \ - if (__cpu_is_image_3d(image)) \ - { \ - result = LINEAR_3D(type_max, suf); \ - } \ - else \ - { \ - result = LINEAR_2D(type_max, suf); \ - } \ - } \ - } \ - break; \ - case CLK_ADDRESS_REPEAT: \ - switch (sampler & 0xf00) \ - { \ - case CLK_FILTER_NEAREST: \ - { \ - int4 dim = get_image_dim(image); \ - coord = (coord - f2f_floor(coord)) * \ - __builtin_ia32_cvtdq2ps(dim); \ - \ - int4 c = f2i_floor(coord); \ - \ - /* if (c > dim - 1) c = c - dim */ \ - int4 mask = __builtin_ia32_pcmpgtd128(c, dim - 1); \ - int4 repl = c - dim; \ - c = (repl & mask) | (c & ~mask); \ - \ - return read_image##suf(image, sampler, c); \ - } \ - case CLK_FILTER_LINEAR: \ - { \ - type a, b, c; \ - \ - int4 dim = get_image_dim(image); \ - coord = (coord - f2f_floor(coord)) * \ - __builtin_ia32_cvtdq2ps(dim); \ - \ - float4 tmp = coord; \ - tmp -= 0.5f; \ - tmp -= f2f_floor(tmp); \ - \ - a = (type)(tmp.x * type_max); \ - b = (type)(tmp.y * type_max); \ - c = (type)(tmp.z * type_max); \ - \ - int4 V0, V1; \ - \ - V0 = f2i_floor(coord - 0.5f); \ - V1 = V0 + 1; \ - \ - /* if (0 > V0) V0 = dim + V0 */ \ - int4 zero = 0; \ - int4 mask = __builtin_ia32_pcmpgtd128(zero, V0); \ - int4 repl = dim + V0; \ - V0 = (repl & mask) | (V0 & ~mask); \ - \ - /* if (V1 > dim - 1) V1 = V1 - dim */ \ - mask = __builtin_ia32_pcmpgtd128(V1, dim); \ - repl = V1 - dim; \ - V1 = (repl & mask) | (V0 & ~mask); \ - \ - if (__cpu_is_image_3d(image)) \ - { \ - result = LINEAR_3D(type_max, suf); \ - } \ - else \ - { \ - result = LINEAR_2D(type_max, suf); \ - } \ - } \ - } \ - break; \ - case CLK_ADDRESS_MIRRORED_REPEAT: \ - switch (sampler & 0xf00) \ - { \ - case CLK_FILTER_NEAREST: \ - { \ - int4 dim = get_image_dim(image); \ - float4 two = 2.0f; \ - float4 prim = two * __builtin_ia32_cvtdq2ps( \ - __builtin_ia32_cvtps2dq(0.5f * coord)); \ - prim -= coord; \ - \ - /* abs(x) = x & ~{-0, -0, -0, -0} */ \ - float4 nzeroes = -0.0f; \ - prim = (float4)((int4)prim & ~(int4)nzeroes); \ - \ - coord = prim * __builtin_ia32_cvtdq2ps(dim); \ - int4 c = f2i_floor(coord); \ - \ - /* if (c > dim - 1) c = dim - 1 */ \ - int4 repl = dim - 1; \ - int4 mask = __builtin_ia32_pcmpgtd128(c, repl); \ - c = (repl & mask) | (c & ~mask); \ - \ - return read_image##suf(image, sampler, c); \ - } \ - case CLK_FILTER_LINEAR: \ - { \ - type a, b, c; \ - \ - int4 dim = get_image_dim(image); \ - float4 two = 2.0f; \ - float4 prim = two * __builtin_ia32_cvtdq2ps( \ - __builtin_ia32_cvtps2dq(0.5f * coord)); \ - prim -= coord; \ - \ - /* abs(x) = x & ~{-0, -0, -0, -0} */ \ - float4 nzeroes = -0.0f; \ - prim = (float4)((int4)prim & ~(int4)nzeroes); \ - \ - coord = prim * __builtin_ia32_cvtdq2ps(dim); \ - \ - float4 tmp = coord; \ - tmp -= 0.5f; \ - tmp -= f2f_floor(tmp); \ - \ - a = (type)(tmp.x * type_max); \ - b = (type)(tmp.y * type_max); \ - c = (type)(tmp.z * type_max); \ - \ - int4 V0, V1, zero = 0; \ - \ - V0 = f2i_floor(coord - 0.5f); \ - V1 = V0 + 1; \ - \ - /* if (0 > V0) V0 = 0 */ \ - int4 mask = __builtin_ia32_pcmpgtd128(V0, zero); \ - V0 &= ~mask; \ - \ - /* if (V1 > dim - 1) V1 = dim - 1 */ \ - int4 repl = dim - 1; \ - mask = __builtin_ia32_pcmpgtd128(V1, repl); \ - V1 = (repl & mask) | (V1 & ~mask); \ - \ - if (__cpu_is_image_3d(image)) \ - { \ - result = LINEAR_3D(type_max, suf); \ - } \ - else \ - { \ - result = LINEAR_2D(type_max, suf); \ - } \ - } \ - } \ - break; \ - } \ - \ - return result; + __cpu_read_imageff(&rs, image, coord.x, coord.y, coord.z, sampler); -float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord) -{ - READ_IMAGE(float, f, 1.0f) + return rs; } int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, int2 coord) @@ -385,17 +118,20 @@ int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, int4 coord) int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, float2 coord) { - float4 c; + int4 rs; - c.xy = coord; - c.zw = 0; + __cpu_read_imageif(&rs, image, coord.x, coord.y, 0.0f, sampler); - return read_imagei((image3d_t)image, sampler, c); + return rs; } int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, float4 coord) { - READ_IMAGE(int, i, 0x7fffffff) + int4 rs; + + __cpu_read_imageif(&rs, image, coord.x, coord.y, coord.z, sampler); + + return rs; } uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, int2 coord) @@ -418,17 +154,20 @@ uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, int4 coord) uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, float2 coord) { - float4 c; + uint4 rs; - c.xy = coord; - c.zw = 0; + __cpu_read_imageuif(&rs, image, coord.x, coord.y, 0.0f, sampler); - return read_imageui((image3d_t)image, sampler, c); + return rs; } uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, float4 coord) { - READ_IMAGE(uint, ui, 0xffffffff) + uint4 rs; + + __cpu_read_imageuif(&rs, image, coord.x, coord.y, coord.z, sampler); + + return rs; } void OVERLOAD write_imagef(image2d_t image, int2 coord, float4 color) diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp index 7dd35d8..94e836e 100644 --- a/tests/test_builtins.cpp +++ b/tests/test_builtins.cpp @@ -91,7 +91,7 @@ const char image_source[] = "\n" " float2 fcoords;\n" " fcoords.x = 0.31f;\n" - " fcoords.y = 3.1415f;\n" + " fcoords.y = 0.1415f;\n" " fcolor = read_imagef(image3, sampler, fcoords);\n" "}\n"; |