diff options
Diffstat (limited to 'src/runtime/stdlib.c')
-rw-r--r-- | src/runtime/stdlib.c | 423 |
1 files changed, 214 insertions, 209 deletions
diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c index 7a17cc5..90b994a 100644 --- a/src/runtime/stdlib.c +++ b/src/runtime/stdlib.c @@ -222,231 +222,236 @@ float4 f2f_floor(float4 value) return __builtin_ia32_cvtdq2ps(f2i_floor(value)); } -#define LINEAR_3D(a, b, c, V0, V1, one) \ - (one - a) * (one - b) - (one - c) * \ - read_imagef(image, sampler, \ +#define LINEAR_3D(t_max, suf) \ + (t_max - a) * (t_max - b) - (t_max - c) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 1, 2, 3)) + \ - a * (one - b) * (one - c) * \ - read_imagef(image, sampler, \ + a * (t_max - b) * (t_max - c) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 1, 2, 3)) + \ - (one - a) * b * (one - c) * \ - read_imagef(image, sampler, \ + (t_max - a) * b * (t_max - c) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 5, 2, 3)) + \ - a * b * (one - c) * \ - read_imagef(image, sampler, \ + a * b * (t_max - c) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 5, 2, 3)) + \ - (one - a) * (one - b) * c * \ - read_imagef(image, sampler, \ + (t_max - a) * (t_max - b) * c * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 1, 6, 3)) + \ - a * (one - b) * c * \ - read_imagef(image, sampler, \ + a * (t_max - b) * c * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 1, 6, 3)) + \ - (one - a) * b * c * \ - read_imagef(image, sampler, \ + (t_max - a) * b * c * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 5, 6, 3)) + \ a * b * c * \ - read_imagef(image, sampler, \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 5, 6, 3)) -#define LINEAR_2D(a, b, V0, V1, one) \ - (one - a) * (one - b) * \ - read_imagef(image, sampler, \ +#define LINEAR_2D(t_max, suf) \ + (t_max - a) * (t_max - b) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 1, 2, 2)) + \ - a * (one - b) * \ - read_imagef(image, sampler, \ + a * (t_max - b) * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 1, 2, 2)) + \ - (one - a) * b * \ - read_imagef(image, sampler, \ + (t_max - a) * b * \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 0, 5, 2, 2)) + \ a * b * \ - read_imagef(image, sampler, \ + read_image##suf(image, sampler, \ __builtin_shufflevector(V0, V1, 4, 5, 2, 2)); +#define READ_IMAGE(type, suf, type_max) \ + type##4 result; \ + \ + switch (sampler & 0xf0) \ + { \ + case CLK_ADDRESS_NONE: \ + case CLK_ADDRESS_CLAMP: \ + case CLK_ADDRESS_CLAMP_TO_EDGE: \ + /* Denormalize coords */ \ + if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE) \ + coord *= __builtin_ia32_cvtdq2ps(get_image_dim(image)); \ + \ + switch (sampler & 0xf00) \ + { \ + case CLK_FILTER_NEAREST: \ + { \ + int4 c = f2i_floor(coord); \ + \ + return read_image##suf(image, sampler, c); \ + } \ + case CLK_FILTER_LINEAR: \ + { \ + type a, b, c; \ + \ + coord -= 0.5f; \ + \ + int4 V0, V1; \ + \ + V0 = f2i_floor(coord); \ + V1 = f2i_floor(coord) + 1; \ + \ + coord -= f2f_floor(coord); \ + \ + a = (type)(coord.x * type_max); \ + b = (type)(coord.y * type_max); \ + c = (type)(coord.z * type_max); \ + \ + if (__cpu_is_image_3d(image)) \ + { \ + result = LINEAR_3D(type_max, suf); \ + } \ + else \ + { \ + result = LINEAR_2D(type_max, suf); \ + } \ + } \ + } \ + break; \ + case CLK_ADDRESS_REPEAT: \ + switch (sampler & 0xf00) \ + { \ + case CLK_FILTER_NEAREST: \ + { \ + int4 dim = get_image_dim(image); \ + coord = (coord - f2f_floor(coord)) * \ + __builtin_ia32_cvtdq2ps(dim); \ + \ + int4 c = f2i_floor(coord); \ + \ + /* if (c > dim - 1) c = c - dim */ \ + int4 mask = __builtin_ia32_pcmpgtd128(c, dim - 1); \ + int4 repl = c - dim; \ + c = (repl & mask) | (c & ~mask); \ + \ + return read_image##suf(image, sampler, c); \ + } \ + case CLK_FILTER_LINEAR: \ + { \ + type a, b, c; \ + \ + int4 dim = get_image_dim(image); \ + coord = (coord - f2f_floor(coord)) * \ + __builtin_ia32_cvtdq2ps(dim); \ + \ + float4 tmp = coord; \ + tmp -= 0.5f; \ + tmp -= f2f_floor(tmp); \ + \ + a = (type)(tmp.x * type_max); \ + b = (type)(tmp.y * type_max); \ + c = (type)(tmp.z * type_max); \ + \ + int4 V0, V1; \ + \ + V0 = f2i_floor(coord - 0.5f); \ + V1 = V0 + 1; \ + \ + /* if (0 > V0) V0 = dim + V0 */ \ + int4 zero = 0; \ + int4 mask = __builtin_ia32_pcmpgtd128(zero, V0); \ + int4 repl = dim + V0; \ + V0 = (repl & mask) | (V0 & ~mask); \ + \ + /* if (V1 > dim - 1) V1 = V1 - dim */ \ + mask = __builtin_ia32_pcmpgtd128(V1, dim); \ + repl = V1 - dim; \ + V1 = (repl & mask) | (V0 & ~mask); \ + \ + if (__cpu_is_image_3d(image)) \ + { \ + result = LINEAR_3D(type_max, suf); \ + } \ + else \ + { \ + result = LINEAR_2D(type_max, suf); \ + } \ + } \ + } \ + break; \ + case CLK_ADDRESS_MIRRORED_REPEAT: \ + switch (sampler & 0xf00) \ + { \ + case CLK_FILTER_NEAREST: \ + { \ + int4 dim = get_image_dim(image); \ + float4 two = 2.0f; \ + float4 prim = two * __builtin_ia32_cvtdq2ps( \ + __builtin_ia32_cvtps2dq(0.5f * coord)); \ + prim -= coord; \ + \ + /* abs(x) = x & ~{-0, -0, -0, -0} */ \ + float4 nzeroes = -0.0f; \ + prim = (float4)((int4)prim & ~(int4)nzeroes); \ + \ + coord = prim * __builtin_ia32_cvtdq2ps(dim); \ + int4 c = f2i_floor(coord); \ + \ + /* if (c > dim - 1) c = dim - 1 */ \ + int4 repl = dim - 1; \ + int4 mask = __builtin_ia32_pcmpgtd128(c, repl); \ + c = (repl & mask) | (c & ~mask); \ + \ + return read_image##suf(image, sampler, c); \ + } \ + case CLK_FILTER_LINEAR: \ + { \ + type a, b, c; \ + \ + int4 dim = get_image_dim(image); \ + float4 two = 2.0f; \ + float4 prim = two * __builtin_ia32_cvtdq2ps( \ + __builtin_ia32_cvtps2dq(0.5f * coord)); \ + prim -= coord; \ + \ + /* abs(x) = x & ~{-0, -0, -0, -0} */ \ + float4 nzeroes = -0.0f; \ + prim = (float4)((int4)prim & ~(int4)nzeroes); \ + \ + coord = prim * __builtin_ia32_cvtdq2ps(dim); \ + \ + float4 tmp = coord; \ + tmp -= 0.5f; \ + tmp -= f2f_floor(tmp); \ + \ + a = (type)(tmp.x * type_max); \ + b = (type)(tmp.y * type_max); \ + c = (type)(tmp.z * type_max); \ + \ + int4 V0, V1, zero = 0; \ + \ + V0 = f2i_floor(coord - 0.5f); \ + V1 = V0 + 1; \ + \ + /* if (0 > V0) V0 = 0 */ \ + int4 mask = __builtin_ia32_pcmpgtd128(V0, zero); \ + V0 &= ~mask; \ + \ + /* if (V1 > dim - 1) V1 = dim - 1 */ \ + int4 repl = dim - 1; \ + mask = __builtin_ia32_pcmpgtd128(V1, repl); \ + V1 = (repl & mask) | (V1 & ~mask); \ + \ + if (__cpu_is_image_3d(image)) \ + { \ + result = LINEAR_3D(type_max, suf); \ + } \ + else \ + { \ + result = LINEAR_2D(type_max, suf); \ + } \ + } \ + } \ + break; \ + } \ + \ + return result; + float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord) { - float4 result; - - switch (sampler & 0xf0) - { - case CLK_ADDRESS_NONE: - case CLK_ADDRESS_CLAMP: - case CLK_ADDRESS_CLAMP_TO_EDGE: - // Denormalize coords - if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE) - coord *= __builtin_ia32_cvtdq2ps(get_image_dim(image)); - - switch (sampler & 0xf00) - { - case CLK_FILTER_NEAREST: - { - int4 c = f2i_floor(coord); - - return read_imagef(image, sampler, c); - } - case CLK_FILTER_LINEAR: - { - float a, b, c; - - coord -= 0.5f; - - int4 V0, V1; - - V0 = f2i_floor(coord); - V1 = f2i_floor(coord) + 1; - - coord -= f2f_floor(coord); - - a = coord.x; - b = coord.y; - c = coord.z; - - if (__cpu_is_image_3d(image)) - { - result = LINEAR_3D(a, b, c, V0, V1, 1.0f); - } - else - { - result = LINEAR_2D(a, b, V0, V1, 1.0f); - } - } - } - break; - case CLK_ADDRESS_REPEAT: - switch (sampler & 0xf00) - { - case CLK_FILTER_NEAREST: - { - int4 dim = get_image_dim(image); - coord = (coord - f2f_floor(coord)) * - __builtin_ia32_cvtdq2ps(dim); - - int4 c = f2i_floor(coord); - - // if (c > dim - 1) c = c - dim - int4 mask = __builtin_ia32_pcmpgtd128(c, dim - 1); - int4 repl = c - dim; - c = (repl & mask) | (c & ~mask); - - return read_imagef(image, sampler, c); - } - case CLK_FILTER_LINEAR: - { - float a, b, c; - - int4 dim = get_image_dim(image); - coord = (coord - f2f_floor(coord)) * - __builtin_ia32_cvtdq2ps(dim); - - float4 tmp = coord; - tmp -= 0.5f; - tmp -= f2f_floor(tmp); - - a = tmp.x; - b = tmp.y; - c = tmp.z; - - int4 V0, V1; - - V0 = f2i_floor(coord - 0.5f); - V1 = V0 + 1; - - // if (0 > V0) V0 = dim + V0 - int4 zero = 0; - int4 mask = __builtin_ia32_pcmpgtd128(zero, V0); - int4 repl = dim + V0; - V0 = (repl & mask) | (V0 & ~mask); - - // if (V1 > dim - 1) V1 = V1 - dim - mask = __builtin_ia32_pcmpgtd128(V1, dim); - repl = V1 - dim; - V1 = (repl & mask) | (V0 & ~mask); - - if (__cpu_is_image_3d(image)) - { - result = LINEAR_3D(a, b, c, V0, V1, 1.0f); - } - else - { - result = LINEAR_2D(a, b, V0, V1, 1.0f); - } - } - } - break; - case CLK_ADDRESS_MIRRORED_REPEAT: - switch (sampler & 0xf00) - { - case CLK_FILTER_NEAREST: - { - int4 dim = get_image_dim(image); - float4 two = 2.0f; - float4 prim = two * __builtin_ia32_cvtdq2ps( - __builtin_ia32_cvtps2dq(0.5f * coord)); - prim -= coord; - - // abs(x) = x & ~{-0, -0, -0, -0} - float4 nzeroes = -0.0f; - prim = (float4)((int4)prim & ~(int4)nzeroes); - - coord = prim * __builtin_ia32_cvtdq2ps(dim); - int4 c = f2i_floor(coord); - - // if (c > dim - 1) c = dim - 1 - int4 repl = dim - 1; - int4 mask = __builtin_ia32_pcmpgtd128(c, repl); - c = (repl & mask) | (c & ~mask); - - return read_imagef(image, sampler, c); - } - case CLK_FILTER_LINEAR: - { - float a, b, c; - - int4 dim = get_image_dim(image); - float4 two = 2.0f; - float4 prim = two * __builtin_ia32_cvtdq2ps( - __builtin_ia32_cvtps2dq(0.5f * coord)); - prim -= coord; - - // abs(x) = x & ~{-0, -0, -0, -0} - float4 nzeroes = -0.0f; - prim = (float4)((int4)prim & ~(int4)nzeroes); - - coord = prim * __builtin_ia32_cvtdq2ps(dim); - - float4 tmp = coord; - tmp -= 0.5f; - tmp -= f2f_floor(tmp); - - a = tmp.x; - b = tmp.y; - c = tmp.z; - - int4 V0, V1, zero = 0; - - V0 = f2i_floor(coord - 0.5f); - V1 = V0 + 1; - - // if (0 > V0) V0 = 0 - int4 mask = __builtin_ia32_pcmpgtd128(V0, zero); - V0 &= ~mask; - - // if (V1 > dim - 1) V1 = dim - 1 - int4 repl = dim - 1; - mask = __builtin_ia32_pcmpgtd128(V1, repl); - V1 = (repl & mask) | (V1 & ~mask); - - if (__cpu_is_image_3d(image)) - { - result = LINEAR_3D(a, b, c, V0, V1, 1.0f); - } - else - { - result = LINEAR_2D(a, b, V0, V1, 1.0f); - } - } - } - break; - } + READ_IMAGE(float, f, 1.0f) } #define UNSWIZZLE_8(source, data, m) \ @@ -595,7 +600,7 @@ int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, float2 coord) int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, float4 coord) { - + READ_IMAGE(int, i, 0x7fffffff) } uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, int2 coord) @@ -702,7 +707,7 @@ uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, float2 coord) uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, float4 coord) { - + READ_IMAGE(uint, ui, 0xffffffff) } #undef UNSWIZZLE_8 |