5 files changed, 390 insertions, 303 deletions
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 265ba1f..217e55d 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -371,6 +371,24 @@ static void read_imageuii(uint32_t *result, Image2D *image, int x, int y, int z,
     g_work_group->readImage(result, image, x, y, z, sampler);
 }
 
+static void read_imageff(float *result, Image2D *image, float x, float y,
+                         float z, int32_t sampler)
+{
+    g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageif(int32_t *result, Image2D *image, float x, float y,
+                         float z, int32_t sampler)
+{
+    g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
+static void read_imageuif(uint32_t *result, Image2D *image, float x, float y,
+                          float z, int32_t sampler)
+{
+    g_work_group->readImage(result, image, x, y, z, sampler);
+}
+
 /*
  * Bridge between LLVM and us
  */
@@ -425,6 +443,12 @@ void *getBuiltin(const std::string &name)
         return (void *)&read_imageii;
     else if (name == "__cpu_read_imageuii")
         return (void *)&read_imageuii;
+    else if (name == "__cpu_read_imageff")
+        return (void *)&read_imageff;
+    else if (name == "__cpu_read_imageif")
+        return (void *)&read_imageif;
+    else if (name == "__cpu_read_imageuif")
+        return (void *)&read_imageuif;
 
     else if (name == "debug")
         return (void *)&printf;
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index e9d7124..70e348c 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -51,6 +51,7 @@ class CPUDevice;
 class Kernel;
 class KernelEvent;
 class Image2D;
+class Image3D;
 
 class CPUKernel : public DeviceKernel
 {
@@ -117,6 +118,13 @@ class CPUKernelWorkGroup
         void readImage(uint32_t *result, Image2D *image, int x, int y, int z,
                        uint32_t sampler) const;
 
+        void readImage(float *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+        void readImage(int32_t *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+        void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
+                       uint32_t sampler) const;
+
         void builtinNotFound(const std::string &name) const;
 
     private:
@@ -125,6 +133,16 @@ class CPUKernelWorkGroup
         template<typename T>
         void readImageImplI(T *result, Image2D *image, int x, int y, int z,
                             uint32_t sampler) const;
+        template<typename T>
+        void readImageImplF(T *result, Image2D *image, float x, float y, float z,
+                            uint32_t sampler) const;
+        template<typename T>
+        void linear3D(T *result, float a, float b, float c,
+                       int i0, int j0, int k0, int i1, int j1, int k1,
+                       Image3D *image) const;
+        template<typename T>
+        void linear2D(T *result, float a, float b, float c, int i0, int j0,
+                      int i1, int j1, Image2D *image) const;
 
     private:
         CPUKernel *p_kernel;
diff --git a/src/core/cpu/sampler.cpp b/src/core/cpu/sampler.cpp
index c215cf8..43d9d3b 100644
--- a/src/core/cpu/sampler.cpp
+++ b/src/core/cpu/sampler.cpp
@@ -41,6 +41,7 @@
 #include "builtins.h"
 
 #include <cstdlib>
+#include <cmath>
 #include <immintrin.h>
 
 using namespace Coal;
@@ -54,6 +55,26 @@ static int clamp(int a, int b, int c)
     return (a < b) ? b : ((a > c) ? c : a);
 }
 
+static int min(int a, int b)
+{
+    return (a < b ? a : b);
+}
+
+static int max(int a, int b)
+{
+    return (a > b ? a : b);
+}
+
+static float frac(float x)
+{
+    return x - std::floor(x);
+}
+
+static float round(float x)
+{
+    return (float)(int)x;
+}
+
 static bool handle_address_mode(Image2D *image, int &x, int &y, int &z,
                                 uint32_t sampler)
 {
@@ -94,7 +115,7 @@ static void slow_shuffle4(uint32_t *rs, uint32_t *a, uint32_t *b,
     rs[3] = (w < 4 ? a[w] : b[w - 4]);
 }
 
-static void slow_convert_to_format(void *dest, float *data,
+static void convert_to_format(void *dest, float *data,
                                    cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -121,7 +142,7 @@ static void slow_convert_to_format(void *dest, float *data,
     }
 }
 
-static void slow_convert_from_format(float *data, void *source,
+static void convert_from_format(float *data, void *source,
                                      cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -148,7 +169,7 @@ static void slow_convert_from_format(float *data, void *source,
     }
 }
 
-static void slow_convert_to_format(void *dest, int *data,
+static void convert_to_format(void *dest, int *data,
                                    cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -169,7 +190,7 @@ static void slow_convert_to_format(void *dest, int *data,
     }
 }
 
-static void slow_convert_from_format(int32_t *data, void *source,
+static void convert_from_format(int32_t *data, void *source,
                                      cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -190,7 +211,7 @@ static void slow_convert_from_format(int32_t *data, void *source,
     }
 }
 
-static void slow_convert_to_format(void *dest, uint32_t *data,
+static void convert_to_format(void *dest, uint32_t *data,
                                    cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -211,7 +232,7 @@ static void slow_convert_to_format(void *dest, uint32_t *data,
     }
 }
 
-static void slow_convert_from_format(uint32_t *data, void *source,
+static void convert_from_format(uint32_t *data, void *source,
                                      cl_channel_type type, unsigned int channels)
 {
     // Convert always the four components of source to target
@@ -232,6 +253,81 @@ static void slow_convert_from_format(uint32_t *data, void *source,
     }
 }
 
+template<typename T>
+static void vec4_scalar_mul(T *vec, float val)
+{
+    for (unsigned int i=0; i<4; ++i)
+        vec[i] *= val;
+}
+
+template<typename T>
+static void vec4_add(T *vec1, T *vec2)
+{
+    for (unsigned int i=0; i<4; ++i)
+        vec1[i] += vec2[i];
+}
+
+template<typename T>
+void CPUKernelWorkGroup::linear3D(T *result, float a, float b, float c,
+              int i0, int j0, int k0, int i1, int j1, int k1,
+              Image3D *image) const
+{
+    T accum[4];
+
+    readImageImplI<T>(result, image, i0, j0, k0, 0);
+    vec4_scalar_mul(result, (1.0f - a) * (1.0f - b) * (1.0f - c ));
+
+    readImageImplI<T>(accum, image, i1, j0, k0, 0);
+    vec4_scalar_mul(accum, a * (1.0f - b) * (1.0f - c ));
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i0, j1, k0, 0);
+    vec4_scalar_mul(accum, (1.0f - a) * b * (1.0f - c ));
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i1, j1, k0, 0);
+    vec4_scalar_mul(accum, a * b * (1.0f -c ));
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i0, j0, k1, 0);
+    vec4_scalar_mul(accum, (1.0f - a) * (1.0f - b) * c);
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i1, j0, k1, 0);
+    vec4_scalar_mul(accum, a * (1.0f - b) * c);
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i0, j1, k1, 0);
+    vec4_scalar_mul(accum, (1.0f - a) * b * c);
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i1, j1, k1, 0);
+    vec4_scalar_mul(accum, a * b * c);
+    vec4_add(result, accum);
+}
+
+template<typename T>
+void CPUKernelWorkGroup::linear2D(T *result, float a, float b, float c, int i0, int j0,
+              int i1, int j1, Image2D *image) const
+{
+    T accum[4];
+
+    readImageImplI<T>(result, image, i0, j0, 0, 0);
+    vec4_scalar_mul(result, (1.0f - a) * (1.0f - b));
+
+    readImageImplI<T>(accum, image, i1, j0, 0, 0);
+    vec4_scalar_mul(accum, a * (1.0f - b));
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i0, j1, 0, 0);
+    vec4_scalar_mul(accum, (1.0f - a) * b);
+    vec4_add(result, accum);
+
+    readImageImplI<T>(accum, image, i1, j1, 0, 0);
+    vec4_scalar_mul(accum, a * b);
+    vec4_add(result, accum);
+}
+
 #if __has_builtin(__builtin_shufflevector)
     #define shuffle4(rs, a, b, x, y, z, w) \
         *(__v4sf *)rs = __builtin_shufflevector(*(__v4sf *)a, *(__v4sf *)b, \
@@ -241,12 +337,6 @@ static void slow_convert_from_format(uint32_t *data, void *source,
         slow_shuffle4(rs, a, b, x, y, z, w)
 #endif
 
-    #define convert_to_format(dest, data, type, channels) \
-        slow_convert_to_format(dest, data, type, channels)
-
-    #define convert_from_format(data, source, type, channels) \
-        slow_convert_from_format(data, source, type, channels)
-
 static void swizzle(uint32_t *target, uint32_t *source,
                     cl_channel_order order, bool reading, uint32_t t_max)
 {
@@ -461,3 +551,219 @@ void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, int x, int
 {
     readImageImplI<uint32_t>(result, image, x, y, z, sampler);
 }
+
+template<typename T>
+void CPUKernelWorkGroup::readImageImplF(T *result, Image2D *image, float x,
+                                        float y, float z, uint32_t sampler) const
+{
+    bool is_3d = (image->type() == MemObject::Image3D);
+    Image3D *image3d = (Image3D *)image;
+
+    int w = image->width(),
+        h = image->height(),
+        d = (is_3d ? image3d->depth() : 1);
+
+    switch (sampler & 0xf0)
+    {
+        case CLK_ADDRESS_NONE:
+        case CLK_ADDRESS_CLAMP:
+        case CLK_ADDRESS_CLAMP_TO_EDGE:
+            /* De-normalize coordinates */
+            if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE)
+            {
+                x *= (float)w;
+                y *= (float)h;
+                if (is_3d) z *= (float)d;
+            }
+
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    readImageImplI<T>(result, image, std::floor(x),
+                                      std::floor(y), std::floor(z), sampler);
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c,
+                                    std::floor(x - 0.5f),
+                                    std::floor(y - 0.5f),
+                                    std::floor(z - 0.5f),
+                                    std::floor(x - 0.5f) + 1,
+                                    std::floor(y - 0.5f) + 1,
+                                    std::floor(z - 0.5f) + 1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c,
+                                    std::floor(x - 0.5f),
+                                    std::floor(y - 0.5f),
+                                    std::floor(x - 0.5f) + 1,
+                                    std::floor(y - 0.5f) + 1,
+                                    image);
+                    }
+                }
+            }
+            break;
+        case CLK_ADDRESS_REPEAT:
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    int i, j, k;
+
+                    x = (x - std::floor(x)) * (float)w;
+                    i = std::floor(x);
+                    if (i > w - 1)
+                        i = i - w;
+
+                    y = (y - std::floor(y)) * (float)h;
+                    j = std::floor(y);
+                    if (j > h - 1)
+                        j = j - h;
+
+                    if (is_3d)
+                    {
+                        z = (z - std::floor(z)) * (float)d;
+                        k = std::floor(z);
+                        if (k > d - 1)
+                            k = k - d;
+                    }
+
+                    readImageImplI<T>(result, image, i, j, k, sampler);
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+                    int i0, i1, j0, j1, k0, k1;
+
+                    x = (x - std::floor(x)) * (float)w;
+                    i0 = std::floor(x - 0.5f);
+                    i1 = i0 + 1;
+                    if (i0 < 0)
+                        i0 = w + i0;
+                    if (i1 > w - 1)
+                        i1 = i1 - w;
+
+                    y = (y - std::floor(y)) * (float)h;
+                    j0 = std::floor(y - 0.5f);
+                    j1 = j0 + 1;
+                    if (j0 < 0)
+                        j0 = h + j0;
+                    if (j1 > h - 1)
+                        j1 = j1 - h;
+
+                    if (is_3d)
+                    {
+                        z = (z - std::floor(z)) * (float)d;
+                        k0 = std::floor(z - 0.5f);
+                        k1 = k0 + 1;
+                        if (k0 < 0)
+                            k0 = d + k0;
+                        if (k1 > d - 1)
+                            k1 = k1 - d;
+                    }
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                    }
+                }
+            }
+            break;
+        case CLK_ADDRESS_MIRRORED_REPEAT:
+            switch (sampler & 0xf00)
+            {
+                case CLK_FILTER_NEAREST:
+                {
+                    x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                    y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                    if (is_3d)
+                        z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+
+                    readImageImplI<T>(result, image,
+                                      min(std::floor(x), w - 1),
+                                      min(std::floor(y), h - 1),
+                                      min(std::floor(z), d - 1),
+                                      sampler);
+                }
+                case CLK_FILTER_LINEAR:
+                {
+                    float a, b, c;
+                    int i0, i1, j0, j1, k0, k1;
+
+                    x = std::fabs(x - 2.0f * round(0.5f * x)) * (float)w;
+                    i0 = std::floor(x - 0.5f);
+                    i1 = i0 + 1;
+                    i0 = max(i0, 0);
+                    i1 = min(i1, w - 1);
+
+                    y = std::fabs(y - 2.0f * round(0.5f * y)) * (float)h;
+                    j0 = std::floor(y - 0.5f);
+                    j1 = j0 + 1;
+                    j0 = max(j0, 0);
+                    j1 = min(j1, h - 1);
+
+                    if (is_3d)
+                    {
+                        z = std::fabs(z - 2.0f * round(0.5f * z)) * (float)d;
+                        k0 = std::floor(z - 0.5f);
+                        k1 = k0 + 1;
+                        k0 = max(k0, 0);
+                        k1 = min(k1, d - 1);
+                    }
+
+                    a = frac(x - 0.5f);
+                    b = frac(y - 0.5f);
+                    c = frac(z - 0.5f);
+
+                    if (is_3d)
+                    {
+                        linear3D<T>(result, a, b, c, i0, j0, k0, i1, j1, k1,
+                                    image3d);
+                    }
+                    else
+                    {
+                        linear2D<T>(result, a, b, c, i0, j0, i1, j1, image);
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void CPUKernelWorkGroup::readImage(float *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    readImageImplF<float>(result, image, x, y, z, sampler);
+}
+
+void CPUKernelWorkGroup::readImage(int32_t *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    readImageImplF<int32_t>(result, image, x, y, z, sampler);
+}
+
+void CPUKernelWorkGroup::readImage(uint32_t *result, Image2D *image, float x,
+                                   float y, float z, uint32_t sampler) const
+{
+    readImageImplF<uint32_t>(result, image, x, y, z, sampler);
+}
diff --git a/src/runtime/stdlib.c b/src/runtime/stdlib.c
index 4b6a0c2..0f10f0d 100644
--- a/src/runtime/stdlib.c
+++ b/src/runtime/stdlib.c
@@ -37,11 +37,6 @@ int debug(const char *format, ...);
  * Image functions
  */
 
-int clamp(int a, int b, int c)
-{
-    return (a < b) ? b : ((a > c) ? c : a);
-}
-
 int __cpu_get_image_width(void *image);
 int __cpu_get_image_height(void *image);
 int __cpu_get_image_depth(void *image);
@@ -60,37 +55,12 @@ void __cpu_read_imageii(int4 *result, void *image, int x, int y, int z,
                         sampler_t sampler);
 void __cpu_read_imageuii(uint4 *result, void *image, int x, int y, int z,
                         sampler_t sampler);
-
-int4 handle_address_mode(image3d_t image, int4 coord, sampler_t sampler)
-{
-    coord.w = 0;
-
-    int w = get_image_width(image),
-        h = get_image_height(image),
-        d = get_image_depth(image);
-
-    if ((sampler & 0xf0) ==  CLK_ADDRESS_CLAMP_TO_EDGE)
-    {
-        coord.x = clamp(coord.x, 0, w - 1);
-        coord.y = clamp(coord.y, 0, h - 1);
-        coord.z = clamp(coord.z, 0, d - 1);
-    }
-    else if ((sampler & 0xf0) == CLK_ADDRESS_CLAMP)
-    {
-        coord.x = clamp(coord.x, 0, w);
-        coord.y = clamp(coord.y, 0, h);
-        coord.z = clamp(coord.z, 0, d);
-    }
-
-    if (coord.x == w ||
-        coord.y == h ||
-        coord.z == d)
-    {
-        coord.w = 1;
-    }
-
-    return coord;
-}
+void __cpu_read_imageff(float4 *result, void *image, float x, float y, float z,
+                        sampler_t sampler);
+void __cpu_read_imageif(int4 *result, void *image, float x, float y, float z,
+                        sampler_t sampler);
+void __cpu_read_imageuif(uint4 *result, void *image, float x, float y, float z,
+                        sampler_t sampler);
 
 float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, int2 coord)
 {
@@ -112,257 +82,20 @@ float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, int4 coord)
 
 float4 OVERLOAD read_imagef(image2d_t image, sampler_t sampler, float2 coord)
 {
-    float4 c;
-
-    c.xy = coord;
-    c.zw = 0;
+    float4 rs;
 
-    return read_imagef((image3d_t)image, sampler, c);
-}
+    __cpu_read_imageff(&rs, image, coord.x, coord.y, 0.0f, sampler);
 
-int4 f2i_floor(float4 value)
-{
-    int4 result = __builtin_ia32_cvtps2dq(value);
-    value = __builtin_ia32_psrldi128((int4)value, 31);
-    result -= (int4)value;
-    return result;
+    return rs;
 }
 
-float4 f2f_floor(float4 value)
+float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord)
 {
-    return __builtin_ia32_cvtdq2ps(f2i_floor(value));
-}
+    float4 rs;
 
-#define LINEAR_3D(t_max, suf)                           \
-    (t_max - a) * (t_max - b) - (t_max - c) *           \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 1, 2, 3)) +  \
-    a * (t_max - b) * (t_max - c) *                     \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 1, 2, 3)) +  \
-    (t_max - a) * b * (t_max - c) *                     \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 5, 2, 3)) +  \
-    a * b * (t_max - c) *                               \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 5, 2, 3)) +  \
-    (t_max - a) * (t_max - b) * c *                     \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 1, 6, 3)) +  \
-    a * (t_max - b) * c *                               \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 1, 6, 3)) +  \
-    (t_max - a) * b * c *                               \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 5, 6, 3)) +  \
-    a * b * c *                                         \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 5, 6, 3))
-
-#define LINEAR_2D(t_max, suf)                           \
-    (t_max - a) * (t_max - b) *                         \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 1, 2, 2)) +  \
-    a * (t_max - b) *                                   \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 1, 2, 2)) +  \
-    (t_max - a) * b *                                   \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 0, 5, 2, 2)) +  \
-    a * b *                                             \
-    read_image##suf(image, sampler,                     \
-        __builtin_shufflevector(V0, V1, 4, 5, 2, 2));
-
-#define READ_IMAGE(type, suf, type_max)                                        \
-    type##4 result;                                                            \
-                                                                               \
-    switch (sampler & 0xf0)                                                    \
-    {                                                                          \
-        case CLK_ADDRESS_NONE:                                                 \
-        case CLK_ADDRESS_CLAMP:                                                \
-        case CLK_ADDRESS_CLAMP_TO_EDGE:                                        \
-            /* Denormalize coords */                                           \
-            if ((sampler & 0xf) == CLK_NORMALIZED_COORDS_TRUE)                 \
-                coord *= __builtin_ia32_cvtdq2ps(get_image_dim(image));        \
-                                                                               \
-            switch (sampler & 0xf00)                                           \
-            {                                                                  \
-                case CLK_FILTER_NEAREST:                                       \
-                {                                                              \
-                    int4 c = f2i_floor(coord);                                 \
-                                                                               \
-                    return read_image##suf(image, sampler, c);                 \
-                }                                                              \
-                case CLK_FILTER_LINEAR:                                        \
-                {                                                              \
-                    type a, b, c;                                              \
-                                                                               \
-                    coord -= 0.5f;                                             \
-                                                                               \
-                    int4 V0, V1;                                               \
-                                                                               \
-                    V0 = f2i_floor(coord);                                     \
-                    V1 = f2i_floor(coord) + 1;                                 \
-                                                                               \
-                    coord -= f2f_floor(coord);                                 \
-                                                                               \
-                    a = (type)(coord.x * type_max);                            \
-                    b = (type)(coord.y * type_max);                            \
-                    c = (type)(coord.z * type_max);                            \
-                                                                               \
-                    if (__cpu_is_image_3d(image))                              \
-                    {                                                          \
-                        result = LINEAR_3D(type_max, suf);                     \
-                    }                                                          \
-                    else                                                       \
-                    {                                                          \
-                        result = LINEAR_2D(type_max, suf);                     \
-                    }                                                          \
-                }                                                              \
-            }                                                                  \
-            break;                                                             \
-        case CLK_ADDRESS_REPEAT:                                               \
-            switch (sampler & 0xf00)                                           \
-            {                                                                  \
-                case CLK_FILTER_NEAREST:                                       \
-                {                                                              \
-                    int4 dim = get_image_dim(image);                           \
-                    coord = (coord - f2f_floor(coord)) *                       \
-                                __builtin_ia32_cvtdq2ps(dim);                  \
-                                                                               \
-                    int4 c = f2i_floor(coord);                                 \
-                                                                               \
-                    /* if (c > dim - 1) c = c - dim */                         \
-                    int4 mask = __builtin_ia32_pcmpgtd128(c, dim - 1);         \
-                    int4 repl = c - dim;                                       \
-                    c = (repl & mask) | (c & ~mask);                           \
-                                                                               \
-                    return read_image##suf(image, sampler, c);                 \
-                }                                                              \
-                case CLK_FILTER_LINEAR:                                        \
-                {                                                              \
-                    type a, b, c;                                              \
-                                                                               \
-                    int4 dim = get_image_dim(image);                           \
-                    coord = (coord - f2f_floor(coord)) *                       \
-                                __builtin_ia32_cvtdq2ps(dim);                  \
-                                                                               \
-                    float4 tmp = coord;                                        \
-                    tmp -= 0.5f;                                               \
-                    tmp -= f2f_floor(tmp);                                     \
-                                                                               \
-                    a = (type)(tmp.x * type_max);                              \
-                    b = (type)(tmp.y * type_max);                              \
-                    c = (type)(tmp.z * type_max);                              \
-                                                                               \
-                    int4 V0, V1;                                               \
-                                                                               \
-                    V0 = f2i_floor(coord - 0.5f);                              \
-                    V1 = V0 + 1;                                               \
-                                                                               \
-                    /* if (0 > V0) V0 = dim + V0 */                            \
-                    int4 zero = 0;                                             \
-                    int4 mask = __builtin_ia32_pcmpgtd128(zero, V0);           \
-                    int4 repl = dim + V0;                                      \
-                    V0 = (repl & mask) | (V0 & ~mask);                         \
-                                                                               \
-                    /* if (V1 > dim - 1) V1 = V1 - dim */                      \
-                    mask = __builtin_ia32_pcmpgtd128(V1, dim);                 \
-                    repl = V1 - dim;                                           \
-                    V1 = (repl & mask) | (V0 & ~mask);                         \
-                                                                               \
-                    if (__cpu_is_image_3d(image))                              \
-                    {                                                          \
-                        result = LINEAR_3D(type_max, suf);                     \
-                    }                                                          \
-                    else                                                       \
-                    {                                                          \
-                        result = LINEAR_2D(type_max, suf);                     \
-                    }                                                          \
-                }                                                              \
-            }                                                                  \
-            break;                                                             \
-        case CLK_ADDRESS_MIRRORED_REPEAT:                                      \
-            switch (sampler & 0xf00)                                           \
-            {                                                                  \
-                case CLK_FILTER_NEAREST:                                       \
-                {                                                              \
-                    int4 dim = get_image_dim(image);                           \
-                    float4 two = 2.0f;                                         \
-                    float4 prim = two * __builtin_ia32_cvtdq2ps(               \
-                        __builtin_ia32_cvtps2dq(0.5f * coord));                \
-                    prim -= coord;                                             \
-                                                                               \
-                    /* abs(x) = x & ~{-0, -0, -0, -0} */                       \
-                    float4 nzeroes = -0.0f;                                    \
-                    prim = (float4)((int4)prim & ~(int4)nzeroes);              \
-                                                                               \
-                    coord = prim * __builtin_ia32_cvtdq2ps(dim);               \
-                    int4 c = f2i_floor(coord);                                 \
-                                                                               \
-                    /* if (c > dim - 1) c = dim - 1 */                         \
-                    int4 repl = dim - 1;                                       \
-                    int4 mask = __builtin_ia32_pcmpgtd128(c, repl);            \
-                    c = (repl & mask) | (c & ~mask);                           \
-                                                                               \
-                    return read_image##suf(image, sampler, c);                 \
-                }                                                              \
-                case CLK_FILTER_LINEAR:                                        \
-                {                                                              \
-                    type a, b, c;                                              \
-                                                                               \
-                    int4 dim = get_image_dim(image);                           \
-                    float4 two = 2.0f;                                         \
-                    float4 prim = two * __builtin_ia32_cvtdq2ps(               \
-                        __builtin_ia32_cvtps2dq(0.5f * coord));                \
-                    prim -= coord;                                             \
-                                                                               \
-                    /* abs(x) = x & ~{-0, -0, -0, -0} */                       \
-                    float4 nzeroes = -0.0f;                                    \
-                    prim = (float4)((int4)prim & ~(int4)nzeroes);              \
-                                                                               \
-                    coord = prim * __builtin_ia32_cvtdq2ps(dim);               \
-                                                                               \
-                    float4 tmp = coord;                                        \
-                    tmp -= 0.5f;                                               \
-                    tmp -= f2f_floor(tmp);                                     \
-                                                                               \
-                    a = (type)(tmp.x * type_max);                              \
-                    b = (type)(tmp.y * type_max);                              \
-                    c = (type)(tmp.z * type_max);                              \
-                                                                               \
-                    int4 V0, V1, zero = 0;                                     \
-                                                                               \
-                    V0 = f2i_floor(coord - 0.5f);                              \
-                    V1 = V0 + 1;                                               \
-                                                                               \
-                    /* if (0 > V0) V0 = 0 */                                   \
-                    int4 mask = __builtin_ia32_pcmpgtd128(V0, zero);           \
-                    V0 &= ~mask;                                               \
-                                                                               \
-                    /* if (V1 > dim - 1) V1 = dim - 1 */                       \
-                    int4 repl = dim - 1;                                       \
-                    mask = __builtin_ia32_pcmpgtd128(V1, repl);                \
-                    V1 = (repl & mask) | (V1 & ~mask);                         \
-                                                                               \
-                    if (__cpu_is_image_3d(image))                              \
-                    {                                                          \
-                        result = LINEAR_3D(type_max, suf);                     \
-                    }                                                          \
-                    else                                                       \
-                    {                                                          \
-                        result = LINEAR_2D(type_max, suf);                     \
-                    }                                                          \
-                }                                                              \
-            }                                                                  \
-            break;                                                             \
-    }                                                                          \
-                                                                               \
-    return result;
+    __cpu_read_imageff(&rs, image, coord.x, coord.y, coord.z, sampler);
 
-float4 OVERLOAD read_imagef(image3d_t image, sampler_t sampler, float4 coord)
-{
-    READ_IMAGE(float, f, 1.0f)
+    return rs;
 }
 
 int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, int2 coord)
@@ -385,17 +118,20 @@ int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, int4 coord)
 
 int4 OVERLOAD read_imagei(image2d_t image, sampler_t sampler, float2 coord)
 {
-    float4 c;
+    int4 rs;
 
-    c.xy = coord;
-    c.zw = 0;
+    __cpu_read_imageif(&rs, image, coord.x, coord.y, 0.0f, sampler);
 
-    return read_imagei((image3d_t)image, sampler, c);
+    return rs;
 }
 
 int4 OVERLOAD read_imagei(image3d_t image, sampler_t sampler, float4 coord)
 {
-    READ_IMAGE(int, i, 0x7fffffff)
+    int4 rs;
+
+    __cpu_read_imageif(&rs, image, coord.x, coord.y, coord.z, sampler);
+
+    return rs;
 }
 
 uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, int2 coord)
@@ -418,17 +154,20 @@ uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, int4 coord)
 
 uint4 OVERLOAD read_imageui(image2d_t image, sampler_t sampler, float2 coord)
 {
-    float4 c;
+    uint4 rs;
 
-    c.xy = coord;
-    c.zw = 0;
+    __cpu_read_imageuif(&rs, image, coord.x, coord.y, 0.0f, sampler);
 
-    return read_imageui((image3d_t)image, sampler, c);
+    return rs;
 }
 
 uint4 OVERLOAD read_imageui(image3d_t image, sampler_t sampler, float4 coord)
 {
-    READ_IMAGE(uint, ui, 0xffffffff)
+    uint4 rs;
+
+    __cpu_read_imageuif(&rs, image, coord.x, coord.y, coord.z, sampler);
+
+    return rs;
 }
 
 void OVERLOAD write_imagef(image2d_t image, int2 coord, float4 color)
diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp
index 7dd35d8..94e836e 100644
--- a/tests/test_builtins.cpp
+++ b/tests/test_builtins.cpp
@@ -91,7 +91,7 @@ const char image_source[] =
     "\n"
     "   float2 fcoords;\n"
     "   fcoords.x = 0.31f;\n"
-    "   fcoords.y = 3.1415f;\n"
+    "   fcoords.y = 0.1415f;\n"
     "   fcolor = read_imagef(image3, sampler, fcoords);\n"
     "}\n";