summaryrefslogtreecommitdiff
path: root/opencl
diff options
context:
space:
mode:
Diffstat (limited to 'opencl')
-rw-r--r--opencl/stretch-contrast.cl63
-rw-r--r--opencl/stretch-contrast.cl.h63
2 files changed, 64 insertions, 62 deletions
diff --git a/opencl/stretch-contrast.cl b/opencl/stretch-contrast.cl
index 83f7ff9d..cda7b6d8 100644
--- a/opencl/stretch-contrast.cl
+++ b/opencl/stretch-contrast.cl
@@ -17,21 +17,21 @@
*/
-__kernel void init_stretch (__global float *out_min,
- __global float *out_max)
+__kernel void init_stretch (__global float4 *out_min,
+ __global float4 *out_max)
{
int gid = get_global_id (0);
- out_min[gid] = FLT_MAX;
- out_max[gid] = -FLT_MAX;
+ out_min[gid] = (float4)( FLT_MAX);
+ out_max[gid] = (float4)(-FLT_MAX);
}
__kernel void two_stages_local_min_max_reduce (__global const float4 *in,
- __global float *out_min,
- __global float *out_max,
- __local float *aux_min,
- __local float *aux_max,
- int n_pixels)
+ __global float4 *out_min,
+ __global float4 *out_max,
+ __local float4 *aux_min,
+ __local float4 *aux_max,
+ int n_pixels)
{
int gid = get_global_id(0);
int gsize = get_global_size(0);
@@ -40,20 +40,20 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in,
float4 min_v = (float4)( FLT_MAX);
float4 max_v = (float4)(-FLT_MAX);
float4 in_v;
- float aux0, aux1;
+ float4 aux0, aux1;
int it;
/* Loop sequentially over chunks of input vector */
for (it = gid; it < n_pixels; it += gsize)
{
in_v = in[it];
- min_v = fmin (min_v, in_v);
- max_v = fmax (max_v, in_v);
+ min_v = min (min_v, in_v);
+ max_v = max (max_v, in_v);
}
/* Perform parallel reduction */
- aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z);
- aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z);
+ aux_min[lid] = min_v;
+ aux_max[lid] = max_v;
barrier (CLK_LOCAL_MEM_FENCE);
@@ -63,11 +63,11 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in,
{
aux0 = aux_min[lid + it];
aux1 = aux_min[lid];
- aux_min[lid] = fmin (aux0, aux1);
+ aux_min[lid] = min (aux0, aux1);
aux0 = aux_max[lid + it];
aux1 = aux_max[lid];
- aux_max[lid] = fmax (aux0, aux1);
+ aux_max[lid] = max (aux0, aux1);
}
barrier (CLK_LOCAL_MEM_FENCE);
}
@@ -85,21 +85,21 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in,
int nb_wg = gsize / lsize;
for (it = nb_wg; it < lsize; it++)
{
- out_min[it] = FLT_MAX;
- out_max[it] = -FLT_MAX;
+ out_min[it] = (float4)( FLT_MAX);
+ out_max[it] = (float4)(-FLT_MAX);
}
}
}
-__kernel void global_min_max_reduce (__global float *in_min,
- __global float *in_max,
- __global float *out_min_max)
+__kernel void global_min_max_reduce (__global float4 *in_min,
+ __global float4 *in_max,
+ __global float4 *out_min_max)
{
- int gid = get_global_id(0);
- int lid = get_local_id(0);
- int lsize = get_local_size(0);
- float aux0, aux1;
- int it;
+ int gid = get_global_id(0);
+ int lid = get_local_id(0);
+ int lsize = get_local_size(0);
+ float4 aux0, aux1;
+ int it;
/* Perform parallel reduction */
for (it = lsize / 2; it > 0; it >>= 1)
@@ -108,11 +108,11 @@ __kernel void global_min_max_reduce (__global float *in_min,
{
aux0 = in_min[lid + it];
aux1 = in_min[lid];
- in_min[gid] = fmin (aux0, aux1);
+ in_min[gid] = min (aux0, aux1);
aux0 = in_max[lid + it];
aux1 = in_max[lid];
- in_max[gid] = fmax (aux0, aux1);
+ in_max[gid] = max (aux0, aux1);
}
barrier (CLK_GLOBAL_MEM_FENCE);
}
@@ -125,12 +125,13 @@ __kernel void global_min_max_reduce (__global float *in_min,
__kernel void cl_stretch_contrast (__global const float4 *in,
__global float4 *out,
- float min,
- float diff)
+ float4 min,
+ float4 diff)
{
int gid = get_global_id(0);
float4 in_v = in[gid];
- in_v.xyz = (in_v.xyz - min) / diff;
+ in_v = (in_v - min) / diff;
+
out[gid] = in_v;
}
diff --git a/opencl/stretch-contrast.cl.h b/opencl/stretch-contrast.cl.h
index c4bd39ee..954ad1db 100644
--- a/opencl/stretch-contrast.cl.h
+++ b/opencl/stretch-contrast.cl.h
@@ -18,21 +18,21 @@ static const char* stretch_contrast_cl_source =
" */ \n"
" \n"
" \n"
-"__kernel void init_stretch (__global float *out_min, \n"
-" __global float *out_max) \n"
+"__kernel void init_stretch (__global float4 *out_min, \n"
+" __global float4 *out_max) \n"
"{ \n"
" int gid = get_global_id (0); \n"
" \n"
-" out_min[gid] = FLT_MAX; \n"
-" out_max[gid] = -FLT_MAX; \n"
+" out_min[gid] = (float4)( FLT_MAX); \n"
+" out_max[gid] = (float4)(-FLT_MAX); \n"
"} \n"
" \n"
"__kernel void two_stages_local_min_max_reduce (__global const float4 *in, \n"
-" __global float *out_min,\n"
-" __global float *out_max,\n"
-" __local float *aux_min,\n"
-" __local float *aux_max,\n"
-" int n_pixels)\n"
+" __global float4 *out_min,\n"
+" __global float4 *out_max,\n"
+" __local float4 *aux_min,\n"
+" __local float4 *aux_max,\n"
+" int n_pixels)\n"
"{ \n"
" int gid = get_global_id(0); \n"
" int gsize = get_global_size(0); \n"
@@ -41,20 +41,20 @@ static const char* stretch_contrast_cl_source =
" float4 min_v = (float4)( FLT_MAX); \n"
" float4 max_v = (float4)(-FLT_MAX); \n"
" float4 in_v; \n"
-" float aux0, aux1; \n"
+" float4 aux0, aux1; \n"
" int it; \n"
" \n"
" /* Loop sequentially over chunks of input vector */ \n"
" for (it = gid; it < n_pixels; it += gsize) \n"
" { \n"
" in_v = in[it]; \n"
-" min_v = fmin (min_v, in_v); \n"
-" max_v = fmax (max_v, in_v); \n"
+" min_v = min (min_v, in_v); \n"
+" max_v = max (max_v, in_v); \n"
" } \n"
" \n"
" /* Perform parallel reduction */ \n"
-" aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z); \n"
-" aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z); \n"
+" aux_min[lid] = min_v; \n"
+" aux_max[lid] = max_v; \n"
" \n"
" barrier (CLK_LOCAL_MEM_FENCE); \n"
" \n"
@@ -64,11 +64,11 @@ static const char* stretch_contrast_cl_source =
" { \n"
" aux0 = aux_min[lid + it]; \n"
" aux1 = aux_min[lid]; \n"
-" aux_min[lid] = fmin (aux0, aux1); \n"
+" aux_min[lid] = min (aux0, aux1); \n"
" \n"
" aux0 = aux_max[lid + it]; \n"
" aux1 = aux_max[lid]; \n"
-" aux_max[lid] = fmax (aux0, aux1); \n"
+" aux_max[lid] = max (aux0, aux1); \n"
" } \n"
" barrier (CLK_LOCAL_MEM_FENCE); \n"
" } \n"
@@ -86,21 +86,21 @@ static const char* stretch_contrast_cl_source =
" int nb_wg = gsize / lsize; \n"
" for (it = nb_wg; it < lsize; it++) \n"
" { \n"
-" out_min[it] = FLT_MAX; \n"
-" out_max[it] = -FLT_MAX; \n"
+" out_min[it] = (float4)( FLT_MAX); \n"
+" out_max[it] = (float4)(-FLT_MAX); \n"
" } \n"
" } \n"
"} \n"
" \n"
-"__kernel void global_min_max_reduce (__global float *in_min, \n"
-" __global float *in_max, \n"
-" __global float *out_min_max) \n"
+"__kernel void global_min_max_reduce (__global float4 *in_min, \n"
+" __global float4 *in_max, \n"
+" __global float4 *out_min_max) \n"
"{ \n"
-" int gid = get_global_id(0); \n"
-" int lid = get_local_id(0); \n"
-" int lsize = get_local_size(0); \n"
-" float aux0, aux1; \n"
-" int it; \n"
+" int gid = get_global_id(0); \n"
+" int lid = get_local_id(0); \n"
+" int lsize = get_local_size(0); \n"
+" float4 aux0, aux1; \n"
+" int it; \n"
" \n"
" /* Perform parallel reduction */ \n"
" for (it = lsize / 2; it > 0; it >>= 1) \n"
@@ -109,11 +109,11 @@ static const char* stretch_contrast_cl_source =
" { \n"
" aux0 = in_min[lid + it]; \n"
" aux1 = in_min[lid]; \n"
-" in_min[gid] = fmin (aux0, aux1); \n"
+" in_min[gid] = min (aux0, aux1); \n"
" \n"
" aux0 = in_max[lid + it]; \n"
" aux1 = in_max[lid]; \n"
-" in_max[gid] = fmax (aux0, aux1); \n"
+" in_max[gid] = max (aux0, aux1); \n"
" } \n"
" barrier (CLK_GLOBAL_MEM_FENCE); \n"
" } \n"
@@ -126,13 +126,14 @@ static const char* stretch_contrast_cl_source =
" \n"
"__kernel void cl_stretch_contrast (__global const float4 *in, \n"
" __global float4 *out, \n"
-" float min, \n"
-" float diff) \n"
+" float4 min, \n"
+" float4 diff) \n"
"{ \n"
" int gid = get_global_id(0); \n"
" float4 in_v = in[gid]; \n"
" \n"
-" in_v.xyz = (in_v.xyz - min) / diff; \n"
+" in_v = (in_v - min) / diff; \n"
+" \n"
" out[gid] = in_v; \n"
"} \n"
;