diff options
Diffstat (limited to 'opencl')
-rw-r--r-- | opencl/stretch-contrast.cl | 63 | ||||
-rw-r--r-- | opencl/stretch-contrast.cl.h | 63 |
2 files changed, 64 insertions, 62 deletions
diff --git a/opencl/stretch-contrast.cl b/opencl/stretch-contrast.cl index 83f7ff9d..cda7b6d8 100644 --- a/opencl/stretch-contrast.cl +++ b/opencl/stretch-contrast.cl @@ -17,21 +17,21 @@ */ -__kernel void init_stretch (__global float *out_min, - __global float *out_max) +__kernel void init_stretch (__global float4 *out_min, + __global float4 *out_max) { int gid = get_global_id (0); - out_min[gid] = FLT_MAX; - out_max[gid] = -FLT_MAX; + out_min[gid] = (float4)( FLT_MAX); + out_max[gid] = (float4)(-FLT_MAX); } __kernel void two_stages_local_min_max_reduce (__global const float4 *in, - __global float *out_min, - __global float *out_max, - __local float *aux_min, - __local float *aux_max, - int n_pixels) + __global float4 *out_min, + __global float4 *out_max, + __local float4 *aux_min, + __local float4 *aux_max, + int n_pixels) { int gid = get_global_id(0); int gsize = get_global_size(0); @@ -40,20 +40,20 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in, float4 min_v = (float4)( FLT_MAX); float4 max_v = (float4)(-FLT_MAX); float4 in_v; - float aux0, aux1; + float4 aux0, aux1; int it; /* Loop sequentially over chunks of input vector */ for (it = gid; it < n_pixels; it += gsize) { in_v = in[it]; - min_v = fmin (min_v, in_v); - max_v = fmax (max_v, in_v); + min_v = min (min_v, in_v); + max_v = max (max_v, in_v); } /* Perform parallel reduction */ - aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z); - aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z); + aux_min[lid] = min_v; + aux_max[lid] = max_v; barrier (CLK_LOCAL_MEM_FENCE); @@ -63,11 +63,11 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in, { aux0 = aux_min[lid + it]; aux1 = aux_min[lid]; - aux_min[lid] = fmin (aux0, aux1); + aux_min[lid] = min (aux0, aux1); aux0 = aux_max[lid + it]; aux1 = aux_max[lid]; - aux_max[lid] = fmax (aux0, aux1); + aux_max[lid] = max (aux0, aux1); } barrier (CLK_LOCAL_MEM_FENCE); } @@ -85,21 +85,21 @@ __kernel void two_stages_local_min_max_reduce (__global const float4 *in, int nb_wg = gsize / lsize; for (it = nb_wg; it < lsize; it++) { - out_min[it] = FLT_MAX; - out_max[it] = -FLT_MAX; + out_min[it] = (float4)( FLT_MAX); + out_max[it] = (float4)(-FLT_MAX); } } } -__kernel void global_min_max_reduce (__global float *in_min, - __global float *in_max, - __global float *out_min_max) +__kernel void global_min_max_reduce (__global float4 *in_min, + __global float4 *in_max, + __global float4 *out_min_max) { - int gid = get_global_id(0); - int lid = get_local_id(0); - int lsize = get_local_size(0); - float aux0, aux1; - int it; + int gid = get_global_id(0); + int lid = get_local_id(0); + int lsize = get_local_size(0); + float4 aux0, aux1; + int it; /* Perform parallel reduction */ for (it = lsize / 2; it > 0; it >>= 1) @@ -108,11 +108,11 @@ __kernel void global_min_max_reduce (__global float *in_min, { aux0 = in_min[lid + it]; aux1 = in_min[lid]; - in_min[gid] = fmin (aux0, aux1); + in_min[gid] = min (aux0, aux1); aux0 = in_max[lid + it]; aux1 = in_max[lid]; - in_max[gid] = fmax (aux0, aux1); + in_max[gid] = max (aux0, aux1); } barrier (CLK_GLOBAL_MEM_FENCE); } @@ -125,12 +125,13 @@ __kernel void global_min_max_reduce (__global float *in_min, __kernel void cl_stretch_contrast (__global const float4 *in, __global float4 *out, - float min, - float diff) + float4 min, + float4 diff) { int gid = get_global_id(0); float4 in_v = in[gid]; - in_v.xyz = (in_v.xyz - min) / diff; + in_v = (in_v - min) / diff; + out[gid] = in_v; } diff --git a/opencl/stretch-contrast.cl.h b/opencl/stretch-contrast.cl.h index c4bd39ee..954ad1db 100644 --- a/opencl/stretch-contrast.cl.h +++ b/opencl/stretch-contrast.cl.h @@ -18,21 +18,21 @@ static const char* stretch_contrast_cl_source = " */ \n" " \n" " \n" -"__kernel void init_stretch (__global float *out_min, \n" -" __global float *out_max) \n" +"__kernel void init_stretch (__global float4 *out_min, \n" +" __global float4 *out_max) \n" "{ \n" " int gid = get_global_id (0); \n" " \n" -" out_min[gid] = FLT_MAX; \n" -" out_max[gid] = -FLT_MAX; \n" +" out_min[gid] = (float4)( FLT_MAX); \n" +" out_max[gid] = (float4)(-FLT_MAX); \n" "} \n" " \n" "__kernel void two_stages_local_min_max_reduce (__global const float4 *in, \n" -" __global float *out_min,\n" -" __global float *out_max,\n" -" __local float *aux_min,\n" -" __local float *aux_max,\n" -" int n_pixels)\n" +" __global float4 *out_min,\n" +" __global float4 *out_max,\n" +" __local float4 *aux_min,\n" +" __local float4 *aux_max,\n" +" int n_pixels)\n" "{ \n" " int gid = get_global_id(0); \n" " int gsize = get_global_size(0); \n" @@ -41,20 +41,20 @@ static const char* stretch_contrast_cl_source = " float4 min_v = (float4)( FLT_MAX); \n" " float4 max_v = (float4)(-FLT_MAX); \n" " float4 in_v; \n" -" float aux0, aux1; \n" +" float4 aux0, aux1; \n" " int it; \n" " \n" " /* Loop sequentially over chunks of input vector */ \n" " for (it = gid; it < n_pixels; it += gsize) \n" " { \n" " in_v = in[it]; \n" -" min_v = fmin (min_v, in_v); \n" -" max_v = fmax (max_v, in_v); \n" +" min_v = min (min_v, in_v); \n" +" max_v = max (max_v, in_v); \n" " } \n" " \n" " /* Perform parallel reduction */ \n" -" aux_min[lid] = min (min (min_v.x, min_v.y), min_v.z); \n" -" aux_max[lid] = max (max (max_v.x, max_v.y), max_v.z); \n" +" aux_min[lid] = min_v; \n" +" aux_max[lid] = max_v; \n" " \n" " barrier (CLK_LOCAL_MEM_FENCE); \n" " \n" @@ -64,11 +64,11 @@ static const char* stretch_contrast_cl_source = " { \n" " aux0 = aux_min[lid + it]; \n" " aux1 = aux_min[lid]; \n" -" aux_min[lid] = fmin (aux0, aux1); \n" +" aux_min[lid] = min (aux0, aux1); \n" " \n" " aux0 = aux_max[lid + it]; \n" " aux1 = aux_max[lid]; \n" -" aux_max[lid] = fmax (aux0, aux1); \n" +" aux_max[lid] = max (aux0, aux1); \n" " } \n" " barrier (CLK_LOCAL_MEM_FENCE); \n" " } \n" @@ -86,21 +86,21 @@ static const char* stretch_contrast_cl_source = " int nb_wg = gsize / lsize; \n" " for (it = nb_wg; it < lsize; it++) \n" " { \n" -" out_min[it] = FLT_MAX; \n" -" out_max[it] = -FLT_MAX; \n" +" out_min[it] = (float4)( FLT_MAX); \n" +" out_max[it] = (float4)(-FLT_MAX); \n" " } \n" " } \n" "} \n" " \n" -"__kernel void global_min_max_reduce (__global float *in_min, \n" -" __global float *in_max, \n" -" __global float *out_min_max) \n" +"__kernel void global_min_max_reduce (__global float4 *in_min, \n" +" __global float4 *in_max, \n" +" __global float4 *out_min_max) \n" "{ \n" -" int gid = get_global_id(0); \n" -" int lid = get_local_id(0); \n" -" int lsize = get_local_size(0); \n" -" float aux0, aux1; \n" -" int it; \n" +" int gid = get_global_id(0); \n" +" int lid = get_local_id(0); \n" +" int lsize = get_local_size(0); \n" +" float4 aux0, aux1; \n" +" int it; \n" " \n" " /* Perform parallel reduction */ \n" " for (it = lsize / 2; it > 0; it >>= 1) \n" @@ -109,11 +109,11 @@ static const char* stretch_contrast_cl_source = " { \n" " aux0 = in_min[lid + it]; \n" " aux1 = in_min[lid]; \n" -" in_min[gid] = fmin (aux0, aux1); \n" +" in_min[gid] = min (aux0, aux1); \n" " \n" " aux0 = in_max[lid + it]; \n" " aux1 = in_max[lid]; \n" -" in_max[gid] = fmax (aux0, aux1); \n" +" in_max[gid] = max (aux0, aux1); \n" " } \n" " barrier (CLK_GLOBAL_MEM_FENCE); \n" " } \n" @@ -126,13 +126,14 @@ static const char* stretch_contrast_cl_source = " \n" "__kernel void cl_stretch_contrast (__global const float4 *in, \n" " __global float4 *out, \n" -" float min, \n" -" float diff) \n" +" float4 min, \n" +" float4 diff) \n" "{ \n" " int gid = get_global_id(0); \n" " float4 in_v = in[gid]; \n" " \n" -" in_v.xyz = (in_v.xyz - min) / diff; \n" +" in_v = (in_v - min) / diff; \n" +" \n" " out[gid] = in_v; \n" "} \n" ; |