static const char* bilateral_filter_fast_cl_source =
"/* This file is part of GEGL \n"
" * \n"
" * GEGL is free software; you can redistribute it and/or \n"
" * modify it under the terms of the GNU Lesser General Public \n"
" * License as published by the Free Software Foundation; either \n"
" * version 3 of the License, or (at your option) any later version. \n"
" * \n"
" * GEGL is distributed in the hope that it will be useful, \n"
" * but WITHOUT ANY WARRANTY; without even the implied warranty of \n"
" * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU \n"
" * Lesser General Public License for more details. \n"
" * \n"
" * You should have received a copy of the GNU Lesser General Public \n"
" * License along with GEGL; if not, see . \n"
" * \n"
" * Copyright 2013 Victor Oliveira (victormatheus@gmail.com) \n"
" */ \n"
" \n"
"#define GRID(x,y,z) grid[x+sw*(y + z * sh)] \n"
" \n"
"#define LOCAL_W 8 \n"
"#define LOCAL_H 8 \n"
" \n"
"/* found by trial and error on a NVidia GPU */ \n"
" \n"
"// optimum value \n"
"// #define DEPTH_CHUNK 12 \n"
" \n"
"// a little less than 16k, works on most GPUs \n"
"#define DEPTH_CHUNK 7 \n"
" \n"
"__attribute__((reqd_work_group_size(8, 8, 1))) \n"
"__kernel void bilateral_downsample(__global const float4 *input, \n"
" __global float8 *grid, \n"
" int width, \n"
" int height, \n"
" int sw, \n"
" int sh, \n"
" int depth, \n"
" int s_sigma, \n"
" float r_sigma) \n"
"{ \n"
" const int gid_x = get_global_id(0); \n"
" const int gid_y = get_global_id(1); \n"
" \n"
" __local float8 grid_chunk[DEPTH_CHUNK][LOCAL_H][LOCAL_W]; \n"
" \n"
" if (gid_x > sw || gid_y > sh) return; \n"
" \n"
" for (int d = 0; d < depth; d+=DEPTH_CHUNK) \n"
" { \n"
" for (int k=0; k < DEPTH_CHUNK; k++) \n"
" { \n"
" grid_chunk[k][get_local_id(1)][get_local_id(0)] = (float8)(0.0f); \n"
" } \n"
" \n"
" barrier (CLK_LOCAL_MEM_FENCE); \n"
" \n"
" for (int ry=0; ry < s_sigma; ry++) \n"
" for (int rx=0; rx < s_sigma; rx++) \n"
" { \n"
" const int x = clamp(gid_x * s_sigma - s_sigma/2 + rx, 0, width -1);\n"
" const int y = clamp(gid_y * s_sigma - s_sigma/2 + ry, 0, height-1);\n"
" \n"
" const float4 val = input[y * width + x]; \n"
" \n"
" const int4 z = convert_int4(val * (1.0f/r_sigma) + 0.5f); \n"
" \n"
" // z >= d && z < d+DEPTH_CHUNK \n"
" int4 inbounds = (z >= d & z < d+DEPTH_CHUNK); \n"
" \n"
" if (inbounds.x) grid_chunk[z.x-d][get_local_id(1)][get_local_id(0)].s01 += (float2)(val.x, 1.0f);\n"
" if (inbounds.y) grid_chunk[z.y-d][get_local_id(1)][get_local_id(0)].s23 += (float2)(val.y, 1.0f);\n"
" if (inbounds.z) grid_chunk[z.z-d][get_local_id(1)][get_local_id(0)].s45 += (float2)(val.z, 1.0f);\n"
" if (inbounds.w) grid_chunk[z.w-d][get_local_id(1)][get_local_id(0)].s67 += (float2)(val.w, 1.0f);\n"
" \n"
" barrier (CLK_LOCAL_MEM_FENCE); \n"
" } \n"
" \n"
" for (int s=d, e=d+min(DEPTH_CHUNK, depth-d); s < e; s++) \n"
" { \n"
" grid[gid_x+sw*(gid_y + s * sh)] = grid_chunk[s-d][get_local_id(1)][get_local_id(0)];\n"
" } \n"
" } \n"
"} \n"
" \n"
"#undef LOCAL_W \n"
"#undef LOCAL_H \n"
" \n"
"#define LOCAL_W 16 \n"
"#define LOCAL_H 16 \n"
" \n"
"__attribute__((reqd_work_group_size(16, 16, 1))) \n"
"__kernel void bilateral_blur(__global const float8 *grid, \n"
" __global float2 *blurz_r, \n"
" __global float2 *blurz_g, \n"
" __global float2 *blurz_b, \n"
" __global float2 *blurz_a, \n"
" int sw, \n"
" int sh, \n"
" int depth) \n"
"{ \n"
" const int gid_x = get_global_id(0); \n"
" const int gid_y = get_global_id(1); \n"
" \n"
" const int lx = get_local_id(0); \n"
" const int ly = get_local_id(1); \n"
" \n"
" float8 vpp = (float8)(0.0f); \n"
" float8 vp = (float8)(0.0f); \n"
" float8 v = (float8)(0.0f); \n"
" \n"
" __local float8 data[LOCAL_H+2][LOCAL_W+2]; \n"
" \n"
" for (int d=0; d