summaryrefslogtreecommitdiff
path: root/kernels/compiler_box_blur.cl
blob: 26936e0ceb533f23c7083f07788e4966cd3117e2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
inline float3 unpack_fp3(uint u) {
  float3 u3;
  u3.x = (float) (u & 0xff); u >>= 8;
  u3.y = (float) (u & 0xff); u >>= 8;
  u3.z = (float) (u & 0xff);
  return u3;
}

inline uint pack_fp3(float3 u3) {
  uint u;
  u = (((uint) u3.x)) | (((uint) u3.y) << 8) | (((uint) u3.z) << 16);
  return u;
}

#define HFILTER3(C0, C1, C2, C3, CURR, LEFT, RIGHT)\
  float3 C0, C1, C2, C3;\
  do {\
    const uint4 from = vload4(CURR, src);\
    const float3 from0 = unpack_fp3(from.x);\
    const float3 from1 = unpack_fp3(from.y);\
    const float3 from2 = unpack_fp3(from.z);\
    const float3 from3 = unpack_fp3(from.w);\
    const float3 l = unpack_fp3(src[LEFT]);\
    const float3 r = unpack_fp3(src[RIGHT]);\
    C0 = (l+from0+from1);\
    C1 = (from0+from1+from2);\
    C2 = (from1+from2+from3);\
    C3 = (from2+from3+r);\
  } while(0)

__kernel void compiler_box_blur(__global const uint *src,
                                __global uint *dst,
                                int w,
                                int h,
                                int chunk)
{
  const int x = get_global_id(0);
  int y = get_global_id(1)*chunk;
  const int yend = min(y + chunk, h); /* we process a tile in the image */

  /* Current line (left (1 pixel), center (4 pixels), right (1 pixel)) */
  const int left = max(4*x-1, 0) + y*w;
  const int right = min(4*x+4, w-1) + y*w;
  int curr = x + y*(w>>2);
  HFILTER3(curr0, curr1, curr2, curr3, curr, left, right);

  /* Top line (left (1 pixel), center (4 pixels), right (1 pixel)) */
  const int ytop = max(y-1,0);
  const int topLeft = max(4*x-1, 0) + ytop*w;
  const int topRight = min(4*x+4, w-1) + ytop*w;
  const int top = x + ytop*(w>>2);
  HFILTER3(top0, top1, top2, top3, top, topLeft, topRight);

  /* To guard bottom line */
  const int maxBottom = x + (h-1)*(w>>2);
  const int maxBottomLeft = max(4*x-1,0) + (h-1)*w;
  const int maxBottomRight = min(4*x+4,w-1) + (h-1)*w;

  /* We use a short 3 pixel sliding window */
  const int ybottom = min(y+1,h-1);
  int bottomLeft = max(4*x-1, 0) + ybottom*w;
  int bottomRight = min(4*x+4, w-1) + ybottom*w;
  int bottom = x + ybottom*(w>>2);

  /* Top down sliding window */
  for (; y < yend; ++y, curr += (w>>2), bottom += (w>>2), bottomLeft += w, bottomRight += w) {
    const int center = min(bottom, maxBottom);
    const int left = min(bottomLeft, maxBottomLeft);
    const int right = min(bottomRight, maxBottomRight);
    HFILTER3(bottom0, bottom1, bottom2, bottom3, center, left, right);
    const float3 to0 = (top0+curr0+bottom0)*(1.f/9.f);
    const float3 to1 = (top1+curr1+bottom1)*(1.f/9.f);
    const float3 to2 = (top2+curr2+bottom2)*(1.f/9.f);
    const float3 to3 = (top3+curr3+bottom3)*(1.f/9.f);
    const uint4 to = (uint4)(pack_fp3(to0),pack_fp3(to1),pack_fp3(to2),pack_fp3(to3));
    vstore4(to, curr, dst);
    top0 = curr0; top1 = curr1; top2 = curr2; top3 = curr3;
    curr0 = bottom0; curr1 = bottom1; curr2 = bottom2; curr3 = bottom3;
  }
}