summaryrefslogtreecommitdiff
path: root/kernels/compiler_skip_check.cl
blob: 68369dd61b0c6c4e1501b2caa08894b46b8ccdf4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/*
 * compiler_skip_check: runs one Intel AVC SIC skip-check evaluation per
 * work-group and stores the results.
 *
 * Each work-group handles the 16x16 source block whose top-left corner is
 * (get_group_id(0) * 16, get_group_id(1) * 16). Blocks are numbered row by
 * row: mb_idx = group_id(0) + group_id(1) * num_groups(0).
 *
 * src_img               source image handed to the SIC evaluation.
 * ref_img               single reference image handed to the SIC evaluation.
 * motion_vector_buffer  input; one short2 motion vector per block, read at
 *                       index mb_idx.
 * residual_buffer       output; one ushort inter distortion per block,
 *                       written at index mb_idx by local id 0 only.
 * dwo_buffer            output; 64 uints per block. Every work-item stores
 *                       result.s0..s3 at mb_idx*64 + local_id + 16*k
 *                       (k = 0..3).
 *                       NOTE(review): this layout only stays inside the
 *                       block's 64 slots if the local size in dimension 0
 *                       is 16 -- confirm against the host-side enqueue.
 * pld_buffer            not referenced by this kernel; kept so the argument
 *                       list stays unchanged for the host code.
 */
__kernel __attribute__((intel_reqd_sub_group_size(16)))
void compiler_skip_check(__read_only image2d_t   src_img,
                         __read_only image2d_t   ref_img,
                         __global short2  *motion_vector_buffer,
                         __global ushort  *residual_buffer,
                         __global uint*          dwo_buffer,
                         __global uint*          pld_buffer){
  sampler_t vs = 0;

  const int gr_id0 = get_group_id(0);
  const int gr_id1 = get_group_id(1);

  // Top-left pixel of the 16x16 source block owned by this work-group.
  ushort2 src_coord;
  src_coord.x = gr_id0 * 16;
  src_coord.y = gr_id1 * 16;

  intel_sub_group_avc_sic_payload_t payload =
      intel_sub_group_avc_sic_initialize(src_coord);

  // Ignored for unidirectional evaluation, so just set to 0.
  const uchar bidir_weight = 0;

  const uint skip_block_partition_type =
      CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL;
  const uint skip_motion_vector_mask =
      CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL;
  const uchar skip_sad_adjustment = CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL;

  const int mb_idx = gr_id0 + gr_id1 * get_num_groups(0);
  const short2 input_mv = motion_vector_buffer[mb_idx];

  // Pack the input vector into the low 32 bits of the 64-bit motion-vector
  // word. The upper lane is zeroed explicitly: it was previously left
  // uninitialized, so as_ulong() read an indeterminate value.
  const uint2 bi_mv = (uint2)(as_uint(input_mv), 0u);
  const ulong mv = as_ulong(bi_mv);

  payload = intel_sub_group_avc_sic_configure_skc(
      skip_block_partition_type, skip_motion_vector_mask, mv, bidir_weight,
      skip_sad_adjustment, payload);

  intel_sub_group_avc_sic_result_t result =
      intel_sub_group_avc_sic_evaluate_with_single_reference(
          src_img, ref_img, vs, payload);

  const ushort distortion =
      intel_sub_group_avc_sic_get_inter_distortions(result);

  const int lid_x = get_local_id(0);

  // One distortion value per block: only the first work-item stores it.
  if (lid_x == 0)
    residual_buffer[mb_idx] = distortion;

  // Every work-item stores its four result components, 16 uints apart.
  const int dwo_base = mb_idx * 16 * 4;
  dwo_buffer[dwo_base + lid_x + 16 * 0] = result.s0;
  dwo_buffer[dwo_base + lid_x + 16 * 1] = result.s1;
  dwo_buffer[dwo_base + lid_x + 16 * 2] = result.s2;
  dwo_buffer[dwo_base + lid_x + 16 * 3] = result.s3;
}