diff options
-rw-r--r-- | docs/howto/video-motion-estimation-howto.mdwn | 26 | ||||
-rw-r--r-- | src/kernels/cl_internal_block_motion_estimate_intel.cl | 199 |
2 files changed, 175 insertions, 50 deletions
diff --git a/docs/howto/video-motion-estimation-howto.mdwn b/docs/howto/video-motion-estimation-howto.mdwn index d9edc9b9..8deaa61a 100644 --- a/docs/howto/video-motion-estimation-howto.mdwn +++ b/docs/howto/video-motion-estimation-howto.mdwn @@ -1,21 +1,15 @@ Video Motion Vector HowTo ========================== -Beignet now supports cl_intel_accelerator and part of cl_intel_motion_estimation, which -are Khronos official extensions. It provides a hardware acceleration of video motion +Beignet now supports cl_intel_accelerator and cl_intel_motion_estimation, which are +Khronos official extensions. It provides a hardware acceleration of video motion vector to users. -Supported hardware platform and limitation ------------------------------------------- +Supported hardware platform +--------------------------- -Only 3rd Generation Intel Core Processors is supported for vme now. And now we just -implement this part of cl_intel_motion_estimation for motion vector computation(residuals -can not be returned yet) on 3rd Generation Intel Core Processors: - mb_block_type = CL_ME_MB_TYPE_16x16_INTEL - subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL - search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL / CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL - / CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL -We will fully support cl_intel_motion_estimation in the future. +Only 3rd Generation Intel Core Processors is supported for vme now. We will consider +to support more platforms if necessary. Steps ----- @@ -23,15 +17,13 @@ Steps In order to use video motion estimation provided by Beignet in your program, please follow the steps as below: -- Create a cl_accelerator_intel object using extension API clCreateAcceleratorINTEL, with - the following parameters: +- Create a cl_accelerator_intel object using extension API clCreateAcceleratorINTEL, like + this: _accelerator_type_intel accelerator_type = CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL; cl_motion_estimation_desc_intel vmedesc = {CL_ME_MB_TYPE_16x16_INTEL, CL_ME_SUBPIXEL_MODE_INTEGER_INTEL, CL_ME_SAD_ADJUST_MODE_NONE_INTEL, - CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL( - or CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL - or CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL) + CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL }; - Invoke clCreateProgramWithBuiltInKernels to create a program object with built-in kernels diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl index 5a223381..1f28f4e2 100644 --- a/src/kernels/cl_internal_block_motion_estimate_intel.cl +++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl @@ -59,23 +59,28 @@ void block_motion_estimate_intel(accelerator_intel_t accel, int lgid_x = get_group_id(0); int lgid_y = get_group_id(1); + int num_groups_x = get_num_groups(0); + int index = lgid_y * num_groups_x + lgid_x; + uint2 srcCoord = 0; + short2 predict_mv = 0; + if(prediction_motion_vector_buffer != NULL){ + predict_mv = prediction_motion_vector_buffer[index]; + predict_mv.x = predict_mv.x / 4; + predict_mv.y = predict_mv.y / 4; + } srcCoord.x = lgid_x * 16; srcCoord.y = lgid_y * 16; - //TODO: This line of code is just to workaround a curbe related bug caused by commit 061d214a6fc2876a0e24e094f87f2a172984bc23 - //After fix, this line should be removed. - src_grf0_dw5 = accel.mb_block_type; - //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL if(accel.search_path_type == 0x0){ - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?); + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id); src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) | (0); //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); - src_grf0_dw1 = 0xfffefffe; + src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff); //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X); - src_grf0_dw0 = 0xfffefffe; + src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & 0x0000ffff); //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) | (Start0Y << 20) src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20) //| (Start0X << 16) | (Max_Num_SU << 8) | (LenSP); @@ -84,35 +89,59 @@ void block_motion_estimate_intel(accelerator_intel_t accel, //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL else if(accel.search_path_type == 0x1){ src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) | (0); - src_grf0_dw1 = 0xfffcfffc; - src_grf0_dw0 = 0xfffcfffc; + src_grf0_dw1 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff); + src_grf0_dw0 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & 0x0000ffff); src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20) | (0 << 16) | (48 << 8) | (48); } //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL else if(accel.search_path_type == 0x5){ src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) | (0); - src_grf0_dw1 = 0xfff4fff0; - src_grf0_dw0 = 0xfff4fff0; + src_grf0_dw1 = ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) & 0x0000ffff); + src_grf0_dw0 = ((-12 + predict_mv.y) << 16 ) | ((-16 + + predict_mv.x) & 0x0000ffff); src_grf1_dw2 = (0 << 28) | (0 << 24) | (0 << 20) | (0 << 16) | (48 << 8) | (48); } - //src_grf0_dw7 = Debug; - src_grf0_dw7 = 0; - //src_grf0_dw6 = Debug; - src_grf0_dw6 = 0; - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?); - //src_grf0_dw4 = Ignored; - src_grf0_dw4 = 0; - //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22) - src_grf0_dw3 = (0 << 31) | (0x7e << 24) | (0 << 22) + /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/ + uchar sub_mb_part_mask = 0; + //CL_ME_MB_TYPE_16x16_INTEL + if(accel.mb_block_type == 0x0) + sub_mb_part_mask = 0x7e; + //CL_ME_MB_TYPE_8x8_INTEL + else if(accel.mb_block_type == 0x1) + sub_mb_part_mask = 0x77; + //CL_ME_MB_TYPE_4x4_INTEL + else if(accel.mb_block_type == 0x2) + sub_mb_part_mask = 0x3f; + + uchar inter_sad = 0; + //CL_ME_SAD_ADJUST_MODE_NONE_INTEL + if(accel.sad_adjust_mode == 0x0) + inter_sad = 0; + //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL + else if(accel.sad_adjust_mode == 0x1) + inter_sad = 2; + + uchar sub_pel_mode = 0; + //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL + if(accel.subpixel_mode == 0x0) + sub_pel_mode = 0; + //CL_ME_SUBPIXEL_MODE_HPEL_INTEL + else if(accel.subpixel_mode == 0x1) + sub_pel_mode = 1; + //CL_ME_SUBPIXEL_MODE_QPEL_INTEL + else if(accel.subpixel_mode == 0x2) + sub_pel_mode = 3; + + //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) | (Intra_SAD << 22) + src_grf0_dw3 = (0 << 31) | (sub_mb_part_mask << 24) | (0 << 22) //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) | (Reserverd << 18) - | (0 << 20) | (0 << 19) | (0 << 18) + | (inter_sad << 20) | (0 << 19) | (0 << 18) //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16) | (Dis_Field_Cache_Alloc << 15) | (0 << 17) | (0 << 16) | (0 << 15) //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) | (Dual_Search_Path_Opt << 11) - | (0 << 14) | (0 << 12) | (0 << 11) + | (0 << 14) | (sub_pel_mode << 12) | (0 << 11) //| (Search_Ctrl << 8) | (Ref_Access << 7) | (SrcAccess << 6) | (0 << 8) | (0 << 7) | (0 << 6) //| (Mb_Type_Remap << 4) | (Reserved_Workaround << 3) | (Reserved_Workaround << 2) @@ -120,6 +149,15 @@ void block_motion_estimate_intel(accelerator_intel_t accel, //| (Src_Size); | (0); + + //src_grf0_dw7 = Debug; + src_grf0_dw7 = 0; + //src_grf0_dw6 = Debug; + src_grf0_dw6 = 0; + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?); + //src_grf0_dw4 = Ignored; + src_grf0_dw4 = 0; + //src_grf0_dw2 = (SrcY << 16) | (SrcX); src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x); //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); @@ -142,7 +180,8 @@ void block_motion_estimate_intel(accelerator_intel_t accel, /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) | (AdaptiveValidationControl << 29) | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << 24) | (Reserverd << 22) | (Bi_Weight << 16) | (Reserved << 6) | (MaxNumMVs);*/ - src_grf1_dw1 = (0 << 24) | (2); + //src_grf1_dw1 = (0 << 24) | (2); + src_grf1_dw1 = (0 << 24) | (16); /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << 16) | (Skip_Success << 8) | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) | (Early_Ime_Success_En << 5) | (Early_Success_En << 4) | (Part_Candidate_En << 3) | (Bi_Mix_Dis << 2) @@ -201,6 +240,8 @@ void block_motion_estimate_intel(accelerator_intel_t accel, src_grf4_dw1 = 0; src_grf4_dw0 = 0; + int lid_x = get_local_id(0); + vme_result = __gen_ocl_vme(src_image, ref_image, src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4, src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, @@ -217,17 +258,109 @@ void block_motion_estimate_intel(accelerator_intel_t accel, barrier(CLK_LOCAL_MEM_FENCE); - int lid_x = get_local_id(0); + short2 mv[16]; + ushort res[16]; + + uint write_back_dwx; uint simd_width = get_sub_group_size(); - uint write_back_grf1_dw0; - if(simd_width == 8) - write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1); - else if(simd_width == 16) - write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0); - short2 val = as_short2( write_back_grf1_dw0 ); - int index = lgid_y * get_num_groups(0) + lgid_x; - if( lid_x == 0 ){ - motion_vector_buffer[index] = val; + + /* In simd 8 mode, one kernel variable 'uint' map to 8 dword. + * In simd 16 mode, one kernel variable 'uint' map to 16 dword. + * That's why we should treat simd8 and simd16 differently when + * use __gen_ocl_region. + * */ + if(simd_width == 8){ + write_back_dwx = __gen_ocl_region(0, vme_result.s1); + mv[0] = as_short2( write_back_dwx ); + + if(accel.mb_block_type > 0x0){ + for(int i = 2, j = 1; j < 4; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s1); + mv[j] = as_short2( write_back_dwx ); + } + if(accel.mb_block_type > 0x1){ + for(int i = 0, j = 4; j < 8; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s2); + mv[j] = as_short2( write_back_dwx ); + } + for(int i = 0, j = 8; j < 12; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s3); + mv[j] = as_short2( write_back_dwx ); + } + for(int i = 0, j = 12; j < 16; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s4); + mv[j] = as_short2( write_back_dwx ); + } + } + } + ushort2 temp_res; + for(int i = 0; i < 8; i++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s5); + temp_res = as_ushort2(write_back_dwx); + res[i*2] = temp_res.s0; + res[i*2+1] = temp_res.s1; + } + } + else if(simd_width == 16){ + write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0); + mv[0] = as_short2( write_back_dwx ); + + if(accel.mb_block_type > 0x0){ + for(int i = 2, j = 1; j < 4; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0); + mv[j] = as_short2( write_back_dwx ); + } + if(accel.mb_block_type > 0x1){ + for(int i = 0, j = 4; j < 8; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s1); + mv[j] = as_short2( write_back_dwx ); + } + for(int i = 0, j = 8; j < 12; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1); + mv[j] = as_short2( write_back_dwx ); + } + for(int i = 0, j = 12; j < 16; i += 2, j++){ + write_back_dwx = __gen_ocl_region(i, vme_result.s2); + mv[j] = as_short2( write_back_dwx ); + } + } + } + ushort2 temp_res; + for(int i = 0; i < 8; i++){ + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2); + temp_res = as_ushort2(write_back_dwx); + res[i*2] = temp_res.s0; + res[i*2+1] = temp_res.s1; + } + } + + int mv_index; + + //CL_ME_MB_TYPE_16x16_INTEL + if(accel.mb_block_type == 0x0){ + mv_index = index * 1; + if( lid_x == 0 ){ + motion_vector_buffer[mv_index] = mv[lid_x]; + residuals[mv_index] = 2 * res[lid_x]; + } + } + //CL_ME_MB_TYPE_8x8_INTEL + else if(accel.mb_block_type == 0x1){ + if(lid_x < 4){ + mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2; + mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2); + motion_vector_buffer[mv_index] = mv[lid_x]; + residuals[mv_index] = 2 * res[lid_x]; + } + } + //CL_ME_MB_TYPE_4x4_INTEL + else if(accel.mb_block_type == 0x2){ + if(lid_x < 16){ + mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4; + mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4); + motion_vector_buffer[mv_index] = mv[lid_x]; + residuals[mv_index] = 2 * res[lid_x]; + } } } |