diff options
author | Lv Meng <meng.lv@intel.com> | 2014-07-16 15:38:48 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2014-07-16 15:03:46 +0800 |
commit | d9483affa2c1c60aa8c894ce45016c929dafe37c (patch) | |
tree | b36b80b9a6f75b4db144044ec74995a6968e71f8 /src | |
parent | 36644bb9e3e94dfea42e44ab92f48fd120db808f (diff) |
improve the clEnqueueCopyBufferRect performance in some cases
Signed-off-by: Lv Meng <meng.lv@intel.com>
Reviewed-by: He Junyan <junyan.he@inbox.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/CMakeLists.txt | 3 | ||||
-rw-r--r-- | src/cl_context.h | 1 | ||||
-rw-r--r-- | src/cl_mem.c | 37 | ||||
-rw-r--r-- | src/kernels/cl_internal_copy_buf_rect_align4.cl | 15 |
4 files changed, 49 insertions, 7 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 45c83d42..186e42a5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,7 +42,8 @@ set (KERNEL_STR_FILES) set (KERNEL_NAMES cl_internal_copy_buf_align4 cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset -cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d +cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4 +cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d diff --git a/src/cl_context.h b/src/cl_context.h index 75afbf60..f8342d3d 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -47,6 +47,7 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, CL_ENQUEUE_COPY_BUFFER_RECT, + CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D, //copy image 1d to image 1d CL_ENQUEUE_COPY_IMAGE_2D_TO_2D, //copy image 2d to image 2d CL_ENQUEUE_COPY_IMAGE_3D_TO_2D, //copy image 3d to image 2d diff --git a/src/cl_mem.c b/src/cl_mem.c index 70bc3eb1..11411d97 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -1399,6 +1399,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, size_t global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1}; + // the src and dst mem rect is continuous, the copy is degraded to buf copy + if((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) && + (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){ + cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0]; + cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0]; + cl_int size = region[0]*region[1]*region[2]; + ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size); + return ret; + } + if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; @@ -1411,18 +1421,33 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, assert(src_buf->ctx == dst_buf->ctx); /* setup the kernel and run. */ - extern char cl_internal_copy_buf_rect_str[]; - extern size_t cl_internal_copy_buf_rect_str_size; - - ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT, - cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL); + size_t region0 = region[0]; + if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0) + && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (region0 % 4 == 0) ){ + extern char cl_internal_copy_buf_rect_align4_str[]; + extern size_t cl_internal_copy_buf_rect_align4_str_size; + region0 /= 4; + src_offset /= 4; + dst_offset /= 4; + src_row_pitch /= 4; + dst_row_pitch /= 4; + src_slice_pitch /= 4; + dst_slice_pitch /= 4; + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4, + cl_internal_copy_buf_rect_align4_str, (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL); + }else{ + extern char cl_internal_copy_buf_rect_str[]; + extern size_t cl_internal_copy_buf_rect_str_size; + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT, + cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL); + } if (!ker) return CL_OUT_OF_RESOURCES; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf); - cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0); cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset); diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl b/src/kernels/cl_internal_copy_buf_rect_align4.cl new file mode 100644 index 00000000..fbfe7b2a --- /dev/null +++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl @@ -0,0 +1,15 @@ +kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst, + unsigned int region0, unsigned int region1, unsigned int region2, + unsigned int src_offset, unsigned int dst_offset, + unsigned int src_row_pitch, unsigned int src_slice_pitch, + unsigned int dst_row_pitch, unsigned int dst_slice_pitch) +{ + int i = get_global_id(0); + int j = get_global_id(1); + int k = get_global_id(2); + if((i >= region0) || (j>= region1) || (k>=region2)) + return; + src_offset += k * src_slice_pitch + j * src_row_pitch + i; + dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i; + dst[dst_offset] = src[src_offset]; +} |