diff options
author | Chuanbo Weng <chuanbo.weng@intel.com> | 2015-02-13 11:33:44 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2015-02-13 11:59:34 +0800 |
commit | b4c4fa99553381f096e87da4c5f198a15cda373b (patch) | |
tree | 3e23685d0b4b2162541b3786da0d21a334b53532 | |
parent | 17b53e8868155003e8d0008763e84b5318100109 (diff) |
Optimization of clEnqueueCopyImageToBuffer for 16 aligned case.
We can change the image_channel_order to CL_RGBA and
image_channel_data_type to CL_UNSIGNED_INT32 for some special
case, thus 16 bytes can be read by one work item. Bandwidth is
fully used.
v2:
Now we just optimize for IMAGE2D, so add judgement to not affect
other image type's code path.
Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
-rw-r--r-- | src/CMakeLists.txt | 2 | ||||
-rw-r--r-- | src/cl_context.h | 1 | ||||
-rw-r--r-- | src/cl_mem.c | 44 | ||||
-rw-r--r-- | src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl | 19 |
4 files changed, 57 insertions, 9 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 939f58da..d4181d8d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -49,7 +49,7 @@ cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy cl_internal_copy_image_2d_to_2d_array cl_internal_copy_image_1d_array_to_1d_array cl_internal_copy_image_2d_array_to_2d_array cl_internal_copy_image_2d_array_to_2d cl_internal_copy_image_2d_array_to_3d cl_internal_copy_image_3d_to_2d_array -cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer +cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_2d_to_buffer_align16 cl_internal_copy_image_3d_to_buffer cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d cl_internal_fill_buf_align8 cl_internal_fill_buf_align4 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign diff --git a/src/cl_context.h b/src/cl_context.h index 2ea0a73a..fdbfd2a4 100644 --- a/src/cl_context.h +++ b/src/cl_context.h @@ -60,6 +60,7 @@ enum _cl_internal_ker_type { CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D, //copy image 2d array to image 3d CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY, //copy image 3d to image 2d array CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, //copy image 2d to buffer + CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER, //copy image 3d tobuffer CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D, //copy buffer to image 2d CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D, //copy buffer to image 3d diff --git a/src/cl_mem.c b/src/cl_mem.c index d2e502d1..57d27dd9 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -1712,6 +1712,10 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, uint32_t intel_fmt, bpp; cl_image_format fmt; size_t origin0, region0; + size_t kn_dst_offset; + int align16 = 0; + size_t align_size = 1; + size_t w_saved; if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; @@ -1722,24 +1726,48 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, /* We use one kernel to copy the data. The kernel is lazily created. */ assert(image->base.ctx == buffer->ctx); - fmt.image_channel_order = CL_R; - fmt.image_channel_data_type = CL_UNSIGNED_INT8; intel_fmt = image->intel_fmt; bpp = image->bpp; - image->intel_fmt = cl_image_get_intel_format(&fmt); - image->w = image->w * image->bpp; - image->bpp = 1; + w_saved = image->w; region0 = region[0] * bpp; - origin0 = src_origin[0] * bpp; + kn_dst_offset = dst_offset; + if((image->image_type == CL_MEM_OBJECT_IMAGE2D) && ((image->w * image->bpp) % 16 == 0) && + ((src_origin[0] * bpp) % 16 == 0) && (region0 % 16 == 0) && (dst_offset % 16 == 0)){ + fmt.image_channel_order = CL_RGBA; + fmt.image_channel_data_type = CL_UNSIGNED_INT32; + align16 = 1; + align_size = 16; + } + else{ + fmt.image_channel_order = CL_R; + fmt.image_channel_data_type = CL_UNSIGNED_INT8; + align_size = 1; + } + image->intel_fmt = cl_image_get_intel_format(&fmt); + image->w = (image->w * image->bpp) / align_size; + image->bpp = align_size; + region0 = (region[0] * bpp) / align_size; + origin0 = (src_origin[0] * bpp) / align_size; + kn_dst_offset /= align_size; global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; /* setup the kernel and run. */ if(image->image_type == CL_MEM_OBJECT_IMAGE2D) { + if(align16){ + extern char cl_internal_copy_image_2d_to_buffer_align16_str[]; + extern size_t cl_internal_copy_image_2d_to_buffer_align16_str_size; + + ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16, + cl_internal_copy_image_2d_to_buffer_align16_str, + (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, NULL); + } + else{ extern char cl_internal_copy_image_2d_to_buffer_str[]; extern size_t cl_internal_copy_image_2d_to_buffer_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER, cl_internal_copy_image_2d_to_buffer_str, (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL); + } }else if(image->image_type == CL_MEM_OBJECT_IMAGE3D) { extern char cl_internal_copy_image_3d_to_buffer_str[]; extern size_t cl_internal_copy_image_3d_to_buffer_str_size; @@ -1761,7 +1789,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0); cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]); cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]); - cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_offset); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset); ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); @@ -1769,7 +1797,7 @@ fail: image->intel_fmt = intel_fmt; image->bpp = bpp; - image->w = image->w / bpp; + image->w = w_saved; return ret; } diff --git a/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl new file mode 100644 index 00000000..a32e5f23 --- /dev/null +++ b/src/kernels/cl_internal_copy_image_2d_to_buffer_align16.cl @@ -0,0 +1,19 @@ +kernel void __cl_copy_image_2d_to_buffer_align16( __read_only image2d_t image, global uint4* buffer, + unsigned int region0, unsigned int region1, unsigned int region2, + unsigned int src_origin0, unsigned int src_origin1, unsigned int src_origin2, + unsigned int dst_offset) +{ + + int i = get_global_id(0); + int j = get_global_id(1); + if((i >= region0) || (j>= region1)) + return; + uint4 color; + const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; + int2 src_coord; + src_coord.x = src_origin0 + i; + src_coord.y = src_origin1 + j; + color = read_imageui(image, sampler, src_coord); + + *(buffer + dst_offset + region0*j + i) = color; +} |