author    | Junyan He <junyan.he@intel.com>   | 2017-06-11 13:50:31 +0800
committer | Yang Rong <rong.r.yang@intel.com> | 2017-08-02 17:16:30 +0800
commit    | 81660456bb6db646a98a9d4a02a8d332ab7fd617 (patch)
tree      | 548148407ea9706089349486b01786efcb286a8f
parent    | 7315608283784fab1565c3ed64e8bfc9fe4fd293 (diff)
Add cl_mem_gen to implement cl_mem for the GEN device.
Signed-off-by: Junyan He <junyan.he@intel.com>
-rw-r--r-- | runtime/gen/cl_mem_gen.c | 1269
1 file changed, 1269 insertions, 0 deletions
diff --git a/runtime/gen/cl_mem_gen.c b/runtime/gen/cl_mem_gen.c new file mode 100644 index 00000000..8f9484d6 --- /dev/null +++ b/runtime/gen/cl_mem_gen.c @@ -0,0 +1,1269 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "cl_gen.h" +#include <unistd.h> + +/* All drm list for buffer, image and SVM usage, for debug */ +static list_head gen_drm_bo_list = {{&(gen_drm_bo_list.head_node), &(gen_drm_bo_list.head_node)}}; +static pthread_mutex_t gen_drm_bo_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +LOCAL cl_mem_drm_bo +cl_mem_gen_create_drm_bo(dri_bufmgr *bufmgr, size_t size, size_t alignment, + cl_image_gen_tiling tiling, size_t stride, void *orig_data) +{ + cl_mem_drm_bo drm_bo = CL_CALLOC(1, sizeof(_cl_mem_drm_bo)); + if (drm_bo == NULL) + return NULL; + + /* HSW: Byte scattered Read/Write has limitation that + the buffer size must be a multiple of 4 bytes. */ + size = ALIGN(size, 4); + + drm_bo->bo = drm_intel_bo_alloc(bufmgr, "CL memory object", size, alignment); + if (drm_bo->bo == NULL) { + CL_FREE(drm_bo); + return NULL; + } + + CL_OBJECT_INIT_BASE(drm_bo, CL_OBJECT_DRM_BO_MAGIC); + drm_bo->gpu_size = size; + drm_bo->tiling = tiling; + drm_bo->stride = stride; + drm_bo->host_coherent = CL_FALSE; + intel_buffer_set_tiling(drm_bo->bo, drm_bo->tiling, drm_bo->stride); + + if (orig_data) + drm_intel_bo_subdata(drm_bo->bo, 0, size, orig_data); + + pthread_mutex_lock(&gen_drm_bo_list_mutex); + list_add_tail(&gen_drm_bo_list, &drm_bo->base.node); + pthread_mutex_unlock(&gen_drm_bo_list_mutex); + + return drm_bo; +} + +LOCAL cl_mem_drm_bo +cl_mem_gen_create_drm_bo_from_hostptr(dri_bufmgr *bufmgr, cl_bool svm, + size_t size, cl_uint cacheline_size, void *host_ptr) +{ +#ifdef HAS_USERPTR + int page_size = getpagesize(); + + if ((ALIGN((unsigned long)host_ptr, cacheline_size) != (unsigned long)host_ptr) || + (ALIGN((unsigned long)size, cacheline_size) != (unsigned long)size)) { + /* Must Align to a cache line size, or GPU will overwrite the data when cache flush */ + return NULL; + } + + cl_mem_drm_bo drm_bo = CL_CALLOC(1, sizeof(_cl_mem_drm_bo)); + if (drm_bo == NULL) + return NULL; + + CL_OBJECT_INIT_BASE(drm_bo, CL_OBJECT_DRM_BO_MAGIC); + drm_bo->host_coherent = CL_TRUE; + drm_bo->mapped_ptr = (void *)(((unsigned long)host_ptr) & (~(page_size - 1))); + drm_bo->in_page_offset = host_ptr - drm_bo->mapped_ptr; + drm_bo->gpu_size = ALIGN((drm_bo->in_page_offset + size), page_size); + drm_bo->bo = intel_buffer_alloc_userptr(bufmgr, "CL userptr memory object", + drm_bo->mapped_ptr, drm_bo->gpu_size, 0); + if (drm_bo->bo == NULL) { + CL_FREE(drm_bo); + return NULL; + } + + if (svm) { + drm_intel_bo_set_softpin_offset(drm_bo->bo, (size_t)drm_bo->mapped_ptr); + drm_intel_bo_use_48b_address_range(drm_bo->bo, 1); + drm_bo->svm = CL_TRUE; + } + + pthread_mutex_lock(&gen_drm_bo_list_mutex); + 
list_add_tail(&gen_drm_bo_list, &drm_bo->base.node); + pthread_mutex_unlock(&gen_drm_bo_list_mutex); + + return drm_bo; +#else + return NULL; +#endif +} + +LOCAL void +cl_mem_gen_drm_bo_ref(cl_mem_drm_bo drm_bo) +{ + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + CL_OBJECT_INC_REF(drm_bo); +} + +LOCAL void +cl_mem_gen_drm_bo_delete(cl_mem_drm_bo drm_bo) +{ + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + + if (CL_OBJECT_DEC_REF(drm_bo) > 1) + return; + + pthread_mutex_lock(&gen_drm_bo_list_mutex); + list_node_del(&drm_bo->base.node); + pthread_mutex_unlock(&gen_drm_bo_list_mutex); + + if (drm_bo->drm_map_ref > 0) { + CL_LOG_WARNING("Pay Attention: the drm object: %p is destroying but still hole %d map references", + drm_bo->bo, drm_bo->drm_map_ref); + } + drm_intel_bo_unreference(drm_bo->bo); + CL_OBJECT_DESTROY_BASE(drm_bo); + CL_FREE(drm_bo); +} + +LOCAL void * +cl_mem_gen_drm_bo_map(cl_mem_drm_bo drm_bo, cl_bool unsync) +{ + cl_bool already_sync = CL_FALSE; + void *ret_ptr = NULL; + + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + + CL_OBJECT_TAKE_OWNERSHIP(drm_bo, 1); + if (drm_bo->drm_map_ref != 0) { + assert(drm_bo->mapped_ptr != NULL); + assert(drm_bo->mapped_ptr == drm_bo->bo->virtual); + } else { + if (drm_bo->host_coherent == CL_TRUE) { + /* Host ptr never need call drm_map api */ + assert(drm_bo->tiling == CL_NO_TILE); + assert(drm_bo->mapped_ptr == drm_bo->bo->virtual); + } else if (drm_bo->tiling != CL_NO_TILE || unsync) { + drm_intel_gem_bo_map_unsynchronized(drm_bo->bo); + drm_bo->mapped_ptr = drm_bo->bo->virtual; + } else { + drm_intel_bo_map(drm_bo->bo, 1); // Always mapped write + already_sync = CL_TRUE; + drm_bo->mapped_ptr = drm_bo->bo->virtual; + } + assert(drm_bo->mapped_ptr != NULL); + } + + drm_bo->drm_map_ref++; + if (drm_bo->host_coherent == CL_TRUE) { + ret_ptr = drm_bo->mapped_ptr + drm_bo->in_page_offset; + } else { + ret_ptr = drm_bo->mapped_ptr; + } + + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + + if (unsync == CL_FALSE && already_sync == CL_FALSE) { + drm_intel_bo_wait_rendering(drm_bo->bo); + } + + assert(ret_ptr); + return ret_ptr; +} + +LOCAL void +cl_mem_gen_drm_bo_unmap(cl_mem_drm_bo drm_bo) +{ + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + + CL_OBJECT_TAKE_OWNERSHIP(drm_bo, 1); + drm_bo->drm_map_ref--; + assert(drm_bo->bo->virtual != NULL); + assert(drm_bo->mapped_ptr == drm_bo->bo->virtual); + assert(drm_bo->drm_map_ref >= 0); + + if (drm_bo->drm_map_ref == 0) { + if (drm_bo->host_coherent == CL_FALSE) { + drm_intel_bo_unmap(drm_bo->bo); + assert(drm_bo->bo->virtual == NULL); + drm_bo->mapped_ptr = NULL; + } + } + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); +} + +LOCAL void +cl_mem_gen_drm_bo_sync(cl_mem_drm_bo drm_bo) +{ + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + drm_intel_bo_wait_rendering(drm_bo->bo); +} + +LOCAL cl_bool +cl_mem_gen_drm_bo_expand(cl_mem_drm_bo drm_bo, size_t new_size, size_t alignment) +{ + drm_intel_bo *new_bo; + + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + + CL_OBJECT_TAKE_OWNERSHIP(drm_bo, 1); + if (drm_bo->drm_map_ref > 0) { // Someone still mapping it, can not do this + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return CL_FALSE; + } + + if (drm_bo->tiling != CL_NO_TILE) { /* Only support no tile mode */ + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return CL_FALSE; + } + + if (drm_bo->host_coherent == CL_TRUE) { /* If use host conherent ptr, can not expand */ + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return CL_FALSE; + } + + new_bo = 
drm_intel_bo_alloc(drm_bo->bo->bufmgr, "CL memory object", new_size, alignment); + if (new_bo == NULL) { + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return CL_FALSE; + } + + drm_intel_bo_wait_rendering(drm_bo->bo); + + drm_intel_bo_map(new_bo, 1); + void *dst = new_bo->virtual; + void *src = NULL; + if (drm_bo->host_coherent) { + src = drm_bo->mapped_ptr; + } else { + drm_intel_bo_map(drm_bo->bo, 1); + src = drm_bo->bo->virtual; + } + assert(src); + memset(dst, 0, new_size); + memcpy(dst, src, drm_bo->gpu_size); + + drm_intel_bo_unmap(new_bo); + if (drm_bo->host_coherent == CL_FALSE) { + drm_intel_bo_unmap(drm_bo->bo); + } + + /* Reset all field */ + drm_intel_bo_unreference(drm_bo->bo); + assert(drm_bo->drm_map_ref == 0); + drm_bo->bo = new_bo; + drm_bo->gpu_size = new_size; + drm_bo->host_coherent = CL_FALSE; + drm_bo->mapped_ptr = NULL; + drm_bo->in_page_offset = 0; + drm_bo->tiling = CL_NO_TILE; + drm_bo->stride = 0; + + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return CL_TRUE; +} + +LOCAL cl_bool +cl_mem_gen_drm_bo_upload_data(cl_mem_drm_bo drm_bo, size_t offset, void *data, size_t size) +{ + int err = 0; + assert(CL_OBJECT_IS_DRM_BO(drm_bo)); + assert(drm_bo->bo); + drm_intel_bo_wait_rendering(drm_bo->bo); + + CL_OBJECT_TAKE_OWNERSHIP(drm_bo, 1); + + if (drm_bo->host_coherent) { + assert(drm_bo->mapped_ptr); + assert(drm_bo->gpu_size >= offset + size); + if (drm_bo->mapped_ptr + drm_bo->in_page_offset + offset != data) + memcpy(drm_bo->mapped_ptr + drm_bo->in_page_offset + offset, data, size); + } else { + err = drm_intel_bo_subdata(drm_bo->bo, offset, size, data); + } + + CL_OBJECT_RELEASE_OWNERSHIP(drm_bo); + return (err == 0); +} + +#define LOCAL_SZ_0 16 +#define LOCAL_SZ_1 4 +#define LOCAL_SZ_2 4 + +static cl_int +cl_mem_copy_buffer_gen(cl_command_queue queue, cl_event event, cl_mem src_buf, + cl_mem dst_buf, size_t src_offset, size_t dst_offset, size_t cb) +{ + cl_int ret = CL_SUCCESS; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {1, 1, 1}; + const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff}; + int aligned = 0; + int dw_src_offset = src_offset / 4; + int dw_dst_offset = dst_offset / 4; + + /* We use one kernel to copy the data. The kernel is lazily created. */ + assert(src_buf->ctx == dst_buf->ctx); + + /* All 16 bytes aligned, fast and easy one. */ + if ((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_ALIGN16); + cb = cb / 16; + aligned = 1; + } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_ALIGN4); + cb = cb / 4; + aligned = 1; + } + + if (aligned) { + assert(ker); + + if (cb < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((cb + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &cb); + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Now handle the unaligned cases. */ + int dw_num = ((dst_offset % 4 + cb) + 3) / 4; + unsigned int first_mask = dst_offset % 4 == 0 ? 
0x0 : masks[dst_offset % 4]; + unsigned int last_mask = masks[(dst_offset + cb) % 4]; + /* handle the very small range copy. */ + if (cb < 4 && dw_num == 1) { + first_mask = first_mask | ~last_mask; + } + + if (cb < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + + if (src_offset % 4 == dst_offset % 4) { + /* Src and dst has the same unaligned offset, just handle the + header and tail. */ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET); + + assert(ker); + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Dst's offset < Src's offset, so one dst dword need two sequential src dwords to fill it. */ + if (dst_offset % 4 < src_offset % 4) { + int align_diff = src_offset % 4 - dst_offset % 4; + unsigned int dw_mask = masks[align_diff]; + int shift = align_diff * 8; + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET); + assert(ker); + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + cl_kernel_set_arg(ker, 7, sizeof(int), &shift); + cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* Dst's offset > Src's offset, so one dst dword need two sequential src - and src to fill it. */ + if (dst_offset % 4 > src_offset % 4) { + int align_diff = dst_offset % 4 - src_offset % 4; + unsigned int dw_mask = masks[4 - align_diff]; + int shift = align_diff * 8; + int src_less = !(src_offset % 4) && !((src_offset + cb) % 4); + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET); + assert(ker); + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); + cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); + cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); + cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); + cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); + cl_kernel_set_arg(ker, 7, sizeof(int), &shift); + cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); + cl_kernel_set_arg(ker, 9, sizeof(int), &src_less); + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + /* no case can hanldle? 
*/ + assert(0); +} + +LOCAL cl_int +cl_mem_enqueue_copy_buffer_gen(cl_event event, cl_int status) +{ + cl_int ret = CL_SUCCESS; + assert(event->exec_data.type == EnqueueCopyBuffer); + + if (event->exec_data.copy_buffer.cb == 0) // no need to do anything + return CL_SUCCESS; + + if (status == CL_QUEUED) { + ret = cl_mem_copy_buffer_gen(event->queue, event, event->exec_data.copy_buffer.src, + event->exec_data.copy_buffer.dst, + event->exec_data.copy_buffer.src_offset, + event->exec_data.copy_buffer.dst_offset, + event->exec_data.copy_buffer.cb); + return ret; + } + + if (status == CL_SUBMITTED) { + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx); + return ret; + } + + if (status == CL_RUNNING) { + /* Nothing to do */ + return CL_SUCCESS; + } + + assert(status == CL_COMPLETE); + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx); + return ret; +} + +LOCAL cl_int +cl_mem_enqueue_fill_buffer_gen(cl_event event, cl_int status) +{ + cl_int ret = CL_SUCCESS; + assert(event->exec_data.type == EnqueueFillBuffer); + + if (event->exec_data.fill_buffer.size == 0) // no need to do anything + return CL_SUCCESS; + + if (status == CL_QUEUED) { + cl_command_queue queue = event->queue; + const void *pattern = event->exec_data.fill_buffer.pattern; + size_t pattern_size = event->exec_data.fill_buffer.pattern_size; + cl_mem buffer = event->exec_data.fill_buffer.buffer; + size_t offset = event->exec_data.fill_buffer.offset; + size_t size = event->exec_data.fill_buffer.size; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {1, 1, 1}; + char pattern_comb[4]; + int is_128 = 0; + const void *pattern1 = NULL; + + assert(offset % pattern_size == 0); + assert(size % pattern_size == 0); + + if (!size) + return ret; + + if (pattern_size == 128) { + /* 128 is according to pattern of double16, but double works not very + well on some platform. We use two float16 to handle this. */ + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN128); + is_128 = 1; + pattern_size = pattern_size / 2; + pattern1 = pattern + pattern_size; + size = size / 2; + } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */ + int order = ffs(pattern_size / 8) - 1; + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order); + } else if (pattern_size == 4) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN4); + } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) { + /* The unaligned case. But if copy size and offset are aligned to 4, we can fake + the pattern with the pattern duplication fill in. */ + assert(pattern_size == 1 || pattern_size == 2); + + if (pattern_size == 2) { + memcpy(pattern_comb, pattern, sizeof(char) * 2); + memcpy(pattern_comb + 2, pattern, sizeof(char) * 2); + } else { + pattern_comb[0] = pattern_comb[1] = pattern_comb[2] = pattern_comb[3] = *(char *)pattern; + } + + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN4); + pattern_size = 4; + pattern = pattern_comb; + } + //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel + //functions. This depend on the usage but now we just use aligned 1 and 2. 
+ else if (pattern_size == 2) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_ALIGN2); + } else if (pattern_size == 1) { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, + CL_ENQUEUE_FILL_BUFFER_UNALIGN); + } else + assert(0); + + assert(ker); + + size = size / pattern_size; + offset = offset / pattern_size; + + if (size < LOCAL_SZ_0) { + local_sz[0] = 1; + } else { + local_sz[0] = LOCAL_SZ_0; + } + global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer); + cl_kernel_set_arg(ker, 1, pattern_size, pattern); + cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset); + cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size); + if (is_128) + cl_kernel_set_arg(ker, 4, pattern_size, pattern1); + + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + if (status == CL_SUBMITTED) { + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx); + return ret; + } + + if (status == CL_RUNNING) { + /* Nothing to do */ + return CL_SUCCESS; + } + + assert(status == CL_COMPLETE); + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx); + return ret; +} + +LOCAL cl_int +cl_mem_enqueue_copy_buffer_rect_gen(cl_event event, cl_int status) +{ + cl_int ret = CL_SUCCESS; + assert(event->exec_data.type == EnqueueCopyBufferRect); + + if (status == CL_QUEUED) { + cl_command_queue queue = event->queue; + cl_mem src_buf = event->exec_data.copy_buffer_rect.src_buf; + cl_mem dst_buf = event->exec_data.copy_buffer_rect.dst_buf; + const size_t *src_origin = event->exec_data.copy_buffer_rect.src_origin; + const size_t *dst_origin = event->exec_data.copy_buffer_rect.dst_origin; + const size_t *region = event->exec_data.copy_buffer_rect.region; + size_t src_row_pitch = event->exec_data.copy_buffer_rect.src_row_pitch; + size_t src_slice_pitch = event->exec_data.copy_buffer_rect.src_slice_pitch; + size_t dst_row_pitch = event->exec_data.copy_buffer_rect.dst_row_pitch; + size_t dst_slice_pitch = event->exec_data.copy_buffer_rect.dst_slice_pitch; + cl_kernel ker = NULL; + size_t global_off[] = {0, 0, 0}; + size_t global_sz[] = {1, 1, 1}; + size_t local_sz[] = {LOCAL_SZ_0, LOCAL_SZ_1, LOCAL_SZ_1}; + + // the src and dst mem rect is continuous, the copy is degraded to buf copy + if ((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) && + (region[1] * src_row_pitch == src_slice_pitch) && + (region[1] * dst_row_pitch == dst_slice_pitch)) { + cl_int src_offset = src_origin[2] * src_slice_pitch + + src_origin[1] * src_row_pitch + src_origin[0]; + cl_int dst_offset = dst_origin[2] * dst_slice_pitch + + dst_origin[1] * dst_row_pitch + dst_origin[0]; + cl_int size = region[0] * region[1] * region[2]; + ret = cl_mem_copy_buffer_gen(queue, event, src_buf, dst_buf, src_offset, dst_offset, size); + return ret; + } + + if (region[1] == 1) + local_sz[1] = 1; + if (region[2] == 1) + local_sz[2] = 1; + global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; + global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; + global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; + cl_int src_offset = src_origin[2] * src_slice_pitch + src_origin[1] * src_row_pitch + src_origin[0]; + cl_int dst_offset = dst_origin[2] * dst_slice_pitch + dst_origin[1] * dst_row_pitch + dst_origin[0]; + + /* We use one kernel to copy the 
data. The kernel is lazily created. */ + assert(src_buf->ctx == dst_buf->ctx); + + /* setup the kernel and run. */ + size_t region0 = region[0]; + if ((src_offset % 4 == 0) && (dst_offset % 4 == 0) && + (src_row_pitch % 4 == 0) && (dst_row_pitch % 4 == 0) && + (src_slice_pitch % 4 == 0) && (dst_slice_pitch % 4 == 0) && (region0 % 4 == 0)) { + region0 /= 4; + src_offset /= 4; + dst_offset /= 4; + src_row_pitch /= 4; + dst_row_pitch /= 4; + src_slice_pitch /= 4; + dst_slice_pitch /= 4; + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4); + } else { + ker = cl_context_get_builtin_kernel_gen(queue->ctx, queue->device, CL_ENQUEUE_COPY_BUFFER_RECT); + } + + assert(ker); + + cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); + cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf); + cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion0); + cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); + cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); + cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset); + cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset); + cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch); + cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch); + cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch); + cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch); + + ret = cl_command_queue_ND_range_wrap(queue, ker, event, 1, global_off, global_sz, local_sz); + return ret; + } + + if (status == CL_SUBMITTED) { + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_flush_gpgpu(event->exec_data.exec_ctx); + return ret; + } + + if (status == CL_RUNNING) { + /* Nothing to do */ + return CL_SUCCESS; + } + + assert(status == CL_COMPLETE); + assert(event->exec_data.exec_ctx); + ret = cl_command_queue_finish_gpgpu(event->exec_data.exec_ctx); + return ret; +} + +static cl_int +cl_mem_allocate_pipe_gen(cl_device_id device, cl_mem mem) +{ + cl_context_gen ctx_gen; + cl_mem_gen mem_gen; + size_t alignment = 64; + size_t total_sz; + cl_uint *ptr = NULL; + cl_mem_pipe pipe = NULL; + + assert(mem->size != 0); + DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen); + assert(ctx_gen); + + mem_gen = CL_CALLOC(1, sizeof(_cl_mem_gen)); + if (mem_gen == NULL) + return CL_OUT_OF_HOST_MEMORY; + + mem_gen->mem_base.device = device; + mem->each_device[0] = (cl_mem_for_device)mem_gen; + + total_sz = mem->size; + /* HSW: Byte scattered Read/Write has limitation that + the buffer size must be a multiple of 4 bytes. 
*/ + total_sz = ALIGN(total_sz, 4); + //The head of pipe is for data struct, and alignment to 128 byte for max data type double16 + total_sz += 128; + + mem_gen->drm_bo = cl_mem_gen_create_drm_bo(ctx_gen->drv->bufmgr, total_sz, alignment, + CL_NO_TILE, 0, NULL); + assert(mem_gen->drm_bo); + + pipe = cl_mem_to_pipe(mem); + + ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE); + assert(ptr); + ptr[0] = pipe->max_packets; + ptr[1] = pipe->packet_size; + ptr[2] = 0; //write ptr + ptr[3] = 0; //read ptr + ptr[4] = 0; //reservation read ptr + ptr[5] = 0; //reservation write ptr + ptr[6] = 0; //packet num + cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo); + + return CL_SUCCESS; +} + +static cl_int +cl_mem_allocate_buffer_gen(cl_device_id device, cl_mem mem) +{ + cl_context_gen ctx_gen; + cl_mem_gen mem_gen; + size_t alignment = 64; + + assert(mem->size != 0); + DEV_PRIVATE_DATA(mem->ctx, device, ctx_gen); + assert(ctx_gen); + + mem_gen = CL_CALLOC(1, sizeof(_cl_mem_gen)); + if (mem_gen == NULL) + return CL_OUT_OF_HOST_MEMORY; + + mem_gen->mem_base.device = device; + mem->each_device[0] = (cl_mem_for_device)mem_gen; + + /* Pinning will require stricter alignment rules */ + if (mem->flags & CL_MEM_PINNABLE) + alignment = 4096; + + if (mem->flags & CL_MEM_USE_HOST_PTR && device->host_unified_memory) { + if (cl_mem_to_buffer(mem)->svm_buf) { + cl_mem svm = cl_mem_to_buffer(mem)->svm_buf; + cl_mem_gen svm_gen; + assert(CL_OBJECT_IS_SVM(svm)); + DEV_PRIVATE_DATA(svm, device, svm_gen); + mem_gen->drm_bo = svm_gen->drm_bo; + cl_mem_gen_drm_bo_ref(mem_gen->drm_bo); + } else { + mem_gen->drm_bo = cl_mem_gen_create_drm_bo_from_hostptr( + ctx_gen->drv->bufmgr, CL_FALSE, mem->size, device->global_mem_cache_line_size, mem->host_ptr); + if (mem_gen->drm_bo == NULL) + mem_gen->drm_bo = cl_mem_gen_create_drm_bo(ctx_gen->drv->bufmgr, mem->size, alignment, + CL_NO_TILE, 0, mem->host_ptr); + } + } else { + mem_gen->drm_bo = cl_mem_gen_create_drm_bo(ctx_gen->drv->bufmgr, mem->size, alignment, + CL_NO_TILE, 0, mem->host_ptr); + } + assert(mem_gen->drm_bo); + + if (mem->flags & CL_MEM_COPY_HOST_PTR) { + assert(mem->host_ptr); + mem->host_ptr = NULL; + } + + return CL_SUCCESS; +} + +/* We hold mem->ownership when call this */ +LOCAL cl_int +cl_mem_allocate_gen(cl_device_id device, cl_mem mem) +{ + cl_int err = CL_SUCCESS; + cl_mem_gen mem_gen = NULL; + + assert(!CL_OBJECT_IS_SVM(mem)); + + if (mem->each_device[0]) // Already allocate + return CL_SUCCESS; + + if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + /* Parent must have already allocated */ + assert(cl_mem_to_buffer(mem)->parent->base.each_device[0]); + if (cl_mem_to_buffer(mem)->parent->base.each_device[0]->device != device) { + /* Parent and sub buffer can not belong to different device */ + return CL_MEM_OBJECT_ALLOCATION_FAILURE; + } + + /* Just point to parent's private data */ + mem_gen = CL_CALLOC(1, sizeof(_cl_mem_gen)); + if (mem_gen == NULL) + return CL_OUT_OF_HOST_MEMORY; + + mem_gen->mem_base.device = device; + mem_gen->drm_bo = ((cl_mem_gen)(cl_mem_to_buffer(mem)->parent->base.each_device[0]))->drm_bo; + cl_mem_gen_drm_bo_ref(mem_gen->drm_bo); + mem->each_device[0] = (cl_mem_for_device)mem_gen; + } else if (CL_OBJECT_IS_BUFFER(mem)) { + err = cl_mem_allocate_buffer_gen(device, mem); + } else if (CL_OBJECT_IS_IMAGE(mem)) { + err = cl_mem_allocate_image_gen(device, mem); + } else if (CL_OBJECT_IS_PIPE(mem)) { + err = cl_mem_allocate_pipe_gen(device, mem); + } else { + assert(0); + } + + return err; +} + +LOCAL void +cl_mem_deallocate_gen(cl_device_id device, cl_mem 
mem) +{ + cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0]; + assert(!CL_OBJECT_IS_SVM(mem)); + + if (mem_gen == NULL) + return; + + assert(mem_gen->drm_bo); + cl_mem_gen_drm_bo_delete(mem_gen->drm_bo); + mem_gen->drm_bo = NULL; + CL_FREE(mem_gen); + mem->each_device[0] = NULL; +} + +LOCAL cl_int +cl_svm_create_gen(cl_device_id device, cl_mem svm_mem) +{ + cl_mem_gen mem_gen = NULL; + cl_context_gen ctx_gen; + int page_size; + page_size = getpagesize(); + cl_mem_svm svm = cl_mem_to_svm(svm_mem); + + DEV_PRIVATE_DATA(svm_mem->ctx, device, ctx_gen); + assert(ctx_gen); + + if (svm->real_size == 0 || ALIGN(svm->real_size, page_size) != svm->real_size) + return CL_DEVICE_MEM_BASE_ADDR_ALIGN; + + if (svm_mem->host_ptr == NULL || + ALIGN((size_t)svm_mem->host_ptr, page_size) != (size_t)svm_mem->host_ptr) + return CL_DEVICE_MEM_BASE_ADDR_ALIGN; + + mem_gen = CL_CALLOC(1, sizeof(_cl_mem_gen)); + if (mem_gen == NULL) + return CL_OUT_OF_HOST_MEMORY; + + mem_gen->mem_base.device = device; + mem_gen->drm_bo = cl_mem_gen_create_drm_bo_from_hostptr( + ctx_gen->drv->bufmgr, CL_TRUE, svm->real_size, + device->global_mem_cache_line_size, svm_mem->host_ptr); + + if (mem_gen->drm_bo == NULL) { + CL_FREE(mem_gen); + return CL_OUT_OF_RESOURCES; + } + + ASSIGN_DEV_PRIVATE_DATA(svm_mem, device, (cl_mem_for_device)mem_gen); + return CL_SUCCESS; +} + +LOCAL void +cl_svm_delete_gen(cl_device_id device, cl_mem svm_mem) +{ + cl_mem_gen mem_gen = NULL; + DEV_PRIVATE_DATA(svm_mem, device, mem_gen); + if (mem_gen == NULL) + return; + + assert(mem_gen->drm_bo); + cl_mem_gen_drm_bo_delete(mem_gen->drm_bo); + mem_gen->drm_bo = NULL; + CL_FREE(mem_gen); +} + +static cl_int +cl_enqueue_handle_map_buffer_gen(cl_event event, cl_int status) +{ + cl_mem mem = event->exec_data.map_buffer.mem_obj; + cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0]; + void *ptr = NULL; + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(event->exec_data.map_buffer.size <= mem->size); + + if (status == CL_SUBMITTED || status == CL_RUNNING) + return CL_SUCCESS; + + if (status == CL_QUEUED) { + ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, event->exec_data.map_buffer.unsync_map); + assert(ptr); + if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + ptr += cl_mem_to_buffer(mem)->sub_offset; + } + + ptr += event->exec_data.map_buffer.offset; + if (mem->flags & CL_MEM_USE_HOST_PTR) { + assert(mem->host_ptr); + event->exec_data.map_buffer.ptr = mem->host_ptr + event->exec_data.map_buffer.offset; + } else { + event->exec_data.map_buffer.ptr = ptr; + } + + event->exec_data.exec_ctx = ptr; // Find a place to store the mapped ptr temp + if (cl_mem_to_buffer(mem)->svm_buf) { + /* If from a svm, we never need to copy, always host coherent. 
*/ + assert(mem->flags & CL_MEM_USE_HOST_PTR); + event->exec_data.exec_ctx = event->exec_data.map_buffer.ptr; + } + + return CL_SUCCESS; + } + + assert(status == CL_COMPLETE); + + if (event->exec_data.map_buffer.unsync_map) + cl_mem_gen_drm_bo_sync(mem_gen->drm_bo); + + ptr = event->exec_data.exec_ctx; + assert(ptr); + + /* Sync back the data to host if fake USE_HOST_PTR */ + if ((mem->flags & CL_MEM_USE_HOST_PTR) && ptr != event->exec_data.map_buffer.ptr) { + /* Should never overlap with the real buffer mapped address */ + assert((ptr + event->exec_data.map_buffer.size <= event->exec_data.map_buffer.ptr) || + (event->exec_data.map_buffer.ptr + event->exec_data.map_buffer.size <= ptr)); + memcpy(event->exec_data.map_buffer.ptr, ptr, event->exec_data.map_buffer.size); + } + + return CL_SUCCESS; +} + +LOCAL cl_int +cl_enqueue_map_mem_gen(cl_event event, cl_int status) +{ + if (event->exec_data.type == EnqueueMapBuffer) + return cl_enqueue_handle_map_buffer_gen(event, status); + + if (event->exec_data.type == EnqueueMapImage) + return cl_enqueue_handle_map_image_gen(event, status); + + assert(0); + return CL_INVALID_VALUE; +} + +static cl_int +cl_enqueue_handle_unmap_buffer_gen(cl_event event, cl_int status) +{ + cl_mem mem = event->exec_data.unmap.mem_obj; + cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0]; + + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(event->exec_data.unmap.ptr); + assert(event->exec_data.unmap.size > 0); + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + /* Sync back the content if fake USE_HOST_PTR */ + if (mem->flags & CL_MEM_USE_HOST_PTR && + event->exec_data.unmap.ptr != mem->host_ptr + event->exec_data.unmap.offset) { + /* SVM never comes to here */ + assert(cl_mem_to_buffer(mem)->svm_buf == NULL); + assert(mem_gen->drm_bo->mapped_ptr); + + void *dst_ptr = mem_gen->drm_bo->mapped_ptr + event->exec_data.unmap.offset; + if (CL_OBJECT_IS_SUB_BUFFER(mem)) + dst_ptr += cl_mem_to_buffer(mem)->sub_offset; + + /* Should never overlap with the real buffer mapped address */ + assert((event->exec_data.unmap.ptr + event->exec_data.unmap.size <= dst_ptr) || + (dst_ptr + event->exec_data.unmap.size <= event->exec_data.unmap.ptr)); + memcpy(dst_ptr, event->exec_data.unmap.ptr, event->exec_data.unmap.size); + } + + cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo); + return CL_SUCCESS; +} + +LOCAL cl_int +cl_enqueue_unmap_mem_gen(cl_event event, cl_int status) +{ + assert(event->exec_data.type == EnqueueUnmapMemObject); + assert(CL_OBJECT_IS_MEM(event->exec_data.unmap.mem_obj)); + + if (CL_OBJECT_IS_BUFFER(event->exec_data.unmap.mem_obj)) + return cl_enqueue_handle_unmap_buffer_gen(event, status); + + if (CL_OBJECT_IS_IMAGE(event->exec_data.unmap.mem_obj)) + return cl_enqueue_handle_unmap_image_gen(event, status); + + assert(0); + return CL_INVALID_VALUE; +} + +LOCAL cl_int +cl_enqueue_read_buffer_gen(cl_event event, cl_int status) +{ + cl_mem_gen mem_gen = NULL; + void *data_ptr = NULL; + cl_mem mem = NULL; + + if (event->exec_data.type == EnqueueReadBuffer) { + mem = event->exec_data.read_write_buffer.buffer; + } else if (event->exec_data.type == EnqueueReadBufferRect) { + mem = event->exec_data.read_write_buffer_rect.buffer; + } else { + assert(0); + } + + mem_gen = (cl_mem_gen)mem->each_device[0]; + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(CL_OBJECT_IS_BUFFER(mem)); + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + data_ptr = 
cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE); + if (data_ptr == NULL) + return CL_OUT_OF_RESOURCES; + + if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + data_ptr += cl_mem_to_buffer(mem)->sub_offset; + } + + if (event->exec_data.type == EnqueueReadBuffer) { + /* sometimes, application invokes read buffer, instead of map buffer, even if userptr is enabled + memcpy is not necessary for this case */ + if (event->exec_data.read_write_buffer.ptr != (char *)data_ptr + event->exec_data.read_write_buffer.offset) { + memcpy(event->exec_data.read_write_buffer.ptr, + (char *)data_ptr + event->exec_data.read_write_buffer.offset, event->exec_data.read_write_buffer.size); + } + } else if (event->exec_data.type == EnqueueReadBufferRect) { + void *dst_ptr = NULL; + size_t *origin = event->exec_data.read_write_buffer_rect.origin; + size_t *host_origin = event->exec_data.read_write_buffer_rect.host_origin; + size_t *region = event->exec_data.read_write_buffer_rect.region; + size_t host_row_pitch = event->exec_data.read_write_buffer_rect.host_row_pitch; + size_t host_slice_pitch = event->exec_data.read_write_buffer_rect.host_slice_pitch; + size_t row_pitch = event->exec_data.read_write_buffer_rect.row_pitch; + size_t slice_pitch = event->exec_data.read_write_buffer_rect.slice_pitch; + size_t offset = origin[0] + row_pitch * origin[1] + slice_pitch * origin[2]; + data_ptr = (char *)data_ptr + offset; + offset = host_origin[0] + host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2]; + dst_ptr = (char *)event->exec_data.read_write_buffer_rect.ptr + offset; + + if (row_pitch == region[0] && row_pitch == host_row_pitch && + (region[2] == 1 || (slice_pitch == region[0] * region[1] && slice_pitch == host_slice_pitch))) { + memcpy(dst_ptr, data_ptr, region[2] == 1 ? row_pitch * region[1] : slice_pitch * region[2]); + } else { + cl_uint y, z; + for (z = 0; z < region[2]; z++) { + const char *src = data_ptr; + char *dst = dst_ptr; + for (y = 0; y < region[1]; y++) { + memcpy(dst, src, region[0]); + src += row_pitch; + dst += host_row_pitch; + } + data_ptr = (char *)data_ptr + slice_pitch; + dst_ptr = (char *)dst_ptr + host_slice_pitch; + } + } + } + + cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo); + return CL_SUCCESS; +} + +/* Write use subdata, no map need */ +LOCAL cl_int +cl_enqueue_write_buffer_gen(cl_event event, cl_int status) +{ + cl_mem mem = event->exec_data.read_write_buffer.buffer; + cl_mem_gen mem_gen = (cl_mem_gen)mem->each_device[0]; + + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(CL_OBJECT_IS_BUFFER(mem)); + assert(event->exec_data.type == EnqueueWriteBuffer); + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + size_t offset = event->exec_data.read_write_buffer.offset; + if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + offset += cl_mem_to_buffer(mem)->sub_offset; + } + + if (cl_mem_gen_drm_bo_upload_data(mem_gen->drm_bo, offset, event->exec_data.read_write_buffer.ptr, + event->exec_data.read_write_buffer.size) == CL_TRUE) { + return CL_SUCCESS; + } + + return CL_OUT_OF_RESOURCES; +} + +LOCAL cl_int +cl_enqueue_write_buffer_rect_gen(cl_event event, cl_int status) +{ + cl_mem_gen mem_gen = NULL; + void *data_ptr = NULL; + cl_mem mem = event->exec_data.read_write_buffer_rect.buffer; + + mem_gen = (cl_mem_gen)mem->each_device[0]; + assert(mem_gen); + assert(mem_gen->drm_bo); + assert(CL_OBJECT_IS_BUFFER(mem)); + assert(event->exec_data.type == EnqueueWriteBufferRect); + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return 
CL_SUCCESS; + + void *src_ptr = NULL; + size_t *origin = event->exec_data.read_write_buffer_rect.origin; + size_t *host_origin = event->exec_data.read_write_buffer_rect.host_origin; + size_t *region = event->exec_data.read_write_buffer_rect.region; + size_t host_row_pitch = event->exec_data.read_write_buffer_rect.host_row_pitch; + size_t host_slice_pitch = event->exec_data.read_write_buffer_rect.host_slice_pitch; + size_t row_pitch = event->exec_data.read_write_buffer_rect.row_pitch; + size_t slice_pitch = event->exec_data.read_write_buffer_rect.slice_pitch; + + data_ptr = cl_mem_gen_drm_bo_map(mem_gen->drm_bo, CL_FALSE); + if (data_ptr == NULL) + return CL_OUT_OF_RESOURCES; + + if (CL_OBJECT_IS_SUB_BUFFER(mem)) { + data_ptr += cl_mem_to_buffer(mem)->sub_offset; + } + + size_t offset = origin[0] + row_pitch * origin[1] + slice_pitch * origin[2]; + data_ptr = (char *)data_ptr + offset; + + offset = host_origin[0] + host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2]; + src_ptr = (char *)event->exec_data.read_write_buffer_rect.ptr + offset; + + if (row_pitch == region[0] && row_pitch == host_row_pitch && + (region[2] == 1 || (slice_pitch == region[0] * region[1] && slice_pitch == host_slice_pitch))) { + memcpy(data_ptr, src_ptr, region[2] == 1 ? row_pitch * region[1] : slice_pitch * region[2]); + } else { + cl_uint y, z; + for (z = 0; z < region[2]; z++) { + const char *src = src_ptr; + char *dst = data_ptr; + for (y = 0; y < region[1]; y++) { + memcpy(dst, src, region[0]); + src += host_row_pitch; + dst += row_pitch; + } + src_ptr = (char *)src_ptr + host_slice_pitch; + data_ptr = (char *)data_ptr + slice_pitch; + } + } + + cl_mem_gen_drm_bo_unmap(mem_gen->drm_bo); + return CL_SUCCESS; +} + +LOCAL cl_int +cl_enqueue_svm_map_gen(cl_event event, cl_int status) +{ + cl_command_queue queue = event->queue; + cl_mem_gen mem_gen = NULL; + cl_mem mem = event->exec_data.svm_map.svm; + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + DEV_PRIVATE_DATA(mem, queue->device, mem_gen); + assert(mem_gen->drm_bo); + assert(mem_gen->drm_bo->svm); + assert(mem_gen->drm_bo->gpu_size >= event->exec_data.svm_map.size); + + cl_mem_gen_drm_bo_sync(mem_gen->drm_bo); + return CL_SUCCESS; +} + +LOCAL cl_int +cl_enqueue_svm_unmap_gen(cl_event event, cl_int status) +{ + cl_command_queue queue = event->queue; + cl_mem_gen mem_gen = NULL; + cl_mem mem = event->exec_data.svm_unmap.svm; + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + DEV_PRIVATE_DATA(mem, queue->device, mem_gen); + assert(mem_gen->drm_bo); + assert(mem_gen->drm_bo->svm); + + cl_mem_gen_drm_bo_sync(mem_gen->drm_bo); + return CL_SUCCESS; +} + +LOCAL cl_int +cl_enqueue_svm_fill_gen(cl_event event, cl_int status) +{ + cl_command_queue queue = event->queue; + cl_mem_gen mem_gen = NULL; + cl_mem mem = event->exec_data.svm_fill.svm; + size_t i, j; + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + DEV_PRIVATE_DATA(mem, queue->device, mem_gen); + assert(mem_gen->drm_bo); + assert(mem_gen->drm_bo->svm); + assert(event->exec_data.svm_fill.ptr >= mem->host_ptr); + + cl_mem_gen_drm_bo_sync(mem_gen->drm_bo); + + for (i = 0; i < event->exec_data.svm_fill.size;) { + for (j = 0; j < event->exec_data.svm_fill.pattern_size; j++) { + ((char *)event->exec_data.svm_fill.ptr)[i++] = + ((char *)event->exec_data.svm_fill.pattern)[j]; + } + } + + return CL_SUCCESS; +} + +LOCAL cl_int 
+cl_enqueue_svm_copy_gen(cl_event event, cl_int status) +{ + cl_command_queue queue = event->queue; + cl_mem_gen src_mem_gen = NULL; + cl_mem_gen dst_mem_gen = NULL; + cl_mem src_mem = event->exec_data.svm_copy.src; + cl_mem dst_mem = event->exec_data.svm_copy.dst; + + if (status == CL_QUEUED || status == CL_RUNNING || status == CL_SUBMITTED) + return CL_SUCCESS; + + DEV_PRIVATE_DATA(src_mem, queue->device, src_mem_gen); + DEV_PRIVATE_DATA(dst_mem, queue->device, dst_mem_gen); + assert(event->exec_data.svm_copy.src_ptr >= src_mem->host_ptr); + assert(event->exec_data.svm_copy.dst_ptr >= dst_mem->host_ptr); + + cl_mem_gen_drm_bo_sync(src_mem_gen->drm_bo); + cl_mem_gen_drm_bo_sync(dst_mem_gen->drm_bo); + + memcpy(event->exec_data.svm_copy.dst_ptr, + event->exec_data.svm_copy.src_ptr, event->exec_data.svm_copy.size); + return CL_SUCCESS; +} |
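
For readers of this patch, a minimal sketch of how the new drm_bo helpers fit together (create, map, fill on the CPU, unmap, release). This is illustrative only and not part of the patch: the scratch-buffer function is hypothetical, and the bufmgr handle would come from the driver context (e.g. ctx_gen->drv->bufmgr, as cl_mem_allocate_buffer_gen uses above).

#include <string.h>
#include "cl_gen.h"

/* Hypothetical helper: allocate a linear BO, zero it from the CPU, release it.
   Mirrors the create/map/unmap/delete sequence used by the allocate and
   enqueue paths in cl_mem_gen.c. */
static cl_bool
fill_scratch_bo(dri_bufmgr *bufmgr, size_t size)
{
  /* Untiled, 64-byte aligned, no initial data. The size is rounded up to a
     multiple of 4 bytes inside cl_mem_gen_create_drm_bo (HSW byte-scattered
     read/write limitation). */
  cl_mem_drm_bo bo = cl_mem_gen_create_drm_bo(bufmgr, size, 64, CL_NO_TILE, 0, NULL);
  if (bo == NULL)
    return CL_FALSE;

  /* unsync == CL_FALSE: the map waits for any pending GPU rendering. */
  void *ptr = cl_mem_gen_drm_bo_map(bo, CL_FALSE);
  memset(ptr, 0, size);

  /* Drop the map reference; the CPU mapping is torn down when it reaches 0. */
  cl_mem_gen_drm_bo_unmap(bo);

  /* Drop the object reference; the BO is freed once the last holder lets go
     (sub-buffers and fake USE_HOST_PTR objects share the same drm_bo). */
  cl_mem_gen_drm_bo_delete(bo);
  return CL_TRUE;
}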