/* * Copyright © 2012 Intel Corporation * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see . * * Author: Benjamin Segovia */ #include "cl_mem.h" #include "cl_image.h" #include "cl_context.h" #include "cl_utils.h" #include "cl_alloc.h" #include "cl_device_id.h" #include "cl_driver.h" #include "cl_khr_icd.h" #include "cl_kernel.h" #include "cl_command_queue.h" #include "cl_cmrt.h" #include "cl_enqueue.h" #include "CL/cl.h" #include "CL/cl_intel.h" #include #include #include #include #include #define FIELD_SIZE(CASE,TYPE) \ case JOIN(CL_,CASE): \ if(param_value_size_ret) \ *param_value_size_ret = sizeof(TYPE); \ if(!param_value) \ return CL_SUCCESS; \ if(param_value_size < sizeof(TYPE)) \ return CL_INVALID_VALUE; \ break; #define MAX_TILING_SIZE 128 * MB LOCAL cl_mem_object_type cl_get_mem_object_type(cl_mem mem) { switch (mem->type) { case CL_MEM_BUFFER_TYPE: case CL_MEM_SUBBUFFER_TYPE: return CL_MEM_OBJECT_BUFFER; case CL_MEM_IMAGE_TYPE: case CL_MEM_GL_IMAGE_TYPE: { struct _cl_mem_image *image = cl_mem_image(mem); return image->image_type; } default: return CL_MEM_OBJECT_BUFFER; } } LOCAL cl_int cl_get_pipe_info(cl_mem mem, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { _cl_mem_pipe *pipe; switch(param_name) { FIELD_SIZE(PIPE_PACKET_SIZE, cl_uint); FIELD_SIZE(PIPE_MAX_PACKETS, cl_uint); default: return CL_INVALID_VALUE; } if(mem->type != CL_MEM_PIPE_TYPE) return CL_INVALID_MEM_OBJECT; pipe = cl_mem_pipe(mem); switch(param_name) { case CL_PIPE_PACKET_SIZE: *((cl_uint *)param_value) = pipe->packet_size; break; case CL_PIPE_MAX_PACKETS: *((cl_uint *)param_value) = pipe->max_packets; break; } return CL_SUCCESS; } LOCAL cl_mem cl_mem_allocate(enum cl_mem_type type, cl_context ctx, cl_mem_flags flags, size_t sz, cl_int is_tiled, void *host_ptr, //pointer from application cl_mem buffer, //image2D from buffer cl_int *errcode) { cl_buffer_mgr bufmgr = NULL; cl_mem mem = NULL; cl_int err = CL_SUCCESS; size_t alignment = 64; assert(ctx); /* Allocate and inialize the structure itself */ if (type == CL_MEM_IMAGE_TYPE) { struct _cl_mem_image *image = NULL; TRY_ALLOC (image, CALLOC(struct _cl_mem_image)); mem = &image->base; } else if (type == CL_MEM_GL_IMAGE_TYPE ) { struct _cl_mem_gl_image *gl_image = NULL; TRY_ALLOC (gl_image, CALLOC(struct _cl_mem_gl_image)); mem = &gl_image->base.base; } else if (type == CL_MEM_BUFFER1D_IMAGE_TYPE) { struct _cl_mem_buffer1d_image *buffer1d_image = NULL; TRY_ALLOC(buffer1d_image, CALLOC(struct _cl_mem_buffer1d_image)); mem = &buffer1d_image->base.base; } else if (type == CL_MEM_PIPE_TYPE) { _cl_mem_pipe *pipe = NULL; TRY_ALLOC(pipe, CALLOC(struct _cl_mem_pipe)); mem = &pipe->base; } else { struct _cl_mem_buffer *buffer = NULL; TRY_ALLOC (buffer, CALLOC(struct _cl_mem_buffer)); mem = &buffer->base; } CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC); list_init(&mem->dstr_cb_head); mem->type = type; mem->flags = flags; mem->is_userptr = 0; mem->offset = 0; 
mem->is_svm = 0; mem->cmrt_mem = NULL; if (mem->type == CL_MEM_IMAGE_TYPE) { cl_mem_image(mem)->is_image_from_buffer = 0; cl_mem_image(mem)->is_image_from_nv12_image = 0; cl_mem_image(mem)->is_ker_copy = 0; cl_mem_image(mem)->tmp_ker_buf = NULL; } if (sz != 0) { /* Pinning will require stricter alignment rules */ if ((flags & CL_MEM_PINNABLE) || is_tiled) alignment = 4096; /* Allocate space in memory */ bufmgr = cl_context_get_bufmgr(ctx); assert(bufmgr); #ifdef HAS_USERPTR uint8_t bufCreated = 0; if (ctx->devices[0]->host_unified_memory) { int page_size = getpagesize(); int cacheline_size = 0; cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL); if (type == CL_MEM_BUFFER_TYPE) { if (flags & CL_MEM_USE_HOST_PTR) { assert(host_ptr != NULL); cl_mem svm_mem = NULL; if((svm_mem = cl_context_get_svm_from_ptr(ctx, host_ptr)) != NULL) mem->is_svm = 1; /* userptr not support tiling */ if (!is_tiled) { if(svm_mem != NULL) { //SVM always paged alignment mem->offset = 0; mem->is_userptr = 1; mem->bo = svm_mem->bo; cl_mem_add_ref(svm_mem); bufCreated = 1; } else if ((ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr) && (ALIGN((unsigned long)sz, cacheline_size) == (unsigned long)sz)) { void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1))); mem->offset = host_ptr - aligned_host_ptr; mem->is_userptr = 1; size_t aligned_sz = ALIGN((mem->offset + sz), page_size); mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0); bufCreated = 1; } } } else if (flags & CL_MEM_ALLOC_HOST_PTR) { const size_t alignedSZ = ALIGN(sz, page_size); void* internal_host_ptr = cl_aligned_malloc(alignedSZ, page_size); mem->host_ptr = internal_host_ptr; mem->is_userptr = 1; mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", internal_host_ptr, alignedSZ, 0); bufCreated = 1; } } else if (type == CL_MEM_IMAGE_TYPE) { if (host_ptr != NULL) { assert(flags & CL_MEM_USE_HOST_PTR); assert(!is_tiled); assert(ALIGN((unsigned long)host_ptr, cacheline_size) == (unsigned long)host_ptr); void* aligned_host_ptr = (void*)(((unsigned long)host_ptr) & (~(page_size - 1))); mem->offset = host_ptr - aligned_host_ptr; mem->is_userptr = 1; size_t aligned_sz = ALIGN((mem->offset + sz), page_size); mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", aligned_host_ptr, aligned_sz, 0); bufCreated = 1; } } } if(type == CL_MEM_IMAGE_TYPE && buffer != NULL) { // if create image from USE_HOST_PTR buffer, the buffer's base address need be aligned. if(buffer->is_userptr) { int base_alignement = 0; cl_get_device_info(ctx->devices[0], CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT, sizeof(base_alignement), &base_alignement, NULL); if(ALIGN((unsigned long)buffer->host_ptr, base_alignement) != (unsigned long)buffer->host_ptr) { err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; goto error; } } // if the image if created from buffer, should use the bo directly to share same bo. mem->bo = buffer->bo; if (IS_IMAGE(buffer) && cl_mem_image(buffer)->fmt.image_channel_order == CL_NV12_INTEL) { cl_mem_image(mem)->is_image_from_nv12_image = 1; } else { cl_mem_image(mem)->is_image_from_buffer = 1; } bufCreated = 1; } if (!bufCreated) mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); #else if(type == CL_MEM_IMAGE_TYPE && buffer != NULL) { // if the image if created from buffer, should use the bo directly to share same bo. 
mem->bo = buffer->bo; cl_mem_image(mem)->is_image_from_buffer = 1; } else mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); #endif if (UNLIKELY(mem->bo == NULL)) { err = CL_MEM_OBJECT_ALLOCATION_FAILURE; goto error; } mem->size = sz; } /* Append the buffer in the context buffer list */ cl_context_add_mem(ctx, mem); exit: if (errcode) *errcode = err; return mem; error: cl_mem_delete(mem); mem = NULL; goto exit; } LOCAL cl_int cl_mem_is_valid(cl_mem mem, cl_context ctx) { struct list_node *pos; cl_base_object pbase_object; CL_OBJECT_LOCK(ctx); list_for_each (pos, (&ctx->mem_objects)) { pbase_object = list_entry(pos, _cl_base_object, node); if (pbase_object == (cl_base_object)mem) { if (UNLIKELY(!CL_OBJECT_IS_MEM(mem))) { CL_OBJECT_UNLOCK(ctx); return CL_INVALID_MEM_OBJECT; } CL_OBJECT_UNLOCK(ctx); return CL_SUCCESS; } } CL_OBJECT_UNLOCK(ctx); return CL_INVALID_MEM_OBJECT; } LOCAL cl_mem cl_mem_new_buffer(cl_context ctx, cl_mem_flags flags, size_t sz, void *data, cl_int *errcode_ret) { /* Possible mem type combination: CL_MEM_ALLOC_HOST_PTR CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR CL_MEM_USE_HOST_PTR CL_MEM_COPY_HOST_PTR */ cl_int err = CL_SUCCESS; cl_mem mem = NULL; cl_ulong max_mem_size; if (UNLIKELY(sz == 0)) { err = CL_INVALID_BUFFER_SIZE; goto error; } if (UNLIKELY(((flags & CL_MEM_READ_WRITE) && (flags & (CL_MEM_READ_ONLY | CL_MEM_WRITE_ONLY))) || ((flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_WRITE_ONLY))) || ((flags & CL_MEM_ALLOC_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR)) || ((flags & CL_MEM_COPY_HOST_PTR) && (flags & CL_MEM_USE_HOST_PTR)) || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)) || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY)) || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)) || ((flags & (~(CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) != 0))) { err = CL_INVALID_VALUE; goto error; } /* This flag is valid only if host_ptr is not NULL */ if (UNLIKELY((((flags & CL_MEM_COPY_HOST_PTR) || (flags & CL_MEM_USE_HOST_PTR)) && data == NULL)) || (!(flags & (CL_MEM_COPY_HOST_PTR |CL_MEM_USE_HOST_PTR)) && (data != NULL))) { err = CL_INVALID_HOST_PTR; goto error; } if ((err = cl_get_device_info(ctx->devices[0], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_size), &max_mem_size, NULL)) != CL_SUCCESS) { goto error; } if (UNLIKELY(sz > max_mem_size)) { err = CL_INVALID_BUFFER_SIZE; goto error; } /* HSW: Byte scattered Read/Write has limitation that the buffer size must be a multiple of 4 bytes. 
*/ sz = ALIGN(sz, 4); /* Create the buffer in video memory */ mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, NULL, &err); if (mem == NULL || err != CL_SUCCESS) goto error; /* Copy the data if required */ if (flags & CL_MEM_COPY_HOST_PTR) { if (mem->is_userptr) memcpy(mem->host_ptr, data, sz); else cl_buffer_subdata(mem->bo, 0, sz, data); } if ((flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) cl_buffer_subdata(mem->bo, 0, sz, data); if (flags & CL_MEM_USE_HOST_PTR) mem->host_ptr = data; exit: if (errcode_ret) *errcode_ret = err; return mem; error: cl_mem_delete(mem); mem = NULL; goto exit; } LOCAL cl_mem cl_mem_new_sub_buffer(cl_mem buffer, cl_mem_flags flags, cl_buffer_create_type create_type, const void *create_info, cl_int *errcode_ret) { cl_int err = CL_SUCCESS; cl_mem mem = NULL; struct _cl_mem_buffer *sub_buf = NULL; if (buffer->type != CL_MEM_BUFFER_TYPE) { err = CL_INVALID_MEM_OBJECT; goto error; } if (flags && (((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY))) || ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY))) || (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR)) || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)) || ((flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY)) || ((flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_NO_ACCESS)))) { err = CL_INVALID_VALUE; goto error; } if((flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE)) == 0) { flags |= buffer->flags & (CL_MEM_WRITE_ONLY | CL_MEM_READ_ONLY | CL_MEM_READ_WRITE); } flags |= buffer->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR); if((flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) == 0) { flags |= buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS); } if (create_type != CL_BUFFER_CREATE_TYPE_REGION) { err = CL_INVALID_VALUE; goto error; } if (!create_info) { err = CL_INVALID_VALUE; goto error; } cl_buffer_region *info = (cl_buffer_region *)create_info; if (!info->size) { err = CL_INVALID_BUFFER_SIZE; goto error; } if (info->origin > buffer->size || info->origin + info->size > buffer->size) { err = CL_INVALID_VALUE; goto error; } if (info->origin & (buffer->ctx->devices[0]->mem_base_addr_align / 8 - 1)) { err = CL_MISALIGNED_SUB_BUFFER_OFFSET; goto error; } /* Now create the sub buffer and link it to the buffer. 
*/ TRY_ALLOC (sub_buf, CALLOC(struct _cl_mem_buffer)); mem = &sub_buf->base; CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC); list_init(&mem->dstr_cb_head); mem->type = CL_MEM_SUBBUFFER_TYPE; mem->flags = flags; mem->offset = buffer->offset; mem->is_userptr = buffer->is_userptr; sub_buf->parent = (struct _cl_mem_buffer*)buffer; cl_mem_add_ref(buffer); /* Append the buffer in the parent buffer list */ pthread_mutex_lock(&((struct _cl_mem_buffer*)buffer)->sub_lock); sub_buf->sub_next = ((struct _cl_mem_buffer*)buffer)->subs; if (((struct _cl_mem_buffer*)buffer)->subs != NULL) ((struct _cl_mem_buffer*)buffer)->subs->sub_prev = sub_buf; ((struct _cl_mem_buffer*)buffer)->subs = sub_buf; pthread_mutex_unlock(&((struct _cl_mem_buffer*)buffer)->sub_lock); mem->bo = buffer->bo; mem->size = info->size; sub_buf->sub_offset = info->origin; if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_COPY_HOST_PTR || buffer->flags & CL_MEM_ALLOC_HOST_PTR) { mem->host_ptr = buffer->host_ptr; } /* Append the buffer in the context buffer list */ cl_context_add_mem(buffer->ctx, mem); exit: if (errcode_ret) *errcode_ret = err; return mem; error: cl_mem_delete(mem); mem = NULL; goto exit; } cl_mem cl_mem_new_pipe(cl_context ctx, cl_mem_flags flags, cl_uint packet_size, cl_uint max_packets, cl_int *errcode_ret) { _cl_mem_pipe* pipe = NULL; cl_uint *ptr = NULL; cl_mem mem = NULL; cl_int err; cl_uint sz; if(UNLIKELY((pipe = CALLOC(_cl_mem_pipe)) == NULL)) { err = CL_OUT_OF_RESOURCES; goto error; } sz = packet_size * max_packets; assert(sz != 0); /* HSW: Byte scattered Read/Write has limitation that the buffer size must be a multiple of 4 bytes. */ sz = ALIGN(sz, 4); sz += 128; //The head of pipe is for data struct, and alignment to 128 byte for max data type double16 mem = cl_mem_allocate(CL_MEM_PIPE_TYPE, ctx, flags, sz, CL_FALSE,NULL , NULL, &err); if (mem == NULL || err != CL_SUCCESS) goto error; ptr = cl_mem_map_auto(mem, 1); if(ptr == NULL){ err = CL_OUT_OF_RESOURCES; goto error; } ptr[0] = max_packets; ptr[1] = packet_size; ptr[2] = 0; //write ptr ptr[3] = 0; //read ptr ptr[4] = 0; //reservation read ptr ptr[5] = 0; //reservation write ptr ptr[6] = 0; //packet num cl_mem_unmap(mem); pipe = cl_mem_pipe(mem); pipe->flags = flags; pipe->packet_size = packet_size; pipe->max_packets = max_packets; return mem; exit: if (errcode_ret) *errcode_ret = err; return mem; error: cl_mem_delete(mem); mem = NULL; goto exit; } void cl_mem_replace_buffer(cl_mem buffer, cl_buffer new_bo) { cl_buffer_unreference(buffer->bo); buffer->bo = new_bo; cl_buffer_reference(new_bo); if (buffer->type != CL_MEM_SUBBUFFER_TYPE) return; struct _cl_mem_buffer *it = ((struct _cl_mem_buffer*)buffer)->sub_next; for( ; it != (struct _cl_mem_buffer*)buffer; it = it->sub_next) { cl_buffer_unreference(it->base.bo); it->base.bo = new_bo; cl_buffer_reference(new_bo); } } void* cl_mem_svm_allocate(cl_context ctx, cl_svm_mem_flags flags, size_t size, unsigned int alignment) { cl_int err = CL_SUCCESS; size_t max_mem_size; if(UNLIKELY(alignment & (alignment - 1))) return NULL; if ((err = cl_get_device_info(ctx->devices[0], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_size), &max_mem_size, NULL)) != CL_SUCCESS) { return NULL; } if(UNLIKELY(size == 0 || size > max_mem_size)) { return NULL; } if (flags & (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS)) { return NULL; } if (flags && ((flags & (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_FINE_GRAIN_BUFFER)) || ((flags & CL_MEM_WRITE_ONLY) && (flags & CL_MEM_READ_ONLY)) || ((flags & CL_MEM_WRITE_ONLY) 
&& (flags & CL_MEM_READ_WRITE)) || ((flags & CL_MEM_READ_ONLY) && (flags & CL_MEM_READ_WRITE)))) { return NULL; } void * ptr = NULL; #ifdef HAS_BO_SET_SOFTPIN cl_buffer_mgr bufmgr = NULL; cl_mem mem; _cl_mem_svm* svm; if(UNLIKELY((svm = CALLOC(_cl_mem_svm)) == NULL)) return NULL; mem = &svm->base; mem->type = CL_MEM_SVM_TYPE; CL_OBJECT_INIT_BASE(mem, CL_OBJECT_MEM_MAGIC); list_init(&mem->dstr_cb_head); mem->flags = flags | CL_MEM_USE_HOST_PTR; mem->is_userptr = 0; mem->is_svm = 0; mem->offset = 0; bufmgr = cl_context_get_bufmgr(ctx); assert(bufmgr); int page_size = getpagesize(); const size_t alignedSZ = ALIGN(size, page_size); if(alignment == 0) alignment = page_size; else alignment = ALIGN(alignment, page_size); ptr = cl_aligned_malloc(alignedSZ, alignment); if(ptr == NULL) return NULL; mem->host_ptr = ptr; mem->is_svm = 1; mem->is_userptr = 1; mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL SVM memory object", ptr, alignedSZ, 0); mem->size = size; cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr); cl_buffer_set_bo_use_full_range(mem->bo, 1); /* Append the svm in the context buffer list */ cl_context_add_mem(ctx, mem); #endif return ptr; } void cl_mem_copy_image_region(const size_t *origin, const size_t *region, void *dst, size_t dst_row_pitch, size_t dst_slice_pitch, const void *src, size_t src_row_pitch, size_t src_slice_pitch, const struct _cl_mem_image *image, cl_bool offset_dst, cl_bool offset_src) { if(offset_dst) { size_t dst_offset = image->bpp * origin[0] + dst_row_pitch * origin[1] + dst_slice_pitch * origin[2]; dst = (char*)dst + dst_offset; } if(offset_src) { size_t src_offset = image->bpp * origin[0] + src_row_pitch * origin[1] + src_slice_pitch * origin[2]; src = (char*)src + src_offset; } if (!origin[0] && region[0] == image->w && dst_row_pitch == src_row_pitch && (region[2] == 1 || (!origin[1] && region[1] == image->h && dst_slice_pitch == src_slice_pitch))) { memcpy(dst, src, region[2] == 1 ? 
src_row_pitch*region[1] : src_slice_pitch*region[2]); } else { cl_uint y, z; for (z = 0; z < region[2]; z++) { const char* src_ptr = src; char* dst_ptr = dst; for (y = 0; y < region[1]; y++) { memcpy(dst_ptr, src_ptr, image->bpp*region[0]); src_ptr += src_row_pitch; dst_ptr += dst_row_pitch; } src = (char*)src + src_slice_pitch; dst = (char*)dst + dst_slice_pitch; } } } void cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region, const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image) { char* dst= cl_mem_map_auto((cl_mem)dst_image, 1); char* src= cl_mem_map_auto((cl_mem)src_image, 0); size_t dst_offset = dst_image->bpp * dst_origin[0] + dst_image->row_pitch * dst_origin[1] + dst_image->slice_pitch * dst_origin[2]; size_t src_offset = src_image->bpp * src_origin[0] + src_image->row_pitch * src_origin[1] + src_image->slice_pitch * src_origin[2]; dst= (char*)dst+ dst_offset; src= (char*)src+ src_offset; cl_uint y, z; for (z = 0; z < region[2]; z++) { const char* src_ptr = src; char* dst_ptr = dst; for (y = 0; y < region[1]; y++) { memcpy(dst_ptr, src_ptr, src_image->bpp*region[0]); src_ptr += src_image->row_pitch; dst_ptr += dst_image->row_pitch; } src = (char*)src + src_image->slice_pitch; dst = (char*)dst + dst_image->slice_pitch; } cl_mem_unmap_auto((cl_mem)src_image); cl_mem_unmap_auto((cl_mem)dst_image); } static void cl_mem_copy_image(struct _cl_mem_image *image, size_t row_pitch, size_t slice_pitch, void* host_ptr) { char* dst_ptr = cl_mem_map_auto((cl_mem)image, 1); size_t origin[3] = {0, 0, 0}; size_t region[3] = {image->w, image->h, image->depth}; cl_mem_copy_image_region(origin, region, dst_ptr, image->row_pitch, image->slice_pitch, host_ptr, row_pitch, slice_pitch, image, CL_FALSE, CL_FALSE); //offset is 0 cl_mem_unmap_auto((cl_mem)image); } cl_image_tiling_t cl_get_default_tiling(cl_driver drv) { static int initialized = 0; static cl_image_tiling_t tiling = CL_TILE_X; if (!initialized) { // FIXME, need to find out the performance diff's root cause on BDW. 
// SKL's 3D Image can't use TILE_X, so use TILE_Y as default if(cl_driver_get_ver(drv) == 8 || cl_driver_get_ver(drv) == 9) tiling = CL_TILE_Y; char *tilingStr = getenv("OCL_TILING"); if (tilingStr != NULL) { switch (tilingStr[0]) { case '0': tiling = CL_NO_TILE; break; case '1': tiling = CL_TILE_X; break; case '2': tiling = CL_TILE_Y; break; default: break; } } initialized = 1; } return tiling; } static cl_mem _cl_new_image_copy_from_host_ptr(cl_context ctx, cl_mem_flags flags, const cl_image_format *fmt, const cl_mem_object_type image_type, size_t w, size_t h, size_t depth, size_t pitch, size_t slice_pitch, size_t sz, size_t aligned_pitch, uint32_t intel_fmt, uint32_t bpp, cl_image_tiling_t tiling, void *data, //pointer from application cl_int *errcode_ret) { cl_int err = CL_SUCCESS; cl_mem mem = NULL; size_t origin[3] = {0, 0, 0}; size_t region[3] = {w, h, depth}; size_t aligned_slice_pitch = 0; if (ctx->image_queue == NULL) { ctx->image_queue = clCreateCommandQueueWithProperties(ctx, ctx->devices[0], 0, &err); if (err != CL_SUCCESS || !ctx->image_queue) { *errcode_ret = err; ctx->image_queue = NULL; return NULL; } } // Map host ptr to OCL buffer cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, sz, data, &err); if (err != CL_SUCCESS) { *errcode_ret = err; return NULL; } mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err); if (mem == NULL || err != CL_SUCCESS) { clReleaseMemObject(buf); return NULL; } cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch); if (image_type == CL_MEM_OBJECT_IMAGE2D) aligned_slice_pitch = 0; else //SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need CL_NO_TILE's aligned_h to calc. aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2)); cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt, intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling, 0, 0, 0); err = clEnqueueCopyBufferToImage(ctx->image_queue, buf, mem, 0, origin, region, 0, NULL, NULL); if(err != CL_SUCCESS) { clReleaseMemObject(buf); clReleaseMemObject(mem); return NULL; } err = clFinish(ctx->image_queue); if(err != CL_SUCCESS) { clReleaseMemObject(buf); clReleaseMemObject(mem); return NULL; } clReleaseMemObject(buf); if (flags & CL_MEM_USE_HOST_PTR && data) { mem->host_ptr = data; cl_mem_image(mem)->host_row_pitch = pitch; cl_mem_image(mem)->host_slice_pitch = slice_pitch; } return mem; } static cl_mem _cl_mem_new_image(cl_context ctx, cl_mem_flags flags, const cl_image_format *fmt, const cl_mem_object_type orig_image_type, size_t w, size_t h, size_t depth, size_t pitch, size_t slice_pitch, void *data, //pointer from application cl_mem buffer, //for image2D from buffer cl_int *errcode_ret) { cl_int err = CL_SUCCESS; cl_bool is_ker_copy = 0; cl_mem mem = NULL; cl_mem_object_type image_type = orig_image_type; uint32_t bpp = 0, intel_fmt = INTEL_UNSUPPORTED_FORMAT; size_t sz = 0, aligned_pitch = 0, aligned_slice_pitch = 0, aligned_h = 0; size_t origin_width = w; // for image1d buffer work around. 
cl_image_tiling_t tiling = CL_NO_TILE; int enable_true_hostptr = 0; // can't use BVAR (backend/src/sys/cvar.hpp) here as it's C++ const char *env = getenv("OCL_IMAGE_HOSTPTR"); if (env != NULL) { sscanf(env, "%i", &enable_true_hostptr); } /* Check flags consistency */ if (UNLIKELY((flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR)) && data == NULL)) { err = CL_INVALID_HOST_PTR; goto error; } /* Get the size of each pixel */ if (UNLIKELY((err = cl_image_byte_per_pixel(fmt, &bpp)) != CL_SUCCESS)) goto error; /* Only a sub-set of the formats are supported */ intel_fmt = cl_image_get_intel_format(fmt); if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) { err = CL_IMAGE_FORMAT_NOT_SUPPORTED; goto error; } /* See if the user parameters match */ #define DO_IMAGE_ERROR \ do { \ err = CL_INVALID_IMAGE_SIZE; \ goto error; \ } while (0); if (UNLIKELY(w == 0)) DO_IMAGE_ERROR; if (UNLIKELY(h == 0 && (image_type != CL_MEM_OBJECT_IMAGE1D && image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY && image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER))) DO_IMAGE_ERROR; if (image_type == CL_MEM_OBJECT_IMAGE1D) { size_t min_pitch = bpp * w; if (data && pitch == 0) pitch = min_pitch; h = 1; depth = 1; if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR; if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR; if (UNLIKELY(data && (slice_pitch % pitch != 0))) DO_IMAGE_ERROR; if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR; if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR; tiling = CL_NO_TILE; } else if (image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { if (image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { if (UNLIKELY(w > ctx->devices[0]->image_mem_size)) DO_IMAGE_ERROR; /* This is an image1d buffer which exceeds normal image size restrication We have to use a 2D image to simulate this 1D image. */ h = (w + ctx->devices[0]->image2d_max_width - 1) / ctx->devices[0]->image2d_max_width; w = w > ctx->devices[0]->image2d_max_width ? 
ctx->devices[0]->image2d_max_width : w; tiling = CL_NO_TILE; } else if(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL && !IS_IMAGE(buffer)) { tiling = CL_NO_TILE; } else if (cl_driver_get_ver(ctx->drv) != 6) { /* Pick up tiling mode (we do only linear on SNB) */ tiling = cl_get_default_tiling(ctx->drv); } size_t min_pitch = bpp * w; if (data && pitch == 0) pitch = min_pitch; if (UNLIKELY(w > ctx->devices[0]->image2d_max_width)) DO_IMAGE_ERROR; if (UNLIKELY(h > ctx->devices[0]->image2d_max_height)) DO_IMAGE_ERROR; if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR; if (UNLIKELY(!data && pitch != 0 && buffer == NULL)) DO_IMAGE_ERROR; depth = 1; } else if (image_type == CL_MEM_OBJECT_IMAGE3D || image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY || image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { if (image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { h = 1; tiling = CL_NO_TILE; } else if (cl_driver_get_ver(ctx->drv) != 6) tiling = cl_get_default_tiling(ctx->drv); size_t min_pitch = bpp * w; if (data && pitch == 0) pitch = min_pitch; size_t min_slice_pitch = pitch * h; if (data && slice_pitch == 0) slice_pitch = min_slice_pitch; if (UNLIKELY(w > ctx->devices[0]->image3d_max_width)) DO_IMAGE_ERROR; if (UNLIKELY(h > ctx->devices[0]->image3d_max_height)) DO_IMAGE_ERROR; if (image_type == CL_MEM_OBJECT_IMAGE3D && (UNLIKELY(depth > ctx->devices[0]->image3d_max_depth))) DO_IMAGE_ERROR else if (UNLIKELY(depth > ctx->devices[0]->image_max_array_size)) DO_IMAGE_ERROR; if (UNLIKELY(data && min_pitch > pitch)) DO_IMAGE_ERROR; if (UNLIKELY(data && min_slice_pitch > slice_pitch)) DO_IMAGE_ERROR; if (UNLIKELY(!data && pitch != 0)) DO_IMAGE_ERROR; if (UNLIKELY(!data && slice_pitch != 0)) DO_IMAGE_ERROR; } else assert(0); #undef DO_IMAGE_ERROR if (fmt->image_channel_order == CL_NV12_INTEL) { h += h/2; } uint8_t enableUserptr = 0; if (enable_true_hostptr && ctx->devices[0]->host_unified_memory && data != NULL && (flags & CL_MEM_USE_HOST_PTR)) { int cacheline_size = 0; cl_get_device_info(ctx->devices[0], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheline_size), &cacheline_size, NULL); if (ALIGN((unsigned long)data, cacheline_size) == (unsigned long)data && ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)) == h && ALIGN(h * pitch * depth, cacheline_size) == h * pitch * depth && //h and pitch should same as aligned_h and aligned_pitch if enable userptr ((image_type != CL_MEM_OBJECT_IMAGE3D && image_type != CL_MEM_OBJECT_IMAGE1D_ARRAY && image_type != CL_MEM_OBJECT_IMAGE2D_ARRAY) || pitch * h == slice_pitch)) { tiling = CL_NO_TILE; enableUserptr = 1; } } /* Tiling requires to align both pitch and height */ if (tiling == CL_NO_TILE) { aligned_pitch = w * bpp; if (aligned_pitch < pitch && enableUserptr) aligned_pitch = pitch; //no need align the height if 2d image from buffer. //the pitch should be same with buffer's pitch as they share same bo. 
if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL && !IS_IMAGE(buffer)) { if(aligned_pitch < pitch) { aligned_pitch = pitch; } aligned_h = h; } else aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)); } else if (tiling == CL_TILE_X) { aligned_pitch = ALIGN(w * bpp, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 0)); aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 1)); } else if (tiling == CL_TILE_Y) { aligned_pitch = ALIGN(w * bpp, cl_buffer_get_tiling_align(ctx, CL_TILE_Y, 0)); aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_TILE_Y, 1)); } sz = aligned_pitch * aligned_h * depth; if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL && !IS_IMAGE(buffer)) { //image 2d created from buffer: per spec, the buffer sz maybe larger than the image 2d. if (buffer->size >= sz) sz = buffer->size; else { err = CL_INVALID_IMAGE_SIZE; goto error; } } /* If sz is large than 128MB, map gtt may fail in some system. Because there is no obviours performance drop, disable tiling. */ if (tiling != CL_NO_TILE && sz > MAX_TILING_SIZE) { if ((image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE3D) && buffer == NULL) { if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) { mem = _cl_new_image_copy_from_host_ptr(ctx, flags, fmt, image_type, w, h, depth, pitch, slice_pitch, sz, aligned_pitch, intel_fmt, bpp, tiling, data, &err); if (mem != NULL) { cl_mem_image(mem)->is_ker_copy = 1; goto exit; } else goto error; } else is_ker_copy = 1; } else { tiling = CL_NO_TILE; aligned_pitch = w * bpp; aligned_h = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1)); sz = aligned_pitch * aligned_h * depth; } } if (image_type != CL_MEM_OBJECT_IMAGE1D_BUFFER) { if (image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL) mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, buffer, &err); else { if (enableUserptr) mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, data, NULL, &err); else mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err); } } else { mem = cl_mem_allocate(CL_MEM_BUFFER1D_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, NULL, &err); if (mem != NULL && err == CL_SUCCESS) { struct _cl_mem_buffer1d_image *buffer1d_image = (struct _cl_mem_buffer1d_image *)mem; buffer1d_image->size = origin_width;; } } if (mem == NULL || err != CL_SUCCESS) goto error; if(!(image_type == CL_MEM_OBJECT_IMAGE2D && buffer != NULL)) { //no need set tiling if image 2d created from buffer since share same bo. cl_buffer_set_tiling(mem->bo, tiling, aligned_pitch); } if (image_type == CL_MEM_OBJECT_IMAGE1D || image_type == CL_MEM_OBJECT_IMAGE2D || image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) aligned_slice_pitch = 0; else //SKL need use tiling's aligned_h to calc slice_pitch and IVB to BDW need CL_NO_TILE's aligned_h to calc. 
aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, tiling, 2)); cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt, intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling, 0, 0, 0); /* Copy the data if required */ if (flags & CL_MEM_COPY_HOST_PTR && data) cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data); if (flags & CL_MEM_USE_HOST_PTR && data) { mem->host_ptr = data; cl_mem_image(mem)->host_row_pitch = pitch; cl_mem_image(mem)->host_slice_pitch = slice_pitch; if (!enableUserptr) cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data); } /* copy yuv data if required */ if(fmt->image_channel_order == CL_NV12_INTEL && data) { cl_mem_copy_image(cl_mem_image(mem), pitch, slice_pitch, data); } cl_mem_image(mem)->is_ker_copy = is_ker_copy; exit: if (errcode_ret) *errcode_ret = err; return mem; error: cl_mem_delete(mem); mem = NULL; goto exit; } static cl_mem _cl_mem_new_image_from_nv12_image(cl_context ctx, cl_mem_flags flags, const cl_image_format* image_format, const cl_image_desc *image_desc, cl_int *errcode_ret) { cl_mem image = NULL; cl_mem imageIn = image_desc->mem_object; cl_int err = CL_SUCCESS; *errcode_ret = err; uint32_t bpp; uint32_t intel_fmt = INTEL_UNSUPPORTED_FORMAT; size_t width = 0; size_t height = 0; size_t depth = 0; /* Get the size of each pixel */ if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS)) goto error; /* Only a sub-set of the formats are supported */ intel_fmt = cl_image_get_intel_format(image_format); if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) { err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; goto error; } if(imageIn == NULL) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } if (cl_mem_image(imageIn)->fmt.image_channel_order != CL_NV12_INTEL || (image_format->image_channel_order != CL_R && image_format->image_channel_order != CL_RG) || image_format->image_channel_data_type != CL_UNORM_INT8) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } width = cl_mem_image(imageIn)->w; if (image_desc->image_depth == 0) { height = cl_mem_image(imageIn)->h * 2 / 3; } else if (image_desc->image_depth == 1) { width = cl_mem_image(imageIn)->w / 2; height = cl_mem_image(imageIn)->h / 3; } else { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } //flags check here. 
if ((flags & CL_MEM_USE_HOST_PTR) || (flags & CL_MEM_ALLOC_HOST_PTR) || (flags & CL_MEM_COPY_HOST_PTR)) { err = CL_INVALID_VALUE; goto error; } if (!(imageIn->flags & CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL)) { if (((flags & CL_MEM_READ_WRITE) || (flags & CL_MEM_READ_ONLY)) && (imageIn->flags & CL_MEM_WRITE_ONLY)) { err = CL_INVALID_VALUE; goto error; } if (((flags & CL_MEM_READ_WRITE) || (flags & CL_MEM_WRITE_ONLY)) && (imageIn->flags | CL_MEM_READ_ONLY)) { err = CL_INVALID_VALUE; goto error; } if (((flags & CL_MEM_READ_WRITE) || (flags & CL_MEM_WRITE_ONLY) ||(flags & CL_MEM_READ_ONLY)) && (imageIn->flags & CL_MEM_NO_ACCESS_INTEL)) { err = CL_INVALID_VALUE; goto error; } if ((flags & CL_MEM_HOST_READ_ONLY) && (imageIn->flags & CL_MEM_HOST_WRITE_ONLY)) { err = CL_INVALID_VALUE; goto error; } if ((flags & CL_MEM_HOST_WRITE_ONLY) && (imageIn->flags & CL_MEM_HOST_READ_ONLY)) { err = CL_INVALID_VALUE; goto error; } if (((flags & CL_MEM_HOST_READ_ONLY) || (flags & CL_MEM_HOST_WRITE_ONLY)) && (imageIn->flags & CL_MEM_HOST_NO_ACCESS)) { err = CL_INVALID_VALUE; goto error; } } image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type, width, height, depth, cl_mem_image(imageIn)->row_pitch, 0, NULL, imageIn, errcode_ret); if (image == NULL) return NULL; if (image_desc->image_depth == 1) { cl_mem_image(image)->offset = cl_mem_image(imageIn)->row_pitch * height * 2; } cl_mem_image(image)->nv12_image = imageIn; cl_mem_add_ref(imageIn); return image; error: if (image) cl_mem_delete(image); image = NULL; *errcode_ret = err; return image; } static cl_mem _cl_mem_new_image_from_buffer(cl_context ctx, cl_mem_flags flags, const cl_image_format* image_format, const cl_image_desc *image_desc, cl_int *errcode_ret) { cl_mem image = NULL; cl_mem buffer = image_desc->buffer; cl_int err = CL_SUCCESS; *errcode_ret = err; cl_ulong max_size; cl_mem_flags merged_flags; uint32_t bpp; uint32_t intel_fmt = INTEL_UNSUPPORTED_FORMAT; size_t offset = 0; /* Get the size of each pixel */ if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS)) goto error; /* Only a sub-set of the formats are supported */ intel_fmt = cl_image_get_intel_format(image_format); if (UNLIKELY(intel_fmt == INTEL_UNSUPPORTED_FORMAT)) { err = CL_INVALID_IMAGE_FORMAT_DESCRIPTOR; goto error; } if (!buffer) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } if (flags & (CL_MEM_USE_HOST_PTR|CL_MEM_ALLOC_HOST_PTR|CL_MEM_COPY_HOST_PTR)) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } /* access check. 
*/ if ((buffer->flags & CL_MEM_WRITE_ONLY) && (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_ONLY))) { err = CL_INVALID_VALUE; goto error; } if ((buffer->flags & CL_MEM_READ_ONLY) && (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { err = CL_INVALID_VALUE; goto error; } if ((buffer->flags & CL_MEM_HOST_WRITE_ONLY) && (flags & CL_MEM_HOST_READ_ONLY)) { err = CL_INVALID_VALUE; goto error; } if ((buffer->flags & CL_MEM_HOST_READ_ONLY) && (flags & CL_MEM_HOST_WRITE_ONLY)) { err = CL_INVALID_VALUE; goto error; } if ((buffer->flags & CL_MEM_HOST_NO_ACCESS) && (flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY))) { err = CL_INVALID_VALUE; goto error; } if ((err = cl_get_device_info(ctx->devices[0], CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, sizeof(max_size), &max_size, NULL)) != CL_SUCCESS) { goto error; } if (image_desc->image_width > max_size) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } if (image_desc->image_width*bpp > buffer->size) { err = CL_INVALID_IMAGE_DESCRIPTOR; goto error; } merged_flags = buffer->flags; if (flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY)) { merged_flags &= ~(CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY); merged_flags |= flags & (CL_MEM_READ_WRITE|CL_MEM_READ_WRITE|CL_MEM_WRITE_ONLY); } if (flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS)) { merged_flags &= ~(CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS); merged_flags |= flags & (CL_MEM_HOST_WRITE_ONLY|CL_MEM_HOST_READ_ONLY|CL_MEM_HOST_NO_ACCESS); } struct _cl_mem_buffer *mem_buffer = (struct _cl_mem_buffer*)buffer; if (buffer->type == CL_MEM_SUBBUFFER_TYPE) { offset = ((struct _cl_mem_buffer *)buffer)->sub_offset; mem_buffer = mem_buffer->parent; } /* Get the size of each pixel */ if (UNLIKELY((err = cl_image_byte_per_pixel(image_format, &bpp)) != CL_SUCCESS)) goto error; if(image_desc->image_type == CL_MEM_OBJECT_IMAGE2D) { image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type, image_desc->image_width, image_desc->image_height, image_desc->image_depth, image_desc->image_row_pitch, image_desc->image_slice_pitch, NULL, image_desc->buffer, errcode_ret); } else if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { // Per bspec, a image should has a at least 2 line vertical alignment, // thus we can't simply attach a buffer to a 1d image surface which has the same size. // We have to create a new image, and copy the buffer data to this new image. // And replace all the buffer object's reference to this image. image = _cl_mem_new_image(ctx, flags, image_format, image_desc->image_type, mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, NULL, errcode_ret); } else assert(0); if (image == NULL) return NULL; if(image_desc->image_type == CL_MEM_OBJECT_IMAGE2D) { //no need copy since the image 2d and buffer share same bo. } else if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) { // FIXME, we could use copy buffer to image to do this on GPU latter. // currently the copy buffer to image function doesn't support 1D image. // // There is a potential risk that this buffer was mapped and the caller // still hold the pointer and want to access it again. This scenario is // not explicitly forbidden in the spec, although it should not be permitted. 
void *src = cl_mem_map(buffer, 0); void *dst = cl_mem_map(image, 1); memcpy(dst, src, mem_buffer->base.size); cl_mem_unmap(image); cl_mem_unmap(buffer); struct _cl_mem_buffer1d_image* image_buffer = (struct _cl_mem_buffer1d_image*)image; image_buffer->descbuffer = buffer; } else assert(0); if (err != 0) goto error; // Now replace buffer's bo to this new bo, need to take care of sub buffer // case. if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER) cl_mem_replace_buffer(buffer, image->bo); /* Now point to the right offset if buffer is a SUB_BUFFER. */ if (buffer->flags & CL_MEM_USE_HOST_PTR) image->host_ptr = buffer->host_ptr + offset; cl_mem_image(image)->offset = offset; cl_mem_add_ref(buffer); cl_mem_image(image)->buffer_1d = buffer; return image; error: if (image) cl_mem_delete(image); image = NULL; *errcode_ret = err; return image; } LOCAL cl_mem cl_mem_new_image(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) { switch (image_desc->image_type) { case CL_MEM_OBJECT_IMAGE1D: case CL_MEM_OBJECT_IMAGE3D: return _cl_mem_new_image(context, flags, image_format, image_desc->image_type, image_desc->image_width, image_desc->image_height, image_desc->image_depth, image_desc->image_row_pitch, image_desc->image_slice_pitch, host_ptr, NULL, errcode_ret); case CL_MEM_OBJECT_IMAGE2D: if (image_desc->buffer) { if (IS_IMAGE(image_desc->buffer)) { return _cl_mem_new_image_from_nv12_image(context, flags, image_format, image_desc, errcode_ret); } else return _cl_mem_new_image_from_buffer(context, flags, image_format, image_desc, errcode_ret); } else return _cl_mem_new_image(context, flags, image_format, image_desc->image_type, image_desc->image_width, image_desc->image_height, image_desc->image_depth, image_desc->image_row_pitch, image_desc->image_slice_pitch, host_ptr, NULL, errcode_ret); case CL_MEM_OBJECT_IMAGE1D_ARRAY: case CL_MEM_OBJECT_IMAGE2D_ARRAY: return _cl_mem_new_image(context, flags, image_format, image_desc->image_type, image_desc->image_width, image_desc->image_height, image_desc->image_array_size, image_desc->image_row_pitch, image_desc->image_slice_pitch, host_ptr, NULL, errcode_ret); case CL_MEM_OBJECT_IMAGE1D_BUFFER: return _cl_mem_new_image_from_buffer(context, flags, image_format, image_desc, errcode_ret); break; case CL_MEM_OBJECT_BUFFER: default: assert(0); } return NULL; } LOCAL void cl_mem_svm_delete(cl_context ctx, void *svm_pointer) { cl_mem mem; if(UNLIKELY(svm_pointer == NULL)) return; mem = cl_context_get_svm_from_ptr(ctx, svm_pointer); if(mem == NULL) return; cl_mem_delete(mem); } LOCAL void cl_mem_delete(cl_mem mem) { cl_int i; cl_mem_dstr_cb cb = NULL; if (UNLIKELY(mem == NULL)) return; if (CL_OBJECT_DEC_REF(mem) > 1) return; #ifdef HAS_GL_EGL if (UNLIKELY(IS_GL_IMAGE(mem))) { cl_mem_gl_delete(cl_mem_gl_image(mem)); } #endif #ifdef HAS_CMRT if (mem->cmrt_mem != NULL) cmrt_destroy_memory(mem); #endif /* First, call all the callbacks registered by user. */ while (!list_empty(&mem->dstr_cb_head)) { cb = list_entry(mem->dstr_cb_head.head_node.n, _cl_mem_dstr_cb, node); list_node_del(&cb->node); cb->pfn_notify(mem, cb->user_data); cl_free(cb); } /* iff we are a image, delete the 1d buffer if has. 
*/ if (IS_IMAGE(mem)) { if (cl_mem_image(mem)->buffer_1d) { assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER || cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D); cl_mem_delete(cl_mem_image(mem)->buffer_1d); if(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D && cl_mem_image(mem)->is_image_from_buffer == 1) { cl_mem_image(mem)->buffer_1d = NULL; mem->bo = NULL; } } if (cl_mem_image(mem)->nv12_image) { assert(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D); cl_mem_delete(cl_mem_image(mem)->nv12_image); if(cl_mem_image(mem)->image_type == CL_MEM_OBJECT_IMAGE2D && cl_mem_image(mem)->is_image_from_nv12_image == 1) { cl_mem_image(mem)->nv12_image = NULL; mem->bo = NULL; } } if (cl_mem_image(mem)->tmp_ker_buf) { cl_mem_delete(cl_mem_image(mem)->tmp_ker_buf); cl_mem_image(mem)->tmp_ker_buf = NULL; } } /* Someone still mapped, unmap */ if(mem->map_ref > 0) { assert(mem->mapped_ptr); for(i=0; imapped_ptr_sz; i++) { if(mem->mapped_ptr[i].ptr != NULL) { mem->map_ref--; cl_mem_unmap_auto(mem); } } assert(mem->map_ref == 0); } if (mem->mapped_ptr) free(mem->mapped_ptr); /* Iff we are sub, do nothing for bo release. */ if (mem->type == CL_MEM_SUBBUFFER_TYPE) { struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; /* Remove it from the parent's list */ assert(buffer->parent); pthread_mutex_lock(&buffer->parent->sub_lock); if (buffer->sub_prev) buffer->sub_prev->sub_next = buffer->sub_next; if (buffer->sub_next) buffer->sub_next->sub_prev = buffer->sub_prev; if (buffer->parent->subs == buffer) buffer->parent->subs = buffer->sub_next; pthread_mutex_unlock(&buffer->parent->sub_lock); cl_mem_delete((cl_mem )(buffer->parent)); } else if (mem->is_svm && mem->type != CL_MEM_SVM_TYPE) { cl_mem svm_mem = cl_context_get_svm_from_ptr(mem->ctx, mem->host_ptr); if (svm_mem != NULL) cl_mem_delete(svm_mem); } else if (LIKELY(mem->bo != NULL)) { cl_buffer_unreference(mem->bo); } /* Remove it from the list */ cl_context_remove_mem(mem->ctx, mem); if ((mem->is_userptr && (mem->flags & CL_MEM_ALLOC_HOST_PTR) && (mem->type != CL_MEM_SUBBUFFER_TYPE)) || (mem->is_svm && mem->type == CL_MEM_SVM_TYPE)) cl_free(mem->host_ptr); CL_OBJECT_DESTROY_BASE(mem); cl_free(mem); } LOCAL void cl_mem_add_ref(cl_mem mem) { assert(mem); CL_OBJECT_INC_REF(mem); } #define LOCAL_SZ_0 16 #define LOCAL_SZ_1 4 #define LOCAL_SZ_2 4 LOCAL cl_int cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, size_t src_offset, size_t dst_offset, size_t cb) { cl_int ret = CL_SUCCESS; cl_kernel ker = NULL; size_t global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {1,1,1}; const unsigned int masks[4] = {0xffffffff, 0x0ff, 0x0ffff, 0x0ffffff}; int aligned = 0; int dw_src_offset = src_offset/4; int dw_dst_offset = dst_offset/4; if (!cb) return ret; /* We use one kernel to copy the data. The kernel is lazily created. */ assert(src_buf->ctx == dst_buf->ctx); /* All 16 bytes aligned, fast and easy one. 
*/ if((cb % 16 == 0) && (src_offset % 16 == 0) && (dst_offset % 16 == 0)) { extern char cl_internal_copy_buf_align16_str[]; extern size_t cl_internal_copy_buf_align16_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN16, cl_internal_copy_buf_align16_str, (size_t)cl_internal_copy_buf_align16_str_size, NULL); cb = cb/16; aligned = 1; } else if ((cb % 4 == 0) && (src_offset % 4 == 0) && (dst_offset % 4 == 0)) { /* all Dword aligned.*/ extern char cl_internal_copy_buf_align4_str[]; extern size_t cl_internal_copy_buf_align4_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_ALIGN4, cl_internal_copy_buf_align4_str, (size_t)cl_internal_copy_buf_align4_str_size, NULL); cb = cb/4; aligned = 1; } if (aligned) { if (!ker) return CL_OUT_OF_RESOURCES; if (cb < LOCAL_SZ_0) { local_sz[0] = 1; } else { local_sz[0] = LOCAL_SZ_0; } global_sz[0] = ((cb + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); cl_kernel_set_arg(ker, 4, sizeof(int), &cb); ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); return ret; } /* Now handle the unaligned cases. */ int dw_num = ((dst_offset % 4 + cb) + 3) / 4; unsigned int first_mask = dst_offset % 4 == 0 ? 0x0 : masks[dst_offset % 4]; unsigned int last_mask = masks[(dst_offset + cb) % 4]; /* handle the very small range copy. */ if (cb < 4 && dw_num == 1) { first_mask = first_mask | ~last_mask; } if (cb < LOCAL_SZ_0) { local_sz[0] = 1; } else { local_sz[0] = LOCAL_SZ_0; } global_sz[0] = ((dw_num + LOCAL_SZ_0 - 1)/LOCAL_SZ_0)*LOCAL_SZ_0; if (src_offset % 4 == dst_offset % 4) { /* Src and dst has the same unaligned offset, just handle the header and tail. */ extern char cl_internal_copy_buf_unalign_same_offset_str[]; extern size_t cl_internal_copy_buf_unalign_same_offset_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SAME_OFFSET, cl_internal_copy_buf_unalign_same_offset_str, (size_t)cl_internal_copy_buf_unalign_same_offset_str_size, NULL); if (!ker) return CL_OUT_OF_RESOURCES; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); return ret; } /* Dst's offset < Src's offset, so one dst dword need two sequential src dwords to fill it. 
*/ if (dst_offset % 4 < src_offset % 4) { extern char cl_internal_copy_buf_unalign_dst_offset_str[]; extern size_t cl_internal_copy_buf_unalign_dst_offset_str_size; int align_diff = src_offset % 4 - dst_offset % 4; unsigned int dw_mask = masks[align_diff]; int shift = align_diff * 8; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET, cl_internal_copy_buf_unalign_dst_offset_str, (size_t)cl_internal_copy_buf_unalign_dst_offset_str_size, NULL); if (!ker) return CL_OUT_OF_RESOURCES; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); cl_kernel_set_arg(ker, 7, sizeof(int), &shift); cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); return ret; } /* Dst's offset > Src's offset, so one dst dword need two sequential src - and src to fill it. */ if (dst_offset % 4 > src_offset % 4) { extern char cl_internal_copy_buf_unalign_src_offset_str[]; extern size_t cl_internal_copy_buf_unalign_src_offset_str_size; int align_diff = dst_offset % 4 - src_offset % 4; unsigned int dw_mask = masks[4 - align_diff]; int shift = align_diff * 8; int src_less = !(src_offset % 4) && !((src_offset + cb) % 4); ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET, cl_internal_copy_buf_unalign_src_offset_str, (size_t)cl_internal_copy_buf_unalign_src_offset_str_size, NULL); if (!ker) return CL_OUT_OF_RESOURCES; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf); cl_kernel_set_arg(ker, 1, sizeof(int), &dw_src_offset); cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); cl_kernel_set_arg(ker, 7, sizeof(int), &shift); cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); cl_kernel_set_arg(ker, 9, sizeof(int), &src_less); ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); return ret; } /* no case can hanldle? 
*/ assert(0); return ret; } LOCAL cl_int cl_image_fill(cl_command_queue queue, cl_event e, const void * pattern, struct _cl_mem_image* src_image, const size_t * origin, const size_t * region) { cl_int ret = CL_SUCCESS; cl_kernel ker = NULL; size_t global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2}; uint32_t savedIntelFmt = src_image->intel_fmt; if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D) { extern char cl_internal_fill_image_1d_str[]; extern size_t cl_internal_fill_image_1d_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D, cl_internal_fill_image_1d_str, (size_t)cl_internal_fill_image_1d_str_size, NULL); }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) { extern char cl_internal_fill_image_1d_array_str[]; extern size_t cl_internal_fill_image_1d_array_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_1D_ARRAY, cl_internal_fill_image_1d_array_str, (size_t)cl_internal_fill_image_1d_array_str_size, NULL); }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D) { extern char cl_internal_fill_image_2d_str[]; extern size_t cl_internal_fill_image_2d_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D, cl_internal_fill_image_2d_str, (size_t)cl_internal_fill_image_2d_str_size, NULL); }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) { extern char cl_internal_fill_image_2d_array_str[]; extern size_t cl_internal_fill_image_2d_array_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_2D_ARRAY, cl_internal_fill_image_2d_array_str, (size_t)cl_internal_fill_image_2d_array_str_size, NULL); }else if(src_image->image_type == CL_MEM_OBJECT_IMAGE3D) { extern char cl_internal_fill_image_3d_str[]; extern size_t cl_internal_fill_image_3d_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_IMAGE_3D, cl_internal_fill_image_3d_str, (size_t)cl_internal_fill_image_3d_str_size, NULL); }else{ return CL_IMAGE_FORMAT_NOT_SUPPORTED; } if (!ker) return CL_OUT_OF_RESOURCES; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image); if(src_image->fmt.image_channel_order >= CL_sRGBA) { #define RGB2sRGB(linear) ( linear <= 0.0031308f )? 
( 12.92f * linear ):( 1.055f * powf( linear, 1.0f/2.4f ) - 0.055f); cl_image_format fmt; float newpattern[4] = {0.0,0.0,0.0,((float*)pattern)[3]}; int i; for(i = 0;i < 3; i++){ if(src_image->fmt.image_channel_order == CL_sRGBA) { newpattern[i] = RGB2sRGB(((float*)pattern)[i]); } else newpattern[2-i] = RGB2sRGB(((float*)pattern)[i]); } cl_kernel_set_arg(ker, 1, sizeof(float)*4, newpattern); fmt.image_channel_order = CL_RGBA; fmt.image_channel_data_type = CL_UNORM_INT8; src_image->intel_fmt = cl_image_get_intel_format(&fmt); #undef RGB2sRGB } else cl_kernel_set_arg(ker, 1, sizeof(float)*4, pattern); cl_kernel_set_arg(ker, 2, sizeof(cl_int), ®ion[0]); cl_kernel_set_arg(ker, 3, sizeof(cl_int), ®ion[1]); cl_kernel_set_arg(ker, 4, sizeof(cl_int), ®ion[2]); cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin[0]); cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]); cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]); ret = cl_command_queue_ND_range(queue, ker, e, 3, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); src_image->intel_fmt = savedIntelFmt; return ret; } LOCAL cl_int cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size, cl_mem buffer, size_t offset, size_t size) { cl_int ret = CL_SUCCESS; cl_kernel ker = NULL; size_t global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {1,1,1}; char pattern_comb[4]; int is_128 = 0; const void * pattern1 = NULL; assert(offset % pattern_size == 0); assert(size % pattern_size == 0); if (!size) return ret; if (pattern_size == 128) { /* 128 is according to pattern of double16, but double works not very well on some platform. We use two float16 to handle this. */ extern char cl_internal_fill_buf_align128_str[]; extern size_t cl_internal_fill_buf_align128_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN128, cl_internal_fill_buf_align128_str, (size_t)cl_internal_fill_buf_align128_str_size, NULL); is_128 = 1; pattern_size = pattern_size / 2; pattern1 = pattern + pattern_size; size = size / 2; } else if (pattern_size % 8 == 0) { /* Handle the 8 16 32 64 cases here. */ extern char cl_internal_fill_buf_align8_str[]; extern size_t cl_internal_fill_buf_align8_str_size; int order = ffs(pattern_size / 8) - 1; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN8_8 + order, cl_internal_fill_buf_align8_str, (size_t)cl_internal_fill_buf_align8_str_size, NULL); } else if (pattern_size == 4) { extern char cl_internal_fill_buf_align4_str[]; extern size_t cl_internal_fill_buf_align4_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4, cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL); } else if (size >= 4 && size % 4 == 0 && offset % 4 == 0) { /* The unaligned case. But if copy size and offset are aligned to 4, we can fake the pattern with the pattern duplication fill in. 
*/ assert(pattern_size == 1 || pattern_size == 2); extern char cl_internal_fill_buf_align4_str[]; extern size_t cl_internal_fill_buf_align4_str_size; if (pattern_size == 2) { memcpy(pattern_comb, pattern, sizeof(char)*2); memcpy(pattern_comb + 2, pattern, sizeof(char)*2); } else { pattern_comb[0] = pattern_comb[1] = pattern_comb[2] = pattern_comb[3] = *(char *)pattern; } ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN4, cl_internal_fill_buf_align4_str, (size_t)cl_internal_fill_buf_align4_str_size, NULL); pattern_size = 4; pattern = pattern_comb; } //TODO: Unaligned cases, we may need to optimize it as cl_mem_copy, using mask in kernel //functions. This depend on the usage but now we just use aligned 1 and 2. else if (pattern_size == 2) { extern char cl_internal_fill_buf_align2_str[]; extern size_t cl_internal_fill_buf_align2_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_ALIGN2, cl_internal_fill_buf_align2_str, (size_t)cl_internal_fill_buf_align2_str_size, NULL); } else if (pattern_size == 1) { extern char cl_internal_fill_buf_unalign_str[]; extern size_t cl_internal_fill_buf_unalign_str_size; ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_FILL_BUFFER_UNALIGN, cl_internal_fill_buf_unalign_str, (size_t)cl_internal_fill_buf_unalign_str_size, NULL); } else assert(0); if (!ker) return CL_OUT_OF_RESOURCES; size = size / pattern_size; offset = offset / pattern_size; if (size < LOCAL_SZ_0) { local_sz[0] = 1; } else { local_sz[0] = LOCAL_SZ_0; } global_sz[0] = ((size + LOCAL_SZ_0 - 1) / LOCAL_SZ_0) * LOCAL_SZ_0; cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &buffer); cl_kernel_set_arg(ker, 1, pattern_size, pattern); cl_kernel_set_arg(ker, 2, sizeof(cl_uint), &offset); cl_kernel_set_arg(ker, 3, sizeof(cl_uint), &size); if (is_128) cl_kernel_set_arg(ker, 4, pattern_size, pattern1); ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off, global_off, global_sz, global_sz, local_sz, local_sz); cl_kernel_delete(ker); return ret; } LOCAL cl_int cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, const size_t *src_origin, const size_t *dst_origin, const size_t *region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch) { cl_int ret; cl_kernel ker; size_t global_off[] = {0,0,0}; size_t global_sz[] = {1,1,1}; size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1}; // the src and dst mem rect is continuous, the copy is degraded to buf copy if((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) && (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){ cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0]; cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0]; cl_int size = region[0]*region[1]*region[2]; ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size); return ret; } if(region[1] == 1) local_sz[1] = 1; if(region[2] == 1) local_sz[2] = 1; global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0]; global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1]; global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2]; cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0]; cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0]; /* We use one 
LOCAL cl_int
cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf,
                        const size_t *src_origin, const size_t *dst_origin, const size_t *region,
                        size_t src_row_pitch, size_t src_slice_pitch,
                        size_t dst_row_pitch, size_t dst_slice_pitch)
{
  cl_int ret;
  cl_kernel ker;
  size_t global_off[] = {0,0,0};
  size_t global_sz[] = {1,1,1};
  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};

  /* If the src and dst rects are both contiguous in memory, the rect copy
     degenerates into a plain buffer copy. */
  if ((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) &&
      (region[1] * src_row_pitch == src_slice_pitch) &&
      (region[1] * dst_row_pitch == dst_slice_pitch)) {
    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
    cl_int size = region[0]*region[1]*region[2];
    ret = cl_mem_copy(queue, NULL, src_buf, dst_buf, src_offset, dst_offset, size);
    return ret;
  }

  if (region[1] == 1) local_sz[1] = 1;
  if (region[2] == 1) local_sz[2] = 1;
  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];

  cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
  cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];

  /* We use one kernel to copy the data. The kernel is lazily created. */
  assert(src_buf->ctx == dst_buf->ctx);

  /* Setup the kernel and run. */
  size_t region0 = region[0];
  if ((src_offset % 4 == 0) && (dst_offset % 4 == 0) &&
      (src_row_pitch % 4 == 0) && (dst_row_pitch % 4 == 0) &&
      (src_slice_pitch % 4 == 0) && (dst_slice_pitch % 4 == 0) &&
      (region0 % 4 == 0)) {
    extern char cl_internal_copy_buf_rect_align4_str[];
    extern size_t cl_internal_copy_buf_rect_align4_str_size;
    /* Everything is dword aligned: rescale so each work item moves 4 bytes. */
    region0 /= 4;
    src_offset /= 4;
    dst_offset /= 4;
    src_row_pitch /= 4;
    dst_row_pitch /= 4;
    src_slice_pitch /= 4;
    dst_slice_pitch /= 4;
    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
              cl_internal_copy_buf_rect_align4_str,
              (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL);
  } else {
    extern char cl_internal_copy_buf_rect_str[];
    extern size_t cl_internal_copy_buf_rect_str_size;
    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
              cl_internal_copy_buf_rect_str,
              (size_t)cl_internal_copy_buf_rect_str_size, NULL);
  }

  if (!ker)
    return CL_OUT_OF_RESOURCES;

  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_offset);
  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_row_pitch);
  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &src_slice_pitch);
  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch);
  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch);

  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
                                  global_off, global_sz, global_sz,
                                  local_sz, local_sz);
  cl_kernel_delete(ker);
  return ret;
}
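/* Illustrative sketch (not part of the driver): when does the rect copy above
 * degenerate to a linear copy? Exactly when rows and slices are back to back
 * in both buffers. Numbers below are made up for the example.
 */
#if 0
  const size_t region[3] = {64, 4, 2};    /* 64-byte rows, 4 rows, 2 slices */
  size_t src_row_pitch = 64;              /* == region[0]            */
  size_t src_slice_pitch = 256;           /* == region[1] * 64       */
  size_t dst_row_pitch = 64, dst_slice_pitch = 256;
  /* region[0] == row pitch and region[1] * row pitch == slice pitch on both
     sides, so the 64*4*2 = 512 bytes form one contiguous range and a single
     cl_mem_copy of 512 bytes replaces the 3D rect kernel. */
#endif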
LOCAL cl_int
cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event,
                         struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image,
                         const size_t *src_origin, const size_t *dst_origin, const size_t *region)
{
  cl_int ret;
  cl_kernel ker = NULL;
  size_t global_off[] = {0,0,0};
  size_t global_sz[] = {1,1,1};
  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
  uint32_t fixupDataType;
  uint32_t savedIntelFmt;

  if (region[1] == 1) local_sz[1] = 1;
  if (region[2] == 1) local_sz[2] = 1;
  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];

  /* Copy normalized/float formats as raw integers of the same width, so the
     pixels do not go through a lossy convert/deconvert round trip. */
  switch (src_image->fmt.image_channel_data_type) {
    case CL_SNORM_INT8:
    case CL_UNORM_INT8:  fixupDataType = CL_UNSIGNED_INT8; break;
    case CL_HALF_FLOAT:
    case CL_SNORM_INT16:
    case CL_UNORM_INT16: fixupDataType = CL_UNSIGNED_INT16; break;
    case CL_FLOAT:       fixupDataType = CL_UNSIGNED_INT32; break;
    default:             fixupDataType = 0;
  }

  if (fixupDataType) {
    cl_image_format fmt;
    if (src_image->fmt.image_channel_order != CL_BGRA &&
        src_image->fmt.image_channel_order != CL_sBGRA &&
        src_image->fmt.image_channel_order != CL_sRGBA)
      fmt.image_channel_order = src_image->fmt.image_channel_order;
    else
      fmt.image_channel_order = CL_RGBA;
    fmt.image_channel_data_type = fixupDataType;
    savedIntelFmt = src_image->intel_fmt;
    src_image->intel_fmt = cl_image_get_intel_format(&fmt);
    dst_image->intel_fmt = src_image->intel_fmt;
  }

  /* We use one kernel to copy the data. The kernel is lazily created. */
  assert(src_image->base.ctx == dst_image->base.ctx);

  /* Setup the kernel and run. */
  if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D) {
      extern char cl_internal_copy_image_1d_to_1d_str[];
      extern size_t cl_internal_copy_image_1d_to_1d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,
                cl_internal_copy_image_1d_to_1d_str,
                (size_t)cl_internal_copy_image_1d_to_1d_str_size, NULL);
    }
  } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
      extern char cl_internal_copy_image_2d_to_2d_str[];
      extern size_t cl_internal_copy_image_2d_to_2d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,
                cl_internal_copy_image_2d_to_2d_str,
                (size_t)cl_internal_copy_image_2d_to_2d_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
      extern char cl_internal_copy_image_2d_to_3d_str[];
      extern size_t cl_internal_copy_image_2d_to_3d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_3D,
                cl_internal_copy_image_2d_to_3d_str,
                (size_t)cl_internal_copy_image_2d_to_3d_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
      extern char cl_internal_copy_image_2d_to_2d_array_str[];
      extern size_t cl_internal_copy_image_2d_to_2d_array_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_2D_ARRAY,
                cl_internal_copy_image_2d_to_2d_array_str,
                (size_t)cl_internal_copy_image_2d_to_2d_array_str_size, NULL);
    }
  } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) {
      extern char cl_internal_copy_image_1d_array_to_1d_array_str[];
      extern size_t cl_internal_copy_image_1d_array_to_1d_array_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_1D_ARRAY_TO_1D_ARRAY,
                cl_internal_copy_image_1d_array_to_1d_array_str,
                (size_t)cl_internal_copy_image_1d_array_to_1d_array_str_size, NULL);
    }
  } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
      extern char cl_internal_copy_image_2d_array_to_2d_array_str[];
      extern size_t cl_internal_copy_image_2d_array_to_2d_array_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D_ARRAY,
                cl_internal_copy_image_2d_array_to_2d_array_str,
                (size_t)cl_internal_copy_image_2d_array_to_2d_array_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
      extern char cl_internal_copy_image_2d_array_to_2d_str[];
      extern size_t cl_internal_copy_image_2d_array_to_2d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_2D,
                cl_internal_copy_image_2d_array_to_2d_str,
                (size_t)cl_internal_copy_image_2d_array_to_2d_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
      extern char cl_internal_copy_image_2d_array_to_3d_str[];
      extern size_t cl_internal_copy_image_2d_array_to_3d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_ARRAY_TO_3D,
                cl_internal_copy_image_2d_array_to_3d_str,
                (size_t)cl_internal_copy_image_2d_array_to_3d_str_size, NULL);
    }
  } else if (src_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
    if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D) {
      extern char cl_internal_copy_image_3d_to_2d_str[];
      extern size_t cl_internal_copy_image_3d_to_2d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx,
                CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,
                cl_internal_copy_image_3d_to_2d_str,
                (size_t)cl_internal_copy_image_3d_to_2d_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE3D) {
      extern char cl_internal_copy_image_3d_to_3d_str[];
      extern size_t cl_internal_copy_image_3d_to_3d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_3D,
                cl_internal_copy_image_3d_to_3d_str,
                (size_t)cl_internal_copy_image_3d_to_3d_str_size, NULL);
    } else if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY) {
      extern char cl_internal_copy_image_3d_to_2d_array_str[];
      extern size_t cl_internal_copy_image_3d_to_2d_array_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_2D_ARRAY,
                cl_internal_copy_image_3d_to_2d_array_str,
                (size_t)cl_internal_copy_image_3d_to_2d_array_str_size, NULL);
    }
  }

  if (!ker) {
    ret = CL_OUT_OF_RESOURCES;
    goto fail;
  }

  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_image);
  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_image);
  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_origin[0]);
  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &dst_origin[0]);
  cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]);
  cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]);

  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
                                  global_off, global_sz, global_sz,
                                  local_sz, local_sz);

fail:
  cl_kernel_delete(ker);
  if (fixupDataType) {
    src_image->intel_fmt = savedIntelFmt;
    dst_image->intel_fmt = savedIntelFmt;
  }
  return ret;
}

#define ALIGN16 16
#define ALIGN4  4
#define ALIGN1  1

/* Pick the widest texel size the copy can pretend the image has, so that each
   work item moves 16, 4 or 1 byte(s); also returns the matching raw format. */
static size_t
get_align_size_for_copy_kernel(struct _cl_mem_image* image, const size_t origin0,
                               const size_t region0, const size_t offset, cl_image_format *fmt)
{
  size_t align_size = 0;
  if (((image->w * image->bpp) % ALIGN16 == 0) &&
      ((origin0 * image->bpp) % ALIGN16 == 0) &&
      (region0 % ALIGN16 == 0) &&
      (offset % ALIGN16 == 0)) {
    fmt->image_channel_order = CL_RGBA;
    fmt->image_channel_data_type = CL_UNSIGNED_INT32;
    align_size = ALIGN16;
  } else if (((image->w * image->bpp) % ALIGN4 == 0) &&
             ((origin0 * image->bpp) % ALIGN4 == 0) &&
             (region0 % ALIGN4 == 0) &&
             (offset % ALIGN4 == 0)) {
    fmt->image_channel_order = CL_R;
    fmt->image_channel_data_type = CL_UNSIGNED_INT32;
    align_size = ALIGN4;
  } else {
    fmt->image_channel_order = CL_R;
    fmt->image_channel_data_type = CL_UNSIGNED_INT8;
    align_size = ALIGN1;
  }
  return align_size;
}
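/* Worked example (illustrative only, not part of the driver): alignment
 * selection for a hypothetical 2D CL_RGBA / CL_UNORM_INT8 image (bpp = 4)
 * of width 512, copying 256 pixels per row from pixel x = 4 at buffer
 * offset 64:
 *
 *   row bytes:    512 * 4 = 2048  -> multiple of 16
 *   origin bytes:   4 * 4 =   16  -> multiple of 16
 *   region bytes: 256 * 4 = 1024  -> multiple of 16
 *   offset:                  64   -> multiple of 16
 *
 * => ALIGN16: the image is re-viewed as CL_RGBA / CL_UNSIGNED_INT32 (16-byte
 *    texels) and each work item moves 16 bytes. If the origin were x = 3
 *    (12 bytes), only the ALIGN4 test would pass and the copy would fall back
 *    to 4-byte moves with a CL_R / CL_UNSIGNED_INT32 view.
 */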
LOCAL cl_int
cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image,
                            cl_mem buffer, const size_t *src_origin, const size_t dst_offset,
                            const size_t *region)
{
  cl_int ret;
  cl_kernel ker = NULL;
  size_t global_off[] = {0,0,0};
  size_t global_sz[] = {1,1,1};
  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
  uint32_t intel_fmt, bpp;
  cl_image_format fmt;
  size_t origin0, region0;
  size_t kn_dst_offset;
  size_t align_size = 1;
  size_t w_saved;

  if (region[1] == 1) local_sz[1] = 1;
  if (region[2] == 1) local_sz[2] = 1;
  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];

  /* We use one kernel to copy the data. The kernel is lazily created. */
  assert(image->base.ctx == buffer->ctx);

  /* Temporarily re-view the image with a wider (or byte) texel so the kernel
     can move raw bytes; the real format, bpp and width are restored below. */
  intel_fmt = image->intel_fmt;
  bpp = image->bpp;
  w_saved = image->w;
  region0 = region[0] * bpp;
  kn_dst_offset = dst_offset;
  align_size = get_align_size_for_copy_kernel(image, src_origin[0], region0, dst_offset, &fmt);
  image->intel_fmt = cl_image_get_intel_format(&fmt);
  image->w = (image->w * image->bpp) / align_size;
  image->bpp = align_size;
  region0 = (region[0] * bpp) / align_size;
  origin0 = (src_origin[0] * bpp) / align_size;
  kn_dst_offset /= align_size;
  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];

  /* Setup the kernel and run. */
  if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
    if (align_size == ALIGN16) {
      extern char cl_internal_copy_image_2d_to_buffer_align16_str[];
      extern size_t cl_internal_copy_image_2d_to_buffer_align16_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN16,
                cl_internal_copy_image_2d_to_buffer_align16_str,
                (size_t)cl_internal_copy_image_2d_to_buffer_align16_str_size, NULL);
    } else if (align_size == ALIGN4) {
      extern char cl_internal_copy_image_2d_to_buffer_align4_str[];
      extern size_t cl_internal_copy_image_2d_to_buffer_align4_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER_ALIGN4,
                cl_internal_copy_image_2d_to_buffer_align4_str,
                (size_t)cl_internal_copy_image_2d_to_buffer_align4_str_size, NULL);
    } else {
      extern char cl_internal_copy_image_2d_to_buffer_str[];
      extern size_t cl_internal_copy_image_2d_to_buffer_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_2D_TO_BUFFER,
                cl_internal_copy_image_2d_to_buffer_str,
                (size_t)cl_internal_copy_image_2d_to_buffer_str_size, NULL);
    }
  } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
    if (align_size == ALIGN16) {
      extern char cl_internal_copy_image_3d_to_buffer_align16_str[];
      extern size_t cl_internal_copy_image_3d_to_buffer_align16_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN16,
                cl_internal_copy_image_3d_to_buffer_align16_str,
                (size_t)cl_internal_copy_image_3d_to_buffer_align16_str_size, NULL);
    } else if (align_size == ALIGN4) {
      extern char cl_internal_copy_image_3d_to_buffer_align4_str[];
      extern size_t cl_internal_copy_image_3d_to_buffer_align4_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER_ALIGN4,
                cl_internal_copy_image_3d_to_buffer_align4_str,
                (size_t)cl_internal_copy_image_3d_to_buffer_align4_str_size, NULL);
    } else {
      extern char cl_internal_copy_image_3d_to_buffer_str[];
      extern size_t cl_internal_copy_image_3d_to_buffer_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_IMAGE_3D_TO_BUFFER,
                cl_internal_copy_image_3d_to_buffer_str,
                (size_t)cl_internal_copy_image_3d_to_buffer_str_size, NULL);
    }
  }

  if (!ker) {
    ret = CL_OUT_OF_RESOURCES;
    goto fail;
  }

  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &src_origin[1]);
  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]);
  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset);

  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
                                  global_off, global_sz, global_sz,
                                  local_sz, local_sz);

fail:
  cl_kernel_delete(ker);
  /* Restore the image's real format, bpp and width. */
  image->intel_fmt = intel_fmt;
  image->bpp = bpp;
  image->w = w_saved;
  return ret;
}
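/* For reference (illustrative only): an application-level call that this
 * helper is meant to service, assuming the enqueue path routes
 * clEnqueueCopyImageToBuffer() here. The handles below are hypothetical.
 */
#if 0
  const size_t src_origin[3] = {0, 0, 0};
  const size_t region[3] = {512, 512, 1};
  cl_int err = clEnqueueCopyImageToBuffer(queue, image, buffer,
                                          src_origin, region,
                                          0 /* dst_offset */,
                                          0, NULL, NULL);
#endif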
LOCAL cl_int
cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer,
                            struct _cl_mem_image* image, const size_t src_offset,
                            const size_t *dst_origin, const size_t *region)
{
  cl_int ret;
  cl_kernel ker = NULL;
  size_t global_off[] = {0,0,0};
  size_t global_sz[] = {1,1,1};
  size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_2};
  uint32_t intel_fmt, bpp;
  cl_image_format fmt;
  size_t origin0, region0;
  size_t kn_src_offset;
  size_t align_size = 1;
  size_t w_saved = 0;

  if (region[1] == 1) local_sz[1] = 1;
  if (region[2] == 1) local_sz[2] = 1;
  global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
  global_sz[1] = ((region[1] + local_sz[1] - 1) / local_sz[1]) * local_sz[1];
  global_sz[2] = ((region[2] + local_sz[2] - 1) / local_sz[2]) * local_sz[2];

  /* We use one kernel to copy the data. The kernel is lazily created. */
  assert(image->base.ctx == buffer->ctx);

  /* Temporarily re-view the image with a wider (or byte) texel, exactly as in
     cl_mem_copy_image_to_buffer(); the real fields are restored below. */
  intel_fmt = image->intel_fmt;
  bpp = image->bpp;
  w_saved = image->w;
  region0 = region[0] * bpp;
  kn_src_offset = src_offset;
  align_size = get_align_size_for_copy_kernel(image, dst_origin[0], region0, src_offset, &fmt);
  image->intel_fmt = cl_image_get_intel_format(&fmt);
  image->w = (image->w * image->bpp) / align_size;
  image->bpp = align_size;
  region0 = (region[0] * bpp) / align_size;
  origin0 = (dst_origin[0] * bpp) / align_size;
  kn_src_offset /= align_size;
  global_sz[0] = ((region0 + local_sz[0] - 1) / local_sz[0]) * local_sz[0];

  /* Setup the kernel and run. */
  if (image->image_type == CL_MEM_OBJECT_IMAGE2D) {
    if (align_size == ALIGN16) {
      extern char cl_internal_copy_buffer_to_image_2d_align16_str[];
      extern size_t cl_internal_copy_buffer_to_image_2d_align16_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN16,
                cl_internal_copy_buffer_to_image_2d_align16_str,
                (size_t)cl_internal_copy_buffer_to_image_2d_align16_str_size, NULL);
    } else if (align_size == ALIGN4) {
      extern char cl_internal_copy_buffer_to_image_2d_align4_str[];
      extern size_t cl_internal_copy_buffer_to_image_2d_align4_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D_ALIGN4,
                cl_internal_copy_buffer_to_image_2d_align4_str,
                (size_t)cl_internal_copy_buffer_to_image_2d_align4_str_size, NULL);
    } else {
      extern char cl_internal_copy_buffer_to_image_2d_str[];
      extern size_t cl_internal_copy_buffer_to_image_2d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_2D,
                cl_internal_copy_buffer_to_image_2d_str,
                (size_t)cl_internal_copy_buffer_to_image_2d_str_size, NULL);
    }
  } else if (image->image_type == CL_MEM_OBJECT_IMAGE3D) {
    if (align_size == ALIGN16) {
      extern char cl_internal_copy_buffer_to_image_3d_align16_str[];
      extern size_t cl_internal_copy_buffer_to_image_3d_align16_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN16,
                cl_internal_copy_buffer_to_image_3d_align16_str,
                (size_t)cl_internal_copy_buffer_to_image_3d_align16_str_size, NULL);
    } else if (align_size == ALIGN4) {
      extern char cl_internal_copy_buffer_to_image_3d_align4_str[];
      extern size_t cl_internal_copy_buffer_to_image_3d_align4_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D_ALIGN4,
                cl_internal_copy_buffer_to_image_3d_align4_str,
                (size_t)cl_internal_copy_buffer_to_image_3d_align4_str_size, NULL);
    } else {
      extern char cl_internal_copy_buffer_to_image_3d_str[];
      extern size_t cl_internal_copy_buffer_to_image_3d_str_size;
      ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_TO_IMAGE_3D,
                cl_internal_copy_buffer_to_image_3d_str,
                (size_t)cl_internal_copy_buffer_to_image_3d_str_size, NULL);
    }
  }

  /* The format/bpp/width must be restored on every exit path, so route the
     !ker case through the same cleanup instead of returning early. */
  if (!ker) {
    ret = CL_OUT_OF_RESOURCES;
    goto fail;
  }

  cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &image);
  cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &buffer);
  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
  cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
  cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
  cl_kernel_set_arg(ker, 5, sizeof(cl_int), &origin0);
  cl_kernel_set_arg(ker, 6, sizeof(cl_int), &dst_origin[1]);
  cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]);
  cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset);

  ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off,
                                  global_off, global_sz, global_sz,
                                  local_sz, local_sz);

fail:
  cl_kernel_delete(ker);
  image->intel_fmt = intel_fmt;
  image->bpp = bpp;
  image->w = w_saved;
  return ret;
}
LOCAL void*
cl_mem_map(cl_mem mem, int write)
{
  cl_buffer_map(mem->bo, write);
  assert(cl_buffer_get_virtual(mem->bo));
  return cl_buffer_get_virtual(mem->bo);
}

LOCAL cl_int
cl_mem_unmap(cl_mem mem)
{
  cl_buffer_unmap(mem->bo);
  return CL_SUCCESS;
}

LOCAL void*
cl_mem_map_gtt(cl_mem mem)
{
  cl_buffer_map_gtt(mem->bo);
  assert(cl_buffer_get_virtual(mem->bo));
  mem->mapped_gtt = 1;
  return cl_buffer_get_virtual(mem->bo);
}

LOCAL void *
cl_mem_map_gtt_unsync(cl_mem mem)
{
  cl_buffer_map_gtt_unsync(mem->bo);
  assert(cl_buffer_get_virtual(mem->bo));
  return cl_buffer_get_virtual(mem->bo);
}

LOCAL cl_int
cl_mem_unmap_gtt(cl_mem mem)
{
  cl_buffer_unmap_gtt(mem->bo);
  return CL_SUCCESS;
}
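/* Informational note (added for clarity, not in the original): tiled BOs are
 * mapped through the GTT aperture, which is expected to present the buffer in
 * linear layout to the CPU, while linear BOs and userptr allocations can be
 * mapped (or returned) directly. cl_mem_map_auto() below picks the right path
 * per object, and cl_mem_unmap_auto() undoes whichever mapping was taken.
 */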
LOCAL void*
cl_mem_map_auto(cl_mem mem, int write)
{
  /* If mem was not created from a userptr, the offset must always be zero. */
  if (!mem->is_userptr)
    assert(mem->offset == 0);

  if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
    return cl_mem_map_gtt(mem);
  else {
    if (mem->is_userptr) {
      cl_buffer_wait_rendering(mem->bo);
      return mem->host_ptr;
    } else
      return cl_mem_map(mem, write);
  }
}

LOCAL cl_int
cl_mem_unmap_auto(cl_mem mem)
{
  if (mem->mapped_gtt == 1) {
    cl_buffer_unmap_gtt(mem->bo);
    mem->mapped_gtt = 0;
  } else if (!mem->is_userptr)
    cl_buffer_unmap(mem->bo);
  return CL_SUCCESS;
}
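/* Illustrative usage (not part of the driver): a typical CPU-side write
 * through the auto map/unmap pair. 'mem' is a hypothetical linear buffer.
 */
#if 0
  void *p = cl_mem_map_auto(mem, 1 /* write */);
  if (p) {
    memset(p, 0, mem->size);   /* CPU access to the BO contents */
    cl_mem_unmap_auto(mem);    /* undoes whichever mapping was taken */
  }
#endif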
LOCAL cl_int
cl_mem_pin(cl_mem mem)
{
  assert(mem);
  if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
    return CL_INVALID_MEM_OBJECT;
  cl_buffer_pin(mem->bo, 4096);
  return CL_SUCCESS;
}

LOCAL cl_int
cl_mem_unpin(cl_mem mem)
{
  assert(mem);
  if (UNLIKELY((mem->flags & CL_MEM_PINNABLE) == 0))
    return CL_INVALID_MEM_OBJECT;
  cl_buffer_unpin(mem->bo);
  return CL_SUCCESS;
}

LOCAL cl_mem
cl_mem_new_libva_buffer(cl_context ctx, unsigned int bo_name, cl_int* errcode)
{
  cl_int err = CL_SUCCESS;
  cl_mem mem = NULL;

  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, NULL, &err);
  if (mem == NULL || err != CL_SUCCESS)
    goto error;

  size_t sz = 0;
  mem->bo = cl_buffer_get_buffer_from_libva(ctx, bo_name, &sz);
  if (mem->bo == NULL) {
    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    goto error;
  }
  mem->size = sz;

exit:
  if (errcode)
    *errcode = err;
  return mem;
error:
  cl_mem_delete(mem);
  mem = NULL;
  goto exit;
}

LOCAL cl_mem
cl_mem_new_libva_image(cl_context ctx,
                       unsigned int bo_name, size_t offset,
                       size_t width, size_t height,
                       cl_image_format fmt,
                       size_t row_pitch,
                       cl_int *errcode)
{
  cl_int err = CL_SUCCESS;
  cl_mem mem = NULL;
  struct _cl_mem_image *image = NULL;
  uint32_t intel_fmt, bpp;

  /* Get the size of each pixel. */
  if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
    goto error;

  intel_fmt = cl_image_get_intel_format(&fmt);
  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
    goto error;
  }

  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, NULL, &err);
  if (mem == NULL || err != CL_SUCCESS)
    goto error;

  image = cl_mem_image(mem);

  mem->bo = cl_buffer_get_image_from_libva(ctx, bo_name, image);
  if (mem->bo == NULL) {
    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    goto error;
  }

  image->w = width;
  image->h = height;
  image->image_type = CL_MEM_OBJECT_IMAGE2D;
  image->depth = 1;
  image->fmt = fmt;
  image->intel_fmt = intel_fmt;
  image->bpp = bpp;
  image->row_pitch = row_pitch;
  image->slice_pitch = 0;
  /* NOTE: the image tiling is set in cl_buffer_get_image_from_libva(). */
  image->tile_x = 0;
  image->tile_y = 0;
  image->offset = offset;

exit:
  if (errcode)
    *errcode = err;
  return mem;
error:
  cl_mem_delete(mem);
  mem = NULL;
  goto exit;
}

LOCAL cl_int
cl_mem_get_fd(cl_mem mem, int* fd)
{
  cl_int err = CL_SUCCESS;
  if (cl_buffer_get_fd(mem->bo, fd))
    err = CL_INVALID_OPERATION;
  return err;
}

LOCAL cl_mem
cl_mem_new_buffer_from_fd(cl_context ctx, int fd, int buffer_sz, cl_int* errcode)
{
  cl_int err = CL_SUCCESS;
  cl_mem mem = NULL;

  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, NULL, &err);
  if (mem == NULL || err != CL_SUCCESS)
    goto error;

  mem->bo = cl_buffer_get_buffer_from_fd(ctx, fd, buffer_sz);
  if (mem->bo == NULL) {
    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    goto error;
  }
  mem->size = buffer_sz;

exit:
  if (errcode)
    *errcode = err;
  return mem;
error:
  cl_mem_delete(mem);
  mem = NULL;
  goto exit;
}

LOCAL cl_mem
cl_mem_new_image_from_fd(cl_context ctx,
                         int fd, int image_sz,
                         size_t offset,
                         size_t width, size_t height,
                         cl_image_format fmt,
                         size_t row_pitch,
                         cl_int *errcode)
{
  cl_int err = CL_SUCCESS;
  cl_mem mem = NULL;
  struct _cl_mem_image *image = NULL;
  uint32_t intel_fmt, bpp;

  /* Get the size of each pixel. */
  if (UNLIKELY((err = cl_image_byte_per_pixel(&fmt, &bpp)) != CL_SUCCESS))
    goto error;

  intel_fmt = cl_image_get_intel_format(&fmt);
  if (intel_fmt == INTEL_UNSUPPORTED_FORMAT) {
    err = CL_IMAGE_FORMAT_NOT_SUPPORTED;
    goto error;
  }

  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, NULL, &err);
  if (mem == NULL || err != CL_SUCCESS)
    goto error;

  image = cl_mem_image(mem);

  mem->bo = cl_buffer_get_image_from_fd(ctx, fd, image_sz, image);
  if (mem->bo == NULL) {
    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
    goto error;
  }
  mem->size = image_sz;

  image->w = width;
  image->h = height;
  image->image_type = CL_MEM_OBJECT_IMAGE2D;
  image->depth = 1;
  image->fmt = fmt;
  image->intel_fmt = intel_fmt;
  image->bpp = bpp;
  image->row_pitch = row_pitch;
  image->slice_pitch = 0;
  /* NOTE: the image tiling is set in cl_buffer_get_image_from_fd(). */
  image->tile_x = 0;
  image->tile_y = 0;
  image->offset = offset;

exit:
  if (errcode)
    *errcode = err;
  return mem;
error:
  cl_mem_delete(mem);
  mem = NULL;
  goto exit;
}

/* Find (or grow) a free slot in mem->mapped_ptr to record a new mapping. */
static cl_int
get_mapped_address(cl_mem mem)
{
  cl_int slot = -1;
  if (!mem->mapped_ptr_sz) {
    mem->mapped_ptr_sz = 16;
    mem->mapped_ptr = (cl_mapped_ptr *)malloc(sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz);
    if (!mem->mapped_ptr) {
      cl_mem_unmap_auto(mem);
      return slot;
    }
    memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
    slot = 0;
  } else {
    int i = 0;
    for (; i < mem->mapped_ptr_sz; i++) {
      if (mem->mapped_ptr[i].ptr == NULL) {
        slot = i;
        break;
      }
    }
    if (i == mem->mapped_ptr_sz) {
      /* No free slot left: double the array. */
      cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc(sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2);
      if (!new_ptr) {
        cl_mem_unmap_auto(mem);
        return slot;
      }
      memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
      memcpy(new_ptr, mem->mapped_ptr, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr));
      slot = mem->mapped_ptr_sz;
      mem->mapped_ptr_sz *= 2;
      free(mem->mapped_ptr);
      mem->mapped_ptr = new_ptr;
    }
  }
  assert(slot != -1);
  return slot;
}
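/* Illustrative note (not part of the driver): slot bookkeeping in
 * get_mapped_address(). The table starts at 16 entries, any entry whose ptr
 * is NULL is reused, and the table doubles (16 -> 32 -> 64 ...) when full:
 * the 17th concurrent mapping triggers the first reallocation, with the old
 * entries copied over. On allocation failure the pending mapping is undone
 * via cl_mem_unmap_auto() and -1 is returned.
 */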
  cl_int slot = -1;
  int err = CL_SUCCESS;
  size_t sub_offset = 0;

  //ptr = (char*)ptr + offset + sub_offset;
  if (mem->flags & CL_MEM_USE_HOST_PTR) {
    assert(mem->host_ptr);
    /* Only compute the pointer here; the memcpy is done at enqueue time. */
    *mem_ptr = (char*)ptr + offset + sub_offset;
  } else {
    *mem_ptr = ptr;
  }

  /* Record the mapped address. */
  slot = get_mapped_address(mem);
  if (slot == -1) {
    err = CL_OUT_OF_HOST_MEMORY;
    goto error;
  }
  mem->mapped_ptr[slot].ptr = *mem_ptr;
  mem->mapped_ptr[slot].v_ptr = ptr;
  mem->mapped_ptr[slot].size = size;
  mem->mapped_ptr[slot].ker_write_map = write_map;
  mem->mapped_ptr[slot].tmp_ker_buf = tmp_ker_buf;
  if (origin) {
    assert(region);
    mem->mapped_ptr[slot].origin[0] = origin[0];
    mem->mapped_ptr[slot].origin[1] = origin[1];
    mem->mapped_ptr[slot].origin[2] = origin[2];
    mem->mapped_ptr[slot].region[0] = region[0];
    mem->mapped_ptr[slot].region[1] = region[1];
    mem->mapped_ptr[slot].region[2] = region[2];
  }
  mem->map_ref++;

error:
  if (err != CL_SUCCESS)
    *mem_ptr = NULL;
  return err;
}

LOCAL cl_int
cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset,
                      size_t size, const size_t *origin, const size_t *region)
{
  /* TODO: Need to add MT-safe logic. */
  cl_int slot = -1;
  int err = CL_SUCCESS;
  size_t sub_offset = 0;

  if (mem->type == CL_MEM_SUBBUFFER_TYPE) {
    struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
    sub_offset = buffer->sub_offset;
  }

  ptr = (char*)ptr + offset + sub_offset;
  if (mem->flags & CL_MEM_USE_HOST_PTR) {
    assert(mem->host_ptr);
    /* Only compute the pointer here; the memcpy is done at enqueue time. */
    *mem_ptr = (char *)mem->host_ptr + offset + sub_offset;
  } else {
    *mem_ptr = ptr;
  }

  /* Record the mapped address. */
  slot = get_mapped_address(mem);
  if (slot == -1) {
    err = CL_OUT_OF_HOST_MEMORY;
    goto error;
  }
  mem->mapped_ptr[slot].ptr = *mem_ptr;
  mem->mapped_ptr[slot].v_ptr = ptr;
  mem->mapped_ptr[slot].size = size;
  if (origin) {
    assert(region);
    mem->mapped_ptr[slot].origin[0] = origin[0];
    mem->mapped_ptr[slot].origin[1] = origin[1];
    mem->mapped_ptr[slot].origin[2] = origin[2];
    mem->mapped_ptr[slot].region[0] = region[0];
    mem->mapped_ptr[slot].region[1] = region[1];
    mem->mapped_ptr[slot].region[2] = region[2];
  }
  mem->map_ref++;

error:
  if (err != CL_SUCCESS)
    *mem_ptr = NULL;
  return err;
}

LOCAL cl_int
cl_mem_set_destructor_callback(cl_mem memobj,
                               void(CL_CALLBACK *pfn_notify)(cl_mem, void *), void *user_data)
{
  cl_mem_dstr_cb cb = cl_calloc(1, sizeof(_cl_mem_dstr_cb));
  if (cb == NULL) {
    return CL_OUT_OF_HOST_MEMORY;
  }

  memset(cb, 0, sizeof(_cl_mem_dstr_cb));
  list_node_init(&cb->node);
  cb->pfn_notify = pfn_notify;
  cb->user_data = user_data;

  CL_OBJECT_LOCK(memobj);
  list_add(&memobj->dstr_cb_head, &cb->node);
  CL_OBJECT_UNLOCK(memobj);
  return CL_SUCCESS;
}
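/* Illustrative usage (not part of the driver): what a client registration
 * looks like through the public OpenCL entry point, which is expected to
 * forward to cl_mem_set_destructor_callback(). Names below are hypothetical.
 */
#if 0
static void CL_CALLBACK
free_backing_store(cl_mem memobj, void *user_data)
{
  free(user_data);   /* runs when memobj's refcount drops to zero */
}

  /* ... inside some client function ... */
  void *backing = malloc(1024);
  cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, 1024, backing, &err);
  clSetMemObjectDestructorCallback(buf, free_backing_store, backing);
  /* Per the OpenCL spec, callbacks fire in reverse registration order. */
#endif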