/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Benjamin Segovia
 */

#include "cl_kernel.h"
#include "cl_program.h"
#include "cl_device_id.h"
#include "cl_context.h"
#include "cl_mem.h"
#include "cl_alloc.h"
#include "cl_utils.h"
#include "cl_khr_icd.h"
#include "CL/cl.h"
#include "cl_sampler.h"
#include "cl_accelerator_intel.h"
#include "cl_cmrt.h"

/* NOTE(review): the names of the five system headers were missing from the
 * reviewed text; this list is reconstructed from what the file uses
 * (memcpy/strcmp/strstr, assert, uint32_t) -- confirm against the repository. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <stdlib.h>

/*
 * Release one reference on kernel k. When CL_OBJECT_DEC_REF reports that
 * other references remain (value > 1) nothing else happens; otherwise every
 * resource held by the kernel is released and the kernel itself is freed.
 * A NULL kernel is ignored.
 */
LOCAL void
cl_kernel_delete(cl_kernel k)
{
  uint32_t i;

  if (k == NULL)
    return;

#ifdef HAS_CMRT
  /* NOTE(review): a CMRT-backed kernel is destroyed right away, without
   * consulting the reference count below -- confirm this is intended. */
  if (k->cmrt_kernel != NULL) {
    cmrt_destroy_kernel(k);
    CL_OBJECT_DESTROY_BASE(k);
    cl_free(k);
    return;
  }
#endif

  /* We are not done with the kernel */
  if (CL_OBJECT_DEC_REF(k) > 1)
    return;

  /* Release one reference on all bos we own */
  if (k->bo)
    cl_buffer_unreference(k->bo);

  /* Release the curbe if allocated */
  if (k->curbe)
    cl_free(k->curbe);

  /* Release the argument array if required */
  if (k->args) {
    for (i = 0; i < k->arg_n; ++i) {
      if (k->args[i].mem != NULL)
        cl_mem_delete(k->args[i].mem);
    }
    cl_free(k->args);
  }

  /* Test the pointer rather than image_sz: image_sz may already be non-zero
   * while the images array allocation failed (see cl_kernel_dup). */
  if (k->images)
    cl_free(k->images);
  if (k->exec_info)
    cl_free(k->exec_info);
  /* Needs k->program->ctx, so it must run before the program is released. */
  if (k->device_enqueue_ptr)
    cl_mem_svm_delete(k->program->ctx, k->device_enqueue_ptr);
  if (k->device_enqueue_infos)
    cl_free(k->device_enqueue_infos);

  /* This will be true for kernels created by clCreateKernel. Done last:
   * dropping the program reference may destroy the program, and k->program
   * is still dereferenced above. */
  if (k->ref_its_program)
    cl_program_delete(k->program);

  CL_OBJECT_DESTROY_BASE(k);
  cl_free(k);
}

/*
 * Allocate a zero-initialized kernel object attached to program p.
 * Returns the new kernel, or NULL when the allocation fails. No reference
 * is taken on p here.
 */
LOCAL cl_kernel
cl_kernel_new(cl_program p)
{
  cl_kernel k = NULL;
  TRY_ALLOC_NO_ERR (k, CALLOC(struct _cl_kernel));
  CL_OBJECT_INIT_BASE(k, CL_OBJECT_KERNEL_MAGIC);
  k->program = p;
  k->cmrt_kernel = NULL;

exit:
  return k;
error:
  cl_kernel_delete(k);
  k = NULL;
  goto exit;
}

/* Return the name reported by the compiler backend for k, or NULL if k is NULL. */
LOCAL const char*
cl_kernel_get_name(cl_kernel k)
{
  if (UNLIKELY(k == NULL))
    return NULL;
  return interp_kernel_get_name(k->opaque);
}

/* Return the attribute string reported by the compiler backend for k, or
 * NULL if k is NULL. */
LOCAL const char*
cl_kernel_get_attributes(cl_kernel k)
{
  if (UNLIKELY(k == NULL))
    return NULL;
  return interp_kernel_get_attributes(k->opaque);
}

/* Take one additional reference on kernel k. */
LOCAL void
cl_kernel_add_ref(cl_kernel k)
{
  CL_OBJECT_INC_REF(k);
}

/*
 * Set argument `index` of kernel k from the `sz` bytes at `value`.
 *
 * The argument kind and expected size come from the compiler backend.
 * Depending on the kind the value is copied into the curbe (by-value
 * arguments, samplers), only its size is recorded (local pointers), or the
 * memory object is retained in k->args[index] (buffers, images, pipes).
 *
 * Returns CL_SUCCESS, or one of CL_INVALID_ARG_INDEX, CL_INVALID_ARG_SIZE,
 * CL_INVALID_ARG_VALUE, CL_INVALID_SAMPLER, CL_INVALID_MEM_OBJECT,
 * CL_INVALID_ACCELERATOR_TYPE_INTEL.
 */
LOCAL cl_int
cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
{
  int32_t offset;             /* where to patch */
  enum gbe_arg_type arg_type; /* kind of argument */
  size_t arg_sz;              /* size of the argument */
  cl_mem mem = NULL;          /* for __global, __constant and image arguments */
  cl_sampler sampler = NULL;  /* validated handle for sampler arguments */
  cl_context ctx = k->program->ctx;

  if (UNLIKELY(index >= k->arg_n))
    return CL_INVALID_ARG_INDEX;
  arg_type = interp_kernel_get_arg_type(k->opaque, index);
  arg_sz = interp_kernel_get_arg_size(k->opaque, index);

  if (k->vme && index == 0) {
    /* The best method is to return the arg type of GBE_ARG_ACCELERATOR_INTEL
     * but it is not straightforward since clang does not support it now.
     * The easy way is to consider typedef accelerator_intel_t as a struct;
     * this easy way makes the size mismatched, so use another size check
     * method. */
    const cl_accelerator_intel *accel = (const cl_accelerator_intel *)value;
    if (sz != sizeof(cl_accelerator_intel) ||
        arg_sz != sizeof(cl_motion_estimation_desc_intel))
      return CL_INVALID_ARG_SIZE;
    /* The handle is dereferenced just below, so reject NULL first. */
    if (UNLIKELY(accel == NULL || *accel == NULL))
      return CL_INVALID_ARG_VALUE;
    if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
      return CL_INVALID_ACCELERATOR_TYPE_INTEL;
  } else if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
    /* A sampler may also be passed as a cl_sampler handle. */
    if (arg_type != GBE_ARG_SAMPLER || sz != sizeof(cl_sampler))
      return CL_INVALID_ARG_SIZE;
  }

  if (UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
    return CL_INVALID_ARG_SIZE;

  if (arg_type == GBE_ARG_VALUE) {
    if (UNLIKELY(value == NULL))
      return CL_INVALID_ARG_VALUE;
  } else if (arg_type == GBE_ARG_LOCAL_PTR) {
    if (UNLIKELY(value != NULL))
      return CL_INVALID_ARG_VALUE;
  } else if (arg_type == GBE_ARG_SAMPLER) {
    if (UNLIKELY(value == NULL))
      return CL_INVALID_ARG_VALUE;
    sampler = *(const cl_sampler *)value;
    if (!CL_OBJECT_IS_SAMPLER(sampler))
      return CL_INVALID_SAMPLER;
  } else {
    /* should be image, pipe, GLOBAL_PTR, CONSTANT_PTR */
    const int needs_object =
      (arg_type == GBE_ARG_IMAGE || arg_type == GBE_ARG_PIPE);

    if (value != NULL)
      mem = *(const cl_mem *)value;
    /* Only buffer pointers may be NULL; an image or pipe needs an object. */
    if (UNLIKELY(mem == NULL && needs_object))
      return CL_INVALID_ARG_VALUE;

    if (mem != NULL) {
      if (CL_SUCCESS != cl_mem_is_valid(mem, ctx))
        return CL_INVALID_MEM_OBJECT;
      if (UNLIKELY((arg_type == GBE_ARG_IMAGE && !IS_IMAGE(mem)) ||
                   (arg_type != GBE_ARG_IMAGE && IS_IMAGE(mem))))
        return CL_INVALID_ARG_VALUE;
    }

    /* Look inside the pipe only once the object is known to be valid. */
    if (arg_type == GBE_ARG_PIPE) {
      _cl_mem_pipe *pipe = cl_mem_pipe(mem);
      /* Info slot 5 lies past the five slots used by cl_get_kernel_arg_info;
       * it is compared with the pipe packet size here.
       * NOTE(review): presumably the packet type size -- confirm. */
      size_t type_size = (size_t)interp_kernel_get_arg_info(k->opaque, index, 5);
      if (pipe->packet_size != type_size)
        return CL_INVALID_ARG_VALUE;
    }
  }

  /* Copy the structure or the value directly into the curbe */
  if (arg_type == GBE_ARG_VALUE) {
    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
    if (k->vme && index == 0) {
      cl_accelerator_intel accel;
      memcpy(&accel, value, sz);
      if (offset >= 0) {
        /* arg_sz bytes are written, so arg_sz (not sz) must fit. */
        assert(offset + arg_sz <= k->curbe_sz);
        memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
      }
      k->accel = accel;
    } else if (offset >= 0) {
      assert(offset + sz <= k->curbe_sz);
      memcpy(k->curbe + offset, value, sz);
    }
    k->args[index].local_sz = 0;
    k->args[index].is_set = 1;
    k->args[index].mem = NULL;
    return CL_SUCCESS;
  }

  /* For a local pointer just save the size */
  if (arg_type == GBE_ARG_LOCAL_PTR) {
    k->args[index].local_sz = sz;
    k->args[index].is_set = 1;
    k->args[index].mem = NULL;
    return CL_SUCCESS;
  }

  /* Is it a sampler */
  if (arg_type == GBE_ARG_SAMPLER) {
    /* `sampler` is the handle validated above; it is not re-read with a
     * caller-sized memcpy because sz may differ from sizeof(cl_sampler). */
    k->args[index].local_sz = 0;
    k->args[index].is_set = 1;
    k->args[index].mem = NULL;
    k->args[index].sampler = sampler;
    cl_set_sampler_arg_slot(k, index, sampler);
    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
    if (offset >= 0) {
      assert(offset + 4 <= k->curbe_sz);
      memcpy(k->curbe + offset, &sampler->clkSamplerValue, 4);
    }
    return CL_SUCCESS;
  }

  if (mem == NULL) {
    /* for buffer object GLOBAL_PTR CONSTANT_PTR, it maybe NULL */
    assert(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR);
    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
    if (offset >= 0)
      *((uint32_t *)(k->curbe + offset)) = 0;
    if (k->args[index].mem)
      cl_mem_delete(k->args[index].mem);
    k->args[index].mem = NULL;
    k->args[index].is_set = 1;
    k->args[index].local_sz = 0;
    return CL_SUCCESS;
  }

  /* Retain the new object before releasing the old one, so that setting the
   * same object twice never drops its last reference. */
  cl_mem_add_ref(mem);
  if (k->args[index].mem)
    cl_mem_delete(k->args[index].mem);
  k->args[index].mem = mem;
  k->args[index].is_set = 1;
  k->args[index].is_svm = mem->is_svm;
  if (mem->is_svm)
    k->args[index].ptr = mem->host_ptr;
  k->args[index].local_sz = 0;
  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
  return CL_SUCCESS;
}

/*
 * Set argument `index` of kernel k to the SVM pointer `value`.
 *
 * The pointer must resolve to an SVM object of the kernel's context and the
 * argument must be a __global or __constant pointer. Returns CL_SUCCESS,
 * CL_INVALID_ARG_INDEX or CL_INVALID_ARG_VALUE.
 */
LOCAL cl_int
cl_kernel_set_arg_svm_pointer(cl_kernel k, cl_uint index, const void *value)
{
  enum gbe_arg_type arg_type; /* kind of argument */
  cl_context ctx = k->program->ctx;
  cl_mem mem = cl_context_get_svm_from_ptr(ctx, value);

  if (UNLIKELY(index >= k->arg_n))
    return CL_INVALID_ARG_INDEX;
  arg_type = interp_kernel_get_arg_type(k->opaque, index);

  if (arg_type != GBE_ARG_GLOBAL_PTR && arg_type != GBE_ARG_CONSTANT_PTR)
    return CL_INVALID_ARG_VALUE;

  if (mem == NULL)
    return CL_INVALID_ARG_VALUE;

  cl_mem_add_ref(mem);
  if (k->args[index].mem)
    cl_mem_delete(k->args[index].mem);

  k->args[index].ptr = (void *)value;
  k->args[index].mem = mem;
  k->args[index].is_set = 1;
  k->args[index].is_svm = 1;
  k->args[index].local_sz = 0;
  k->args[index].bti = interp_kernel_get_arg_bti(k->opaque, index);
  return CL_SUCCESS;
}

/*
 * Store a private copy of the `n` bytes at `value` as the kernel's exec
 * info; the entry count is n / sizeof(void *).
 *
 * n == 0 leaves the kernel untouched. A previously stored copy is released
 * once the new one is in place; if the allocation fails the previous state
 * is kept. Returns CL_SUCCESS, CL_INVALID_VALUE (NULL value with n != 0) or
 * the error code set by TRY_ALLOC.
 */
LOCAL cl_int
cl_kernel_set_exec_info(cl_kernel k, size_t n, const void *value)
{
  cl_int err = CL_SUCCESS;
  void *new_info = NULL;
  assert(k != NULL);

  if (n == 0)
    return err;
  if (UNLIKELY(value == NULL))
    return CL_INVALID_VALUE;

  TRY_ALLOC(new_info, cl_calloc(n, 1));
  memcpy(new_info, value, n);

  if (k->exec_info)
    cl_free(k->exec_info);
  k->exec_info = new_info;
  k->exec_info_n = n / sizeof(void *);

error:
  return err;
}

/*
 * Answer a clGetKernelArgInfo-style query for argument arg_index of k.
 *
 * The required size is written to *param_value_size_ret when that pointer is
 * non-NULL; the value is written to param_value when it is non-NULL and
 * param_value_size is large enough. Returns CL_SUCCESS, CL_INVALID_VALUE
 * (buffer too small or unknown param_name) or
 * CL_KERNEL_ARG_INFO_NOT_AVAILABLE (the backend has no string for the query).
 */
LOCAL int
cl_get_kernel_arg_info(cl_kernel k, cl_uint arg_index, cl_kernel_arg_info param_name,
                       size_t param_value_size, void *param_value, size_t *param_value_size_ret)
{
  assert(k != NULL);
  /* The backend indexes its info slots relative to the first CL constant. */
  void *ret_info = interp_kernel_get_arg_info(k->opaque, arg_index,
                           param_name - CL_KERNEL_ARG_ADDRESS_QUALIFIER);
  uint32_t arg_type = interp_kernel_get_arg_type(k->opaque, arg_index);
  size_t str_len = 0;
  cl_kernel_arg_type_qualifier type_qual = CL_KERNEL_ARG_TYPE_NONE;

  switch (param_name) {
  case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
    if (param_value_size_ret)
      *param_value_size_ret = sizeof(cl_kernel_arg_address_qualifier);
    if (!param_value)
      return CL_SUCCESS;
    if (param_value_size < sizeof(cl_kernel_arg_address_qualifier))
      return CL_INVALID_VALUE;
    /* Here ret_info carries a small integer code, not a pointer. */
    switch ((size_t)ret_info) {
    case 1:
    case 4:
      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_GLOBAL;
      break;
    case 2:
      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_CONSTANT;
      break;
    case 3:
      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_LOCAL;
      break;
    default:
      /* Code 0, and: if no address qualifier is specified, the default
       * address qualifier which is CL_KERNEL_ARG_ADDRESS_PRIVATE is
       * returned. */
      *(cl_kernel_arg_address_qualifier *)param_value = CL_KERNEL_ARG_ADDRESS_PRIVATE;
      break;
    }
    return CL_SUCCESS;

  case CL_KERNEL_ARG_ACCESS_QUALIFIER:
    if (param_value_size_ret)
      *param_value_size_ret = sizeof(cl_kernel_arg_access_qualifier);
    if (!param_value)
      return CL_SUCCESS;
    if (param_value_size < sizeof(cl_kernel_arg_access_qualifier))
      return CL_INVALID_VALUE;
    if (ret_info == NULL)
      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
    if (!strcmp((const char *)ret_info, "write_only")) {
      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
    } else if (!strcmp((const char *)ret_info, "read_only")) {
      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_ONLY;
    } else if (!strcmp((const char *)ret_info, "read_write")) {
      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_READ_WRITE;
    } else {
      *(cl_kernel_arg_access_qualifier *)param_value = CL_KERNEL_ARG_ACCESS_NONE;
    }
    return CL_SUCCESS;

  case CL_KERNEL_ARG_TYPE_NAME:
  case CL_KERNEL_ARG_NAME:
    if (ret_info == NULL)
      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
    str_len = strlen((const char *)ret_info);
    if (param_value_size_ret)
      *param_value_size_ret = str_len + 1;
    if (!param_value)
      return CL_SUCCESS;
    if (param_value_size < str_len + 1)
      return CL_INVALID_VALUE;
    memcpy(param_value, ret_info, str_len);
    ((char *)param_value)[str_len] = 0;
    return CL_SUCCESS;

  case CL_KERNEL_ARG_TYPE_QUALIFIER:
    if (param_value_size_ret)
      *param_value_size_ret = sizeof(cl_kernel_arg_type_qualifier);
    if (!param_value)
      return CL_SUCCESS;
    if (param_value_size < sizeof(cl_kernel_arg_type_qualifier))
      return CL_INVALID_VALUE;
    if (ret_info == NULL)
      return CL_KERNEL_ARG_INFO_NOT_AVAILABLE;
    /* "const" is only reported for pointer arguments. */
    if (strstr((const char *)ret_info, "const") &&
        (arg_type == GBE_ARG_GLOBAL_PTR ||
         arg_type == GBE_ARG_CONSTANT_PTR ||
         arg_type == GBE_ARG_LOCAL_PTR))
      type_qual = type_qual | CL_KERNEL_ARG_TYPE_CONST;
    if (strstr((const char *)ret_info, "volatile"))
      type_qual = type_qual | CL_KERNEL_ARG_TYPE_VOLATILE;
    if (strstr((const char *)ret_info, "restrict"))
      type_qual = type_qual | CL_KERNEL_ARG_TYPE_RESTRICT;
    /* NOTE(review): "pipe" replaces the qualifiers gathered above instead
     * of being OR-ed in -- kept as is, confirm this is intended. */
    if (strstr((const char *)ret_info, "pipe"))
      type_qual = CL_KERNEL_ARG_TYPE_PIPE;
    *(cl_kernel_arg_type_qualifier *)param_value = type_qual;
    return CL_SUCCESS;

  default:
    assert(0);
    /* Without assertions, do not report success for an unknown query. */
    return CL_INVALID_VALUE;
  }

  return CL_SUCCESS;
}

/* Return the SIMD width the compiler backend reports for kernel k. */
LOCAL uint32_t
cl_kernel_get_simd_width(cl_kernel k)
{
  assert(k != NULL);
  return interp_kernel_get_simd_width(k->opaque);
}

/*
 * Bind kernel k to the compiled kernel `opaque`: upload its code into a
 * freshly allocated bo and cache argument count, curbe size, sampler data,
 * compile-time work-group size, stack size and image data.
 *
 * When the images array cannot be allocated, the code bo is released and
 * k->bo is left NULL.
 */
LOCAL void
cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
{
  cl_context ctx = k->program->ctx;
  cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);

  if (k->bo != NULL)
    cl_buffer_unreference(k->bo);

  /* Allocate the gen code here */
  const uint32_t code_sz = interp_kernel_get_code_size(opaque);
  const char *code = interp_kernel_get_code(opaque);
  k->bo = cl_buffer_alloc(bufmgr, "CL kernel", code_sz, 64u);
  k->arg_n = interp_kernel_get_arg_num(opaque);

  /* Upload the code (skipped when the allocation failed; k->bo stays NULL,
   * the same end state as the error path below) */
  if (k->bo != NULL)
    cl_buffer_subdata(k->bo, 0, code_sz, code);
  k->opaque = opaque;

  /* sizeof() counts the terminating NUL, so this is an exact-name match. */
  const char *kname = cl_kernel_get_name(k);
  if (kname != NULL &&
      strncmp(kname, "block_motion_estimate_intel",
              sizeof("block_motion_estimate_intel")) == 0)
    k->vme = 1;
  else
    k->vme = 0;

  /* Create the curbe */
  k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);

  /* Get sampler data & size */
  k->sampler_sz = interp_kernel_get_sampler_size(k->opaque);
  assert(k->sampler_sz <= GEN_MAX_SAMPLERS);
  if (k->sampler_sz > 0)
    interp_kernel_get_sampler_data(k->opaque, k->samplers);
  interp_kernel_get_compile_wg_size(k->opaque, k->compile_wg_sz);
  k->stack_size = interp_kernel_get_stack_size(k->opaque);

  /* Get image data & size */
  k->image_sz = interp_kernel_get_image_size(k->opaque);
  /* The image count (not the sampler count, already checked above) is what
   * has to fit the surface limit. */
  assert(k->image_sz <= GEN_MAX_SURFACES);
  assert(k->image_sz <= ctx->devices[0]->max_read_image_args +
                        ctx->devices[0]->max_write_image_args);
  if (k->image_sz > 0) {
    TRY_ALLOC_NO_ERR(k->images, cl_calloc(k->image_sz, sizeof(k->images[0])));
    interp_kernel_get_image_data(k->opaque, k->images);
  } else
    k->images = NULL;
  return;

error:
  if (k->bo != NULL)
    cl_buffer_unreference(k->bo);
  k->bo = NULL;
}

/*
 * Create a new kernel sharing the code bo, backend kernel and program of
 * `from`, with its own images/exec-info copies and freshly zeroed argument
 * and curbe storage (argument values are not copied).
 *
 * The copy holds one reference on the bo and one on the program. Returns
 * NULL when `from` is NULL or an allocation fails.
 */
LOCAL cl_kernel
cl_kernel_dup(cl_kernel from)
{
  cl_kernel to = NULL;

  if (UNLIKELY(from == NULL))
    return NULL;
  TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
  CL_OBJECT_INIT_BASE(to, CL_OBJECT_KERNEL_MAGIC);

  /* Retain the bo as soon as it is stored: every error path below ends in
   * cl_kernel_delete(to), which drops one reference on to->bo. */
  to->bo = from->bo;
  if (to->bo)
    cl_buffer_reference(to->bo);

  to->opaque = from->opaque;
  to->vme = from->vme;
  to->program = from->program;
  to->arg_n = from->arg_n;
  to->curbe_sz = from->curbe_sz;
  to->sampler_sz = from->sampler_sz;
  to->image_sz = from->image_sz;
  to->exec_info_n = from->exec_info_n;
  memcpy(to->compile_wg_sz, from->compile_wg_sz, sizeof(from->compile_wg_sz));
  to->stack_size = from->stack_size;
  if (to->sampler_sz)
    memcpy(to->samplers, from->samplers, to->sampler_sz * sizeof(uint32_t));
  if (to->image_sz) {
    TRY_ALLOC_NO_ERR(to->images, cl_calloc(to->image_sz, sizeof(to->images[0])));
    memcpy(to->images, from->images, to->image_sz * sizeof(to->images[0]));
  } else
    to->images = NULL;
  if (to->exec_info_n) { /* Must always 0 here */
    TRY_ALLOC_NO_ERR(to->exec_info, cl_calloc(to->exec_info_n, sizeof(void *)));
    memcpy(to->exec_info, from->exec_info, to->exec_info_n * sizeof(void *));
  }
  TRY_ALLOC_NO_ERR(to->args, cl_calloc(to->arg_n, sizeof(cl_argument)));
  if (to->curbe_sz)
    TRY_ALLOC_NO_ERR(to->curbe, cl_calloc(1, to->curbe_sz));

  /* We retain the program destruction since this kernel (user allocated)
   * depends on the program for some of its pointers */
  assert(from->program);
  cl_program_add_ref(from->program);
  to->ref_its_program = CL_TRUE;

exit:
  return to;
error:
  cl_kernel_delete(to);
  to = NULL;
  goto exit;
}

/*
 * Check a local work size against kernel `ker` and compute the work-group
 * size (product of the first wk_dim entries of local_wk_sz).
 *
 * Returns CL_INVALID_WORK_ITEM_SIZE when a dimension differs from a
 * non-zero required work-group size reported by the backend, or when the
 * product exceeds cl_get_kernel_max_wg_sz(ker); CL_SUCCESS otherwise.
 * *wk_grp_sz (if non-NULL) receives the product, or 0 when the per-dimension
 * check failed before it was computed.
 * local_wk_sz[0] is read unconditionally once the per-dimension check passes.
 */
LOCAL cl_int
cl_kernel_work_group_sz(cl_kernel ker,
                        const size_t *local_wk_sz,
                        uint32_t wk_dim,
                        size_t *wk_grp_sz)
{
  cl_int err = CL_SUCCESS;
  size_t sz = 0;
  cl_uint i;

  for (i = 0; i < wk_dim; ++i) {
    const uint32_t required_sz =
      interp_kernel_get_required_work_group_size(ker->opaque, i);
    if (required_sz != 0 && required_sz != local_wk_sz[i]) {
      err = CL_INVALID_WORK_ITEM_SIZE;
      goto error;
    }
  }

  sz = local_wk_sz[0];
  for (i = 1; i < wk_dim; ++i)
    sz *= local_wk_sz[i];

  if (sz > cl_get_kernel_max_wg_sz(ker)) {
    err = CL_INVALID_WORK_ITEM_SIZE;
    goto error;
  }

  /* Reached on success as well: the size is always reported back. */
error:
  if (wk_grp_sz)
    *wk_grp_sz = sz;
  return err;
}