/* * Copyright © 2012 Intel Corporation * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see . * */ #include "cl_mem.h" #include "cl_kernel.h" #include "cl_enqueue.h" #include "cl_command_queue.h" #include "cl_event.h" #include "cl_context.h" #include "cl_program.h" #include "cl_alloc.h" #include "CL/cl.h" #include #include cl_int clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret) { const void *src_ptr = NULL; size_t src_size = 0; const char *str = NULL; cl_int ref; cl_uint n; if (!CL_OBJECT_IS_KERNEL(kernel)) { return CL_INVALID_KERNEL; } if (param_name == CL_KERNEL_CONTEXT) { src_ptr = &kernel->program->ctx; src_size = sizeof(cl_context); } else if (param_name == CL_KERNEL_PROGRAM) { src_ptr = &kernel->program; src_size = sizeof(cl_program); } else if (param_name == CL_KERNEL_NUM_ARGS) { n = kernel->arg_n; src_ptr = &n; src_size = sizeof(cl_uint); } else if (param_name == CL_KERNEL_REFERENCE_COUNT) { ref = CL_OBJECT_GET_REF(kernel); src_ptr = &ref; src_size = sizeof(cl_int); } else if (param_name == CL_KERNEL_FUNCTION_NAME) { str = cl_kernel_get_name(kernel); src_ptr = str; src_size = strlen(str) + 1; } else if (param_name == CL_KERNEL_ATTRIBUTES) { str = cl_kernel_get_attributes(kernel); src_ptr = str; src_size = strlen(str) + 1; } else { return CL_INVALID_VALUE; } return cl_get_info_helper(src_ptr, src_size, param_value, param_value_size, param_value_size_ret); } cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { size_t fixed_global_off[] = {0, 0, 0}; size_t fixed_global_sz[] = {1, 1, 1}; size_t fixed_local_sz[] = {1, 1, 1}; cl_int err = CL_SUCCESS; cl_uint i; cl_event e = NULL; cl_int event_status; do { if (!CL_OBJECT_IS_COMMAND_QUEUE(command_queue)) { err = CL_INVALID_COMMAND_QUEUE; break; } if (!CL_OBJECT_IS_KERNEL(kernel)) { err = CL_INVALID_KERNEL; break; } /* Check number of dimensions we have */ if (UNLIKELY(work_dim == 0 || work_dim > 3)) { err = CL_INVALID_WORK_DIMENSION; break; } /* We need a work size per dimension */ if (UNLIKELY(global_work_size == NULL)) { err = CL_INVALID_GLOBAL_WORK_SIZE; break; } if (kernel->vme) { if (work_dim != 2) { err = CL_INVALID_WORK_DIMENSION; break; } if (local_work_size != NULL) { err = CL_INVALID_WORK_GROUP_SIZE; break; } } if (global_work_offset != NULL) { for (i = 0; i < work_dim; ++i) { if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) { err = CL_INVALID_GLOBAL_OFFSET; break; } } } /* Queue and kernel must share the same context */ assert(kernel->program); if (command_queue->ctx != kernel->program->ctx) { err = CL_INVALID_CONTEXT; break; } if (local_work_size != NULL) { for (i = 0; i < work_dim; ++i) fixed_local_sz[i] = local_work_size[i]; } else { if (kernel->vme) { fixed_local_sz[0] = 16; fixed_local_sz[1] = 1; } else { uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large size_t realGroupSize = 1; for (i = 0; i < work_dim; i++) { for (j = maxDimSize; j > 1; j--) { if (global_work_size[i] % j == 0 && j <= maxGroupSize) { fixed_local_sz[i] = j; maxGroupSize = maxGroupSize / j; maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize; break; //choose next work_dim } } realGroupSize *= fixed_local_sz[i]; } //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop: //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel //it triggers the following message for many times. //to avoid too many messages, only print it for the first time of the process. //just use static variable since it doesn't matter to print a few times at multi-thread case. static int warn_no_good_localsize = 1; if (realGroupSize % 8 != 0 && warn_no_good_localsize) { warn_no_good_localsize = 0; DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n" " local_work_size[] explicitly, you can find good values with\n" " trial-and-error method."); } } } if (kernel->vme) { fixed_global_sz[0] = (global_work_size[0] + 15) / 16 * 16; fixed_global_sz[1] = (global_work_size[1] + 15) / 16; } else { for (i = 0; i < work_dim; ++i) fixed_global_sz[i] = global_work_size[i]; } if (global_work_offset != NULL) for (i = 0; i < work_dim; ++i) fixed_global_off[i] = global_work_offset[i]; if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) { if (fixed_local_sz[0] != kernel->compile_wg_sz[0] || fixed_local_sz[1] != kernel->compile_wg_sz[1] || fixed_local_sz[2] != kernel->compile_wg_sz[2]) { err = CL_INVALID_WORK_GROUP_SIZE; break; } } err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list, event, command_queue->ctx); if (err != CL_SUCCESS) { break; } int i, j, k; const size_t global_wk_sz_div[3] = { fixed_global_sz[0] / fixed_local_sz[0] * fixed_local_sz[0], fixed_global_sz[1] / fixed_local_sz[1] * fixed_local_sz[1], fixed_global_sz[2] / fixed_local_sz[2] * fixed_local_sz[2]}; const size_t global_wk_sz_rem[3] = { fixed_global_sz[0] % fixed_local_sz[0], fixed_global_sz[1] % fixed_local_sz[1], fixed_global_sz[2] % fixed_local_sz[2]}; cl_uint count; count = global_wk_sz_rem[0] ? 2 : 1; count *= global_wk_sz_rem[1] ? 2 : 1; count *= global_wk_sz_rem[2] ? 2 : 1; const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem}; /* Go through the at most 8 cases and euque if there is work items left */ for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { for (k = 0; k < 2; k++) { size_t global_wk_sz_use[3] = {global_wk_all[k][0], global_wk_all[j][1], global_wk_all[i][2]}; size_t global_dim_off[3] = { k * global_wk_sz_div[0] / fixed_local_sz[0], j * global_wk_sz_div[1] / fixed_local_sz[1], i * global_wk_sz_div[2] / fixed_local_sz[2]}; size_t local_wk_sz_use[3] = { k ? global_wk_sz_rem[0] : fixed_local_sz[0], j ? global_wk_sz_rem[1] : fixed_local_sz[1], i ? global_wk_sz_rem[2] : fixed_local_sz[2]}; if (local_wk_sz_use[0] == 0 || local_wk_sz_use[1] == 0 || local_wk_sz_use[2] == 0) continue; e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list, event_wait_list, CL_COMMAND_NDRANGE_KERNEL, &err); if (err != CL_SUCCESS) { break; } /* Do device specific checks are enqueue the kernel */ err = cl_command_queue_ND_range(command_queue, kernel, e, work_dim, fixed_global_off, global_dim_off, fixed_global_sz, global_wk_sz_use, fixed_local_sz, local_wk_sz_use); if (err != CL_SUCCESS) { break; } e->exec_data.mid_event_of_enq = (count > 1); count--; /* We will flush the ndrange if no event depend. Else we will add it to queue list. The finish or Complete status will always be done in queue list. */ event_status = cl_event_is_ready(e); if (event_status < CL_COMPLETE) { // Error happend, cancel. err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; break; } err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE); if (err != CL_SUCCESS) { break; } cl_command_queue_enqueue_event(command_queue, e); if (e->exec_data.mid_event_of_enq) cl_event_delete(e); } if (err != CL_SUCCESS) { break; } } if (err != CL_SUCCESS) { break; } } } while (0); if (err == CL_SUCCESS && event) { *event = e; } else { cl_event_delete(e); } return err; } cl_int clEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { const size_t global_size[3] = {1, 0, 0}; const size_t local_size[3] = {1, 0, 0}; return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size, num_events_in_wait_list, event_wait_list, event); } cl_int clEnqueueNativeKernel(cl_command_queue command_queue, void (*user_func)(void *), void *args, size_t cb_args, cl_uint num_mem_objects, const cl_mem *mem_list, const void **args_mem_loc, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { cl_int err = CL_SUCCESS; void *new_args = NULL; void **new_args_mem_loc = NULL; cl_mem *new_mem_list = NULL; cl_int i; cl_int e_status; cl_event e = NULL; enqueue_data *data = NULL; do { if (user_func == NULL || (args == NULL && cb_args > 0) || (args == NULL && num_mem_objects > 0) || (args != NULL && cb_args == 0) || (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) || (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) { err = CL_INVALID_VALUE; break; } //Per spec, need copy args if (cb_args) { new_args = cl_malloc(cb_args); if (num_mem_objects) { new_args_mem_loc = cl_malloc(sizeof(void *) * num_mem_objects); new_mem_list = cl_malloc(sizeof(cl_mem) * num_mem_objects); memcpy(new_mem_list, mem_list, sizeof(cl_mem) * num_mem_objects); } if (new_args == NULL || new_args_mem_loc == NULL) { err = CL_OUT_OF_HOST_MEMORY; break; } memcpy(new_args, args, cb_args); for (i = 0; i < num_mem_objects; ++i) { if (!CL_OBJECT_IS_MEM(mem_list[i])) { err = CL_INVALID_MEM_OBJECT; break; } new_args_mem_loc[i] = new_args + (args_mem_loc[i] - args); //change to new args } } err = cl_event_check_waitlist(num_events_in_wait_list, event_wait_list, event, command_queue->ctx); if (err != CL_SUCCESS) { break; } e = cl_event_create(command_queue->ctx, command_queue, num_events_in_wait_list, event_wait_list, CL_COMMAND_NATIVE_KERNEL, &err); if (err != CL_SUCCESS) { break; } e_status = cl_event_is_ready(e); if (e_status < CL_COMPLETE) { err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; break; } data = &e->exec_data; data->type = EnqueueNativeKernel; data->mem_list = new_mem_list; data->ptr = new_args; data->size = cb_args; data->offset = (size_t)num_mem_objects; data->const_ptr = new_args_mem_loc; data->user_func = user_func; new_args = NULL; new_mem_list = NULL; new_args_mem_loc = NULL; // Event delete will free them. err = cl_event_exec(e, (e_status == CL_COMPLETE ? CL_COMPLETE : CL_QUEUED), CL_FALSE); if (err != CL_SUCCESS) { break; } if (e_status != CL_COMPLETE) cl_command_queue_enqueue_event(command_queue, e); } while (0); if (err != CL_SUCCESS) { if (new_args) cl_free(new_args); if (new_mem_list) cl_free(new_mem_list); if (new_args_mem_loc) cl_free(new_args_mem_loc); } if (err == CL_SUCCESS && event) { *event = e; } else { cl_event_delete(e); } return err; }