author     Junyan He <junyan.he@intel.com>     2016-09-26 16:00:07 +0800
committer  Yang Rong <rong.r.yang@intel.com>   2016-09-28 15:59:47 +0800
commit     4a61637a8dbb1d7ddea131a059afd02b33df4ce0
tree       705daadeb402fa24e859d0b0c5523c123cb44536
parent     adb62811ea72bad4018b8e0af2cedca513a9eea4
Modify all event-related functions to use the new event handle.
Rewrite cl_event, and modify all the event functions to use this
new event mechanism. Events now cooperate with the command
queue's worker thread.
v2:
Fix a logic error in the event-creation failure path.
v3:
Make the default enqueue action a no-op, to handle enqueue
commands that have nothing to do.
Signed-off-by: Junyan He <junyan.he@intel.com>
Reviewed-by: Yang Rong <rong.r.yang@intel.com>
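The new model itself is not visible on this page (the added cl_api_*.c and cl_command_queue_enqueue.c files are not shown), but the removed handle_events() helper below captures the decision it replaces: run a command immediately when its wait list is already satisfied, otherwise hand it to the command queue's thread. A minimal, self-contained sketch of that decision, with toy types standing in for cl_event and enqueue_data (illustrative names only, not beignet's API):

/* sketch.c -- shape of the immediate-vs-deferred dispatch; mirrors the
 * removed handle_events() in src/cl_api.c, not the new implementation. */
#include <stdio.h>

enum exec_mode { EXECUTE_IMM, EXECUTE_DEFER };

struct toy_event { int complete; };     /* stand-in for cl_event     */
struct toy_cmd   { const char *name; }; /* stand-in for enqueue_data */

/* If every event in the wait list is complete the caller may run the
 * command now; otherwise it must be deferred to the queue thread. */
static enum exec_mode
wait_or_defer(int num, struct toy_event **wait_list)
{
  for (int i = 0; i < num; i++)
    if (!wait_list[i]->complete)
      return EXECUTE_DEFER;
  return EXECUTE_IMM;
}

static void run_now(struct toy_cmd *c)  { printf("run %s now\n", c->name); }
static void hand_off(struct toy_cmd *c) { printf("defer %s to queue thread\n", c->name); }

int main(void)
{
  struct toy_event done = { 1 }, pending = { 0 };
  struct toy_event *wl1[] = { &done };
  struct toy_event *wl2[] = { &done, &pending };
  struct toy_cmd read_buf = { "ReadBuffer" };
  struct toy_cmd copy_buf = { "CopyBuffer" };

  if (wait_or_defer(1, wl1) == EXECUTE_IMM)
    run_now(&read_buf);                 /* wait list satisfied */
  if (wait_or_defer(2, wl2) == EXECUTE_DEFER)
    hand_off(&copy_buf);                /* pending event: defer */
  return 0;
}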
-rw-r--r--  src/CMakeLists.txt             |    5
-rw-r--r--  src/cl_api.c                   | 1888
-rw-r--r--  src/cl_api_kernel.c            |   27
-rw-r--r--  src/cl_command_queue.c         |   98
-rw-r--r--  src/cl_command_queue.h         |    7
-rw-r--r--  src/cl_command_queue_enqueue.c |    8
-rw-r--r--  src/cl_command_queue_gen7.c    |   21
-rw-r--r--  src/cl_enqueue.c               |  502
-rw-r--r--  src/cl_enqueue.h               |   44
-rw-r--r--  src/cl_event.c                 | 1067
-rw-r--r--  src/cl_event.h                 |  146
-rw-r--r--  src/cl_mem.c                   |  118
-rw-r--r--  src/cl_mem.h                   |   29
13 files changed, 1074 insertions, 2886 deletions
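The rect-transfer entry points removed below (clEnqueueReadBufferRect, clEnqueueWriteBufferRect, clEnqueueCopyBufferRect) all repeat the same end-of-region bounds check before enqueueing. Extracted as a standalone, compilable helper (our names, not beignet's):

/* bounds.c -- the end-of-region check the rect paths below perform:
 * the byte offset just past the last byte the (origin, region)
 * rectangle touches must not exceed the buffer size. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static bool
rect_fits(const size_t origin[3], const size_t region[3],
          size_t row_pitch, size_t slice_pitch, size_t buf_size)
{
  size_t end = (origin[2] + region[2] - 1) * slice_pitch
             + (origin[1] + region[1] - 1) * row_pitch
             + origin[0] + region[0];
  return end <= buf_size;
}

int main(void)
{
  size_t origin[3] = { 0, 0, 0 };
  size_t region[3] = { 16, 4, 2 };  /* 16-byte rows, 4 rows, 2 slices */
  /* Default pitches, as computed in the removed code when 0 is passed:
   * row_pitch = region[0], slice_pitch = region[1] * row_pitch. */
  size_t row = 16, slice = 4 * 16;
  printf("fits in 128 bytes: %d\n", rect_fits(origin, region, row, slice, 128));
  printf("fits in 127 bytes: %d\n", rect_fits(origin, region, row, slice, 127));
  return 0;
}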
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 009d0576..626b43f2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -68,6 +68,10 @@ set(OPENCL_SRC ${KERNEL_STR_FILES} cl_base_object.c cl_api.c + cl_api_mem.c + cl_api_kernel.c + cl_api_command_queue.c + cl_api_event.c cl_alloc.c cl_kernel.c cl_program.c @@ -85,6 +89,7 @@ set(OPENCL_SRC cl_command_queue.c cl_command_queue.h cl_command_queue_gen7.c + cl_command_queue_enqueue.c cl_thread.c cl_driver.h cl_driver.cpp diff --git a/src/cl_api.c b/src/cl_api.c index a2fee158..f8c48de6 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -67,92 +67,6 @@ typedef intptr_t cl_device_partition_property; return RET; \ } while(0) -static inline cl_int -handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list, - cl_event* event, enqueue_data* data, cl_command_type type) -{ - cl_int status = cl_event_wait_events(num, wait_list, queue); - cl_event e = NULL; - if(event != NULL || status == CL_ENQUEUE_EXECUTE_DEFER) { - e = cl_event_new(queue->ctx, queue, type, event!=NULL); - - /* if need profiling, add the submit timestamp here. */ - if (e->type != CL_COMMAND_USER && - e->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(e, CL_PROFILING_COMMAND_QUEUED); - cl_event_get_queued_cpu_timestamp(e); - } - - if(event != NULL) - *event = e; - if(status == CL_ENQUEUE_EXECUTE_DEFER) { - cl_event_new_enqueue_callback(e, data, num, wait_list); - } - } - set_current_event(queue, e); - return status; -} - -/* The following code checking overlap is from Appendix of openCL spec 1.1 */ -cl_bool check_copy_overlap(const size_t src_offset[3], - const size_t dst_offset[3], - const size_t region[3], - size_t row_pitch, size_t slice_pitch) -{ - const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]}; - const size_t src_max[] = {src_offset[0] + region[0], - src_offset[1] + region[1], - src_offset[2] + region[2]}; - const size_t dst_min[] = {dst_offset[0], dst_offset[1], dst_offset[2]}; - const size_t dst_max[] = {dst_offset[0] + region[0], - dst_offset[1] + region[1], - dst_offset[2] + region[2]}; - // Check for overlap - cl_bool overlap = CL_TRUE; - unsigned i; - size_t dst_start = dst_offset[2] * slice_pitch + - dst_offset[1] * row_pitch + dst_offset[0]; - size_t dst_end = dst_start + (region[2] * slice_pitch + - region[1] * row_pitch + region[0]); - size_t src_start = src_offset[2] * slice_pitch + - src_offset[1] * row_pitch + src_offset[0]; - size_t src_end = src_start + (region[2] * slice_pitch + - region[1] * row_pitch + region[0]); - - for (i=0; i != 3; ++i) { - overlap = overlap && (src_min[i] < dst_max[i]) - && (src_max[i] > dst_min[i]); - } - - if (!overlap) { - size_t delta_src_x = (src_offset[0] + region[0] > row_pitch) ? - src_offset[0] + region[0] - row_pitch : 0; - size_t delta_dst_x = (dst_offset[0] + region[0] > row_pitch) ? - dst_offset[0] + region[0] - row_pitch : 0; - if ( (delta_src_x > 0 && delta_src_x > dst_offset[0]) || - (delta_dst_x > 0 && delta_dst_x > src_offset[0]) ) { - if ( (src_start <= dst_start && dst_start < src_end) || - (dst_start <= src_start && src_start < dst_end) ) - overlap = CL_TRUE; - } - if (region[2] > 1) { - size_t src_height = slice_pitch / row_pitch; - size_t dst_height = slice_pitch / row_pitch; - size_t delta_src_y = (src_offset[1] + region[1] > src_height) ? - src_offset[1] + region[1] - src_height : 0; - size_t delta_dst_y = (dst_offset[1] + region[1] > dst_height) ? 
- dst_offset[1] + region[1] - dst_height : 0; - if ( (delta_src_y > 0 && delta_src_y > dst_offset[1]) || - (delta_dst_y > 0 && delta_dst_y > src_offset[1]) ) { - if ( (src_start <= dst_start && dst_start < src_end) || - (dst_start <= src_start && src_start < dst_end) ) - overlap = CL_TRUE; - } - } - } - return overlap; -} - static cl_int cl_check_device_type(cl_device_type device_type) { @@ -448,16 +362,6 @@ error: } cl_int -clReleaseCommandQueue(cl_command_queue command_queue) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE (command_queue); - cl_command_queue_delete(command_queue); -error: - return err; -} - -cl_int clGetCommandQueueInfo(cl_command_queue command_queue, cl_command_queue_info param_name, size_t param_value_size, @@ -1369,26 +1273,6 @@ clGetKernelSubGroupInfoKHR(cl_kernel kernel, } cl_int -clWaitForEvents(cl_uint num_events, - const cl_event * event_list) -{ - cl_int err = CL_SUCCESS; - cl_context ctx = NULL; - - if(num_events > 0 && event_list) - ctx = event_list[0]->ctx; - - TRY(cl_event_check_waitlist, num_events, event_list, NULL, ctx); - - while(cl_event_wait_events(num_events, event_list, NULL) == CL_ENQUEUE_EXECUTE_DEFER) { - usleep(8000); //sleep 8ms to wait other thread - } - -error: - return err; -} - -cl_int clGetEventInfo(cl_event event, cl_event_info param_name, size_t param_value_size, @@ -1403,9 +1287,9 @@ clGetEventInfo(cl_event event, } else if (param_name == CL_EVENT_CONTEXT) { FILL_GETINFO_RET (cl_context, 1, &event->ctx, CL_SUCCESS); } else if (param_name == CL_EVENT_COMMAND_TYPE) { - FILL_GETINFO_RET (cl_command_type, 1, &event->type, CL_SUCCESS); + FILL_GETINFO_RET (cl_command_type, 1, &event->event_type, CL_SUCCESS); } else if (param_name == CL_EVENT_COMMAND_EXECUTION_STATUS) { - cl_event_update_status(event, 0); + cl_event_get_status(event); FILL_GETINFO_RET (cl_int, 1, &event->status, CL_SUCCESS); } else if (param_name == CL_EVENT_REFERENCE_COUNT) { cl_uint ref = CL_OBJECT_GET_REF(event); @@ -1419,22 +1303,6 @@ error: } -cl_event -clCreateUserEvent(cl_context context, - cl_int * errcode_ret) -{ - cl_int err = CL_SUCCESS; - cl_event event = NULL; - CHECK_CONTEXT(context); - - TRY_ALLOC(event, cl_event_new(context, NULL, CL_COMMAND_USER, CL_TRUE)); - -error: - if(errcode_ret) - *errcode_ret = err; - return event; -} - cl_int clRetainEvent(cl_event event) { @@ -1459,48 +1327,6 @@ error: return err; } -cl_int -clSetUserEventStatus(cl_event event, - cl_int execution_status) -{ - cl_int err = CL_SUCCESS; - - CHECK_EVENT(event); - if(execution_status > CL_COMPLETE) { - err = CL_INVALID_VALUE; - goto error; - } - if(event->status != CL_SUBMITTED) { - err = CL_INVALID_OPERATION; - goto error; - } - - cl_event_set_status(event, execution_status); -error: - return err; -} - -cl_int -clSetEventCallback(cl_event event, - cl_int command_exec_callback_type, - void (CL_CALLBACK * pfn_notify) (cl_event, cl_int, void *), - void * user_data) -{ - cl_int err = CL_SUCCESS; - - CHECK_EVENT(event); - if((pfn_notify == NULL) || - (command_exec_callback_type > CL_SUBMITTED) || - (command_exec_callback_type < CL_COMPLETE)) { - err = CL_INVALID_VALUE; - goto error; - } - err = cl_event_set_callback(event, command_exec_callback_type, pfn_notify, user_data); - -error: - return err; - -} cl_int clGetEventProfilingInfo(cl_event event, @@ -1513,9 +1339,9 @@ clGetEventProfilingInfo(cl_event event, cl_ulong ret_val; CHECK_EVENT(event); - cl_event_update_status(event, 0); + //cl_event_update_status(event, 0); - if (event->type == CL_COMMAND_USER || + if (event->event_type == 
CL_COMMAND_USER || !(event->queue->props & CL_QUEUE_PROFILING_ENABLE) || event->status != CL_COMPLETE) { err = CL_PROFILING_INFO_NOT_AVAILABLE; @@ -1552,1712 +1378,6 @@ error: return err; } -cl_int -clFlush(cl_command_queue command_queue) -{ - /* have nothing to do now, as currently - * clEnqueueNDRangeKernel will flush at - * the end of each calling. we may need - * to optimize it latter.*/ - return 0; -} - -cl_int -clFinish(cl_command_queue command_queue) -{ - cl_int err = CL_SUCCESS; - - CHECK_QUEUE (command_queue); - -#ifdef HAS_CMRT - if (command_queue->cmrt_event != NULL) - return cmrt_wait_for_task_finished(command_queue); -#endif - - err = cl_command_queue_finish(command_queue); - -error: - return err; -} - -cl_int -clEnqueueReadBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - size_t offset, - size_t size, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, defer_enqueue_data = { 0 }; - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!ptr || !size || offset + size > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &defer_enqueue_data; - data->type = EnqueueReadBuffer; - data->mem_obj = buffer; - data->ptr = ptr; - data->offset = offset; - data->size = size; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? 
*event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueReadBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_read, - const size_t * buffer_origin, - const size_t * host_origin, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) { - err = CL_INVALID_VALUE; - goto error; - } - - if(buffer_row_pitch == 0) - buffer_row_pitch = region[0]; - if(buffer_slice_pitch == 0) - buffer_slice_pitch = region[1] * buffer_row_pitch; - - if(host_row_pitch == 0) - host_row_pitch = region[0]; - if(host_slice_pitch == 0) - host_slice_pitch = region[1] * host_row_pitch; - - if (buffer_row_pitch < region[0] || - host_row_pitch < region[0]) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) || - (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch - + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch - + buffer_origin[0] + region[0] > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueReadBufferRect; - data->mem_obj = buffer; - data->ptr = ptr; - data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2]; - data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2]; - data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2]; - data->row_pitch = buffer_row_pitch; - data->slice_pitch = buffer_slice_pitch; - data->host_row_pitch = host_row_pitch; - data->host_slice_pitch = host_slice_pitch; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_READ_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? 
*event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - - error: - return err; -} - -cl_int -clEnqueueWriteBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - size_t offset, - size_t size, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!ptr || !size || offset + size > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueWriteBuffer; - data->mem_obj = buffer; - data->const_ptr = ptr; - data->offset = offset; - data->size = size; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_WRITE_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? *event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - - error: - return err; -} - -cl_int -clEnqueueWriteBufferRect(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_write, - const size_t * buffer_origin, - const size_t * host_origin, - const size_t * region, - size_t buffer_row_pitch, - size_t buffer_slice_pitch, - size_t host_row_pitch, - size_t host_slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - if (!ptr || !region || region[0] == 0 || region[1] == 0 || region[2] == 0) { - err = CL_INVALID_VALUE; - goto error; - } - - if(buffer_row_pitch == 0) - buffer_row_pitch = region[0]; - if(buffer_slice_pitch == 0) - buffer_slice_pitch = region[1] * buffer_row_pitch; - - if(host_row_pitch == 0) - host_row_pitch = region[0]; - if(host_slice_pitch == 0) - host_slice_pitch = region[1] * host_row_pitch; - - if (buffer_row_pitch < region[0] || - host_row_pitch < region[0]) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((buffer_slice_pitch < region[1] * buffer_row_pitch || buffer_slice_pitch % buffer_row_pitch != 0 ) || - (host_slice_pitch < region[1] * host_row_pitch || host_slice_pitch % host_row_pitch != 0 )) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((buffer_origin[2] + region[2] - 1) * buffer_slice_pitch - + (buffer_origin[1] + region[1] - 1) * buffer_row_pitch - + buffer_origin[0] + region[0] > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueWriteBufferRect; - data->mem_obj = buffer; - data->const_ptr = ptr; - data->origin[0] = buffer_origin[0]; data->origin[1] = buffer_origin[1]; data->origin[2] = buffer_origin[2]; - data->host_origin[0] = host_origin[0]; data->host_origin[1] = host_origin[1]; data->host_origin[2] = host_origin[2]; - data->region[0] = 
region[0]; data->region[1] = region[1]; data->region[2] = region[2]; - data->row_pitch = buffer_row_pitch; - data->slice_pitch = buffer_slice_pitch; - data->host_row_pitch = host_row_pitch; - data->host_slice_pitch = host_slice_pitch; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_WRITE_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? *event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueFillImage(cl_command_queue command_queue, - cl_mem image, - const void * fill_color, - const size_t * porigin, - const size_t * pregion, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(image, src_image); - FIXUP_IMAGE_REGION(src_image, pregion, region); - FIXUP_IMAGE_ORIGIN(src_image, porigin, origin); - - if (command_queue->ctx != image->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (fill_color == NULL) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!origin || !region || origin[0] + region[0] > src_image->w || origin[1] + region[1] > src_image->h || origin[2] + region[2] > src_image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (origin[2] != 0 || region[2] != 1)){ - err = CL_INVALID_VALUE; - goto error; - } - - if (src_image->image_type == CL_MEM_OBJECT_IMAGE1D && (origin[2] != 0 ||origin[1] != 0 || region[2] != 1 || region[1] != 1)){ - err = CL_INVALID_VALUE; - goto error; - } - - err = cl_image_fill(command_queue, fill_color, src_image, origin, region); - if (err) { - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, image->ctx); - - data = &no_wait_data; - data->type = EnqueueFillImage; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_fill_image", "", command_queue); - - return 0; - - error: - return err; -} - -cl_int -clEnqueueFillBuffer(cl_command_queue command_queue, - cl_mem buffer, - const void * pattern, - size_t pattern_size, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - static size_t valid_sz[] = {1, 2, 4, 8, 16, 32, 64, 128}; - int i = 0; - - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (offset + size > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (pattern == NULL) { - err = CL_INVALID_VALUE; - goto error; - } - - for (i = 0; i < sizeof(valid_sz) / sizeof(size_t); i++) { - if (valid_sz[i] == pattern_size) - break; - } - if (i == sizeof(valid_sz) / sizeof(size_t)) { - err = CL_INVALID_VALUE; - goto error; - } - - if (offset % pattern_size || size % pattern_size) { - err = CL_INVALID_VALUE; - goto error; - } - - err = 
cl_mem_fill(command_queue, pattern, pattern_size, buffer, offset, size); - if (err) { - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueFillBuffer; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_FILL_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_fill_buffer", "", command_queue); - - return 0; - - error: - return err; -} - -cl_int -clEnqueueCopyBuffer(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - size_t src_offset, - size_t dst_offset, - size_t cb, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(src_buffer); - CHECK_MEM(dst_buffer); - - if (command_queue->ctx != src_buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (command_queue->ctx != dst_buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (src_offset + cb > src_buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - if (dst_offset + cb > dst_buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - /* Check overlap */ - if (src_buffer == dst_buffer - && (src_offset <= dst_offset && dst_offset <= src_offset + cb - 1) - && (dst_offset <= src_offset && src_offset <= dst_offset + cb - 1)) { - err = CL_MEM_COPY_OVERLAP; - goto error; - } - - /* Check sub overlap */ - if (src_buffer->type == CL_MEM_SUBBUFFER_TYPE && dst_buffer->type == CL_MEM_SUBBUFFER_TYPE ) { - struct _cl_mem_buffer* src_b = (struct _cl_mem_buffer*)src_buffer; - struct _cl_mem_buffer* dst_b = (struct _cl_mem_buffer*)dst_buffer; - size_t src_sub_offset = src_b->sub_offset; - size_t dst_sub_offset = dst_b->sub_offset; - - if ((src_offset + src_sub_offset <= dst_offset + dst_sub_offset - && dst_offset + dst_sub_offset <= src_offset + src_sub_offset + cb - 1) - && (dst_offset + dst_sub_offset <= src_offset + src_sub_offset - && src_offset + src_sub_offset <= dst_offset + dst_sub_offset + cb - 1)) { - err = CL_MEM_COPY_OVERLAP; - goto error; - } - } - - err = cl_mem_copy(command_queue, src_buffer, dst_buffer, src_offset, dst_offset, cb); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueCopyBuffer; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_COPY_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy", "", command_queue); - - return 0; - -error: - return err; -} - -cl_int -clEnqueueCopyBufferRect(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_buffer, - const size_t * src_origin, - const size_t * dst_origin, - const size_t * region, - size_t 
src_row_pitch, - size_t src_slice_pitch, - size_t dst_row_pitch, - size_t dst_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(src_buffer); - CHECK_MEM(dst_buffer); - - if ((command_queue->ctx != src_buffer->ctx) || - (command_queue->ctx != dst_buffer->ctx)) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!region || region[0] == 0 || region[1] == 0 || region[2] == 0) { - err = CL_INVALID_VALUE; - goto error; - } - - if(src_row_pitch == 0) - src_row_pitch = region[0]; - if(src_slice_pitch == 0) - src_slice_pitch = region[1] * src_row_pitch; - - if(dst_row_pitch == 0) - dst_row_pitch = region[0]; - if(dst_slice_pitch == 0) - dst_slice_pitch = region[1] * dst_row_pitch; - - if (src_row_pitch < region[0] || - dst_row_pitch < region[0]) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((src_slice_pitch < region[1] * src_row_pitch || src_slice_pitch % src_row_pitch != 0 ) || - (dst_slice_pitch < region[1] * dst_row_pitch || dst_slice_pitch % dst_row_pitch != 0 )) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((src_origin[2] + region[2] - 1) * src_slice_pitch - + (src_origin[1] + region[1] - 1) * src_row_pitch - + src_origin[0] + region[0] > src_buffer->size - ||(dst_origin[2] + region[2] - 1) * dst_slice_pitch - + (dst_origin[1] + region[1] - 1) * dst_row_pitch - + dst_origin[0] + region[0] > dst_buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (src_buffer == dst_buffer && (src_row_pitch != dst_row_pitch || src_slice_pitch != dst_slice_pitch)) { - err = CL_INVALID_VALUE; - goto error; - } - - if (src_buffer == dst_buffer && - check_copy_overlap(src_origin, dst_origin, region, src_row_pitch, src_slice_pitch)) { - err = CL_MEM_COPY_OVERLAP; - goto error; - } - - cl_mem_copy_buffer_rect(command_queue, src_buffer, dst_buffer, src_origin, dst_origin, region, - src_row_pitch, src_slice_pitch, dst_row_pitch, dst_slice_pitch); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueCopyBufferRect; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_COPY_BUFFER_RECT) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_rect", "", command_queue); - -error: - return err; -} - -cl_int -clEnqueueReadImage(cl_command_queue command_queue, - cl_mem mem, - cl_bool blocking_read, - const size_t * porigin, - const size_t * pregion, - size_t row_pitch, - size_t slice_pitch, - void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(mem, image); - FIXUP_IMAGE_REGION(image, pregion, region); - FIXUP_IMAGE_ORIGIN(image, porigin, origin); - if (command_queue->ctx != mem->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) { - err = 
CL_INVALID_VALUE; - goto error; - } - - if (!row_pitch) - row_pitch = image->bpp*region[0]; - else if (row_pitch < image->bpp*region[0]) { - err = CL_INVALID_VALUE; - goto error; - } - - if (image->slice_pitch) { - if (!slice_pitch) - slice_pitch = row_pitch*region[1]; - else if (slice_pitch < row_pitch*region[1]) { - err = CL_INVALID_VALUE; - goto error; - } - } - else if (slice_pitch) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!ptr) { - err = CL_INVALID_VALUE; - goto error; - } - - if (mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx); - - data = &no_wait_data; - data->type = EnqueueReadImage; - data->mem_obj = mem; - data->ptr = ptr; - data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2]; - data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2]; - data->row_pitch = row_pitch; - data->slice_pitch = slice_pitch; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_READ_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? *event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueWriteImage(cl_command_queue command_queue, - cl_mem mem, - cl_bool blocking_write, - const size_t * porigin, - const size_t * pregion, - size_t row_pitch, - size_t slice_pitch, - const void * ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(mem, image); - FIXUP_IMAGE_REGION(image, pregion, region); - FIXUP_IMAGE_ORIGIN(image, porigin, origin); - if (command_queue->ctx != mem->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!row_pitch) - row_pitch = image->bpp*region[0]; - else if (row_pitch < image->bpp*region[0]) { - err = CL_INVALID_VALUE; - goto error; - } - - if (image->slice_pitch) { - if (!slice_pitch) - slice_pitch = row_pitch*region[1]; - else if (slice_pitch < row_pitch*region[1]) { - err = CL_INVALID_VALUE; - goto error; - } - } - else if (slice_pitch) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!ptr) { - err = CL_INVALID_VALUE; - goto error; - } - - if (mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)) { - err = CL_INVALID_OPERATION; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx); - - data = &no_wait_data; - data->type = EnqueueWriteImage; - data->mem_obj = mem; - data->const_ptr = ptr; - data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2]; - data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2]; - data->row_pitch = row_pitch; - data->slice_pitch = slice_pitch; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_WRITE_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? 
*event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueCopyImage(cl_command_queue command_queue, - cl_mem src_mem, - cl_mem dst_mem, - const size_t * psrc_origin, - const size_t * pdst_origin, - const size_t * pregion, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - cl_bool overlap = CL_TRUE; - cl_int i = 0; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(src_mem, src_image); - CHECK_IMAGE(dst_mem, dst_image); - FIXUP_IMAGE_REGION(src_image, pregion, region); - FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin); - FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin); - if (command_queue->ctx != src_mem->ctx || - command_queue->ctx != dst_mem->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (src_image->fmt.image_channel_order != dst_image->fmt.image_channel_order || - src_image->fmt.image_channel_data_type != dst_image->fmt.image_channel_data_type) { - err = CL_IMAGE_FORMAT_MISMATCH; - goto error; - } - - if (!src_origin || !region || src_origin[0] + region[0] > src_image->w || - src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w || - dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) || - (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1))) { - err = CL_INVALID_VALUE; - goto error; - } - - if (src_image == dst_image) { - for(i = 0; i < 3; i++) - overlap = overlap && (src_origin[i] < dst_origin[i] + region[i]) - && (dst_origin[i] < src_origin[i] + region[i]); - if(overlap == CL_TRUE) { - err = CL_MEM_COPY_OVERLAP; - goto error; - } - } - - cl_mem_kernel_copy_image(command_queue, src_image, dst_image, src_origin, dst_origin, region); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx); - - data = &no_wait_data; - data->type = EnqueueCopyImage; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_COPY_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_kernel_copy_image", "", command_queue); - -error: - return err; -} - -cl_int -clEnqueueCopyImageToBuffer(cl_command_queue command_queue, - cl_mem src_mem, - cl_mem dst_buffer, - const size_t * psrc_origin, - const size_t * pregion, - size_t dst_offset, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(src_mem, src_image); - CHECK_MEM(dst_buffer); - FIXUP_IMAGE_REGION(src_image, pregion, region); - FIXUP_IMAGE_ORIGIN(src_image, psrc_origin, src_origin); - if (command_queue->ctx != src_mem->ctx || - command_queue->ctx != dst_buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - 
- if (dst_offset + region[0]*region[1]*region[2]*src_image->bpp > dst_buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!src_origin || !region || src_origin[0] + region[0] > src_image->w || - src_origin[1] + region[1] > src_image->h || src_origin[2] + region[2] > src_image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (src_image->image_type == CL_MEM_OBJECT_IMAGE2D && (src_origin[2] != 0 || region[2] != 1)) { - err = CL_INVALID_VALUE; - goto error; - } - - cl_mem_copy_image_to_buffer(command_queue, src_image, dst_buffer, src_origin, dst_offset, region); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, src_mem->ctx); - - data = &no_wait_data; - data->type = EnqueueCopyImageToBuffer; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_COPY_IMAGE_TO_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_image_to_buffer", "", command_queue); - -error: - return err; -} - -cl_int -clEnqueueCopyBufferToImage(cl_command_queue command_queue, - cl_mem src_buffer, - cl_mem dst_mem, - size_t src_offset, - const size_t * pdst_origin, - const size_t * pregion, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(src_buffer); - CHECK_IMAGE(dst_mem, dst_image); - FIXUP_IMAGE_REGION(dst_image, pregion, region); - FIXUP_IMAGE_ORIGIN(dst_image, pdst_origin, dst_origin); - if (command_queue->ctx != src_buffer->ctx || - command_queue->ctx != dst_mem->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (src_offset + region[0]*region[1]*region[2]*dst_image->bpp > src_buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!dst_origin || !region || dst_origin[0] + region[0] > dst_image->w || - dst_origin[1] + region[1] > dst_image->h || dst_origin[2] + region[2] > dst_image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (dst_image->image_type == CL_MEM_OBJECT_IMAGE2D && (dst_origin[2] != 0 || region[2] != 1)) { - err = CL_INVALID_VALUE; - goto error; - } - - cl_mem_copy_buffer_to_image(command_queue, src_buffer, dst_image, src_offset, dst_origin, region); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, dst_mem->ctx); - - data = &no_wait_data; - data->type = EnqueueCopyBufferToImage; - data->queue = command_queue; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_COPY_BUFFER_TO_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - - if(b_output_kernel_perf) - time_end(command_queue->ctx, "beignet internal kernel : cl_mem_copy_buffer_to_image", "", command_queue); - -error: - return err; -} - -static cl_int _cl_map_mem(cl_mem mem, void *ptr, void **mem_ptr, - size_t offset, size_t size, - const size_t *origin, const size_t *region) -{ - cl_int slot = -1; - int err = CL_SUCCESS; - 
size_t sub_offset = 0; - - if(mem->type == CL_MEM_SUBBUFFER_TYPE) { - struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; - sub_offset = buffer->sub_offset; - } - - ptr = (char*)ptr + offset + sub_offset; - if(mem->flags & CL_MEM_USE_HOST_PTR) { - assert(mem->host_ptr); - //only calc ptr here, will do memcpy in enqueue - *mem_ptr = (char *)mem->host_ptr + offset + sub_offset; - } else { - *mem_ptr = ptr; - } - /* Record the mapped address. */ - if (!mem->mapped_ptr_sz) { - mem->mapped_ptr_sz = 16; - mem->mapped_ptr = (cl_mapped_ptr *)malloc( - sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz); - if (!mem->mapped_ptr) { - cl_mem_unmap_auto(mem); - err = CL_OUT_OF_HOST_MEMORY; - goto error; - } - memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); - slot = 0; - } else { - int i = 0; - for (; i < mem->mapped_ptr_sz; i++) { - if (mem->mapped_ptr[i].ptr == NULL) { - slot = i; - break; - } - } - if (i == mem->mapped_ptr_sz) { - cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc( - sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2); - if (!new_ptr) { - cl_mem_unmap_auto(mem); - err = CL_OUT_OF_HOST_MEMORY; - goto error; - } - memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); - memcpy(new_ptr, mem->mapped_ptr, - mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); - slot = mem->mapped_ptr_sz; - mem->mapped_ptr_sz *= 2; - free(mem->mapped_ptr); - mem->mapped_ptr = new_ptr; - } - } - assert(slot != -1); - mem->mapped_ptr[slot].ptr = *mem_ptr; - mem->mapped_ptr[slot].v_ptr = ptr; - mem->mapped_ptr[slot].size = size; - if(origin) { - assert(region); - mem->mapped_ptr[slot].origin[0] = origin[0]; - mem->mapped_ptr[slot].origin[1] = origin[1]; - mem->mapped_ptr[slot].origin[2] = origin[2]; - mem->mapped_ptr[slot].region[0] = region[0]; - mem->mapped_ptr[slot].region[1] = region[1]; - mem->mapped_ptr[slot].region[2] = region[2]; - } - mem->map_ref++; -error: - if (err != CL_SUCCESS) - *mem_ptr = NULL; - return err; -} - -void * -clEnqueueMapBuffer(cl_command_queue command_queue, - cl_mem buffer, - cl_bool blocking_map, - cl_map_flags map_flags, - size_t offset, - size_t size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) -{ - cl_int err = CL_SUCCESS; - void *ptr = NULL; - void *mem_ptr = NULL; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(buffer); - if (command_queue->ctx != buffer->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!size || offset + size > buffer->size) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((map_flags & CL_MAP_READ && - buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) || - (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) && - buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) - { - err = CL_INVALID_OPERATION; - goto error; - } - -#ifdef HAS_CMRT - if (command_queue->cmrt_event != NULL) - cmrt_wait_for_task_finished(command_queue); -#endif - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, buffer->ctx); - - data = &no_wait_data; - data->type = EnqueueMapBuffer; - data->mem_obj = buffer; - data->offset = offset; - data->size = size; - data->ptr = ptr; - data->unsync_map = 1; - if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) - data->write_map = 1; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - data->unsync_map = 0; - err = 
cl_enqueue_handle(event ? *event : NULL, data); - if (err != CL_SUCCESS) - goto error; - ptr = data->ptr; - if(event) cl_event_set_status(*event, CL_COMPLETE); - } else { - if (buffer->is_userptr) - ptr = buffer->host_ptr; - else { - if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) { - err = CL_MAP_FAILURE; - goto error; - } - } - } - err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL); - if (err != CL_SUCCESS) - goto error; - -error: - if (errcode_ret) - *errcode_ret = err; - return mem_ptr; -} - -void * -clEnqueueMapImage(cl_command_queue command_queue, - cl_mem mem, - cl_bool blocking_map, - cl_map_flags map_flags, - const size_t * porigin, - const size_t * pregion, - size_t * image_row_pitch, - size_t * image_slice_pitch, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event, - cl_int * errcode_ret) -{ - cl_int err = CL_SUCCESS; - void *ptr = NULL; - void *mem_ptr = NULL; - size_t offset = 0; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_IMAGE(mem, image); - FIXUP_IMAGE_REGION(image, pregion, region); - FIXUP_IMAGE_ORIGIN(image, porigin, origin); - if (command_queue->ctx != mem->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - if (!origin || !region || origin[0] + region[0] > image->w || origin[1] + region[1] > image->h || origin[2] + region[2] > image->depth) { - err = CL_INVALID_VALUE; - goto error; - } - - if (!image_row_pitch || (image->slice_pitch && !image_slice_pitch)) { - err = CL_INVALID_VALUE; - goto error; - } - - if ((map_flags & CL_MAP_READ && - mem->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)) || - (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION) && - mem->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))) - { - err = CL_INVALID_OPERATION; - goto error; - } - -#ifdef HAS_CMRT - if (command_queue->cmrt_event != NULL) - cmrt_wait_for_task_finished(command_queue); -#endif - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, mem->ctx); - - data = &no_wait_data; - data->type = EnqueueMapImage; - data->mem_obj = mem; - data->origin[0] = origin[0]; data->origin[1] = origin[1]; data->origin[2] = origin[2]; - data->region[0] = region[0]; data->region[1] = region[1]; data->region[2] = region[2]; - data->ptr = ptr; - data->unsync_map = 1; - if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION)) - data->write_map = 1; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) { - data->unsync_map = 0; - err = cl_enqueue_handle(event ? 
*event : NULL, data); - if (err != CL_SUCCESS) - goto error; - ptr = data->ptr; - if(event) cl_event_set_status(*event, CL_COMPLETE); - } else { - if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) { - err = CL_MAP_FAILURE; - goto error; - } - } - - if(mem->flags & CL_MEM_USE_HOST_PTR) { - if (image_slice_pitch) - *image_slice_pitch = image->host_slice_pitch; - *image_row_pitch = image->host_row_pitch; - - offset = image->bpp*origin[0] + image->host_row_pitch*origin[1] + image->host_slice_pitch*origin[2]; - } else { - if (image_slice_pitch) - *image_slice_pitch = image->slice_pitch; - if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) - *image_row_pitch = image->slice_pitch; - else - *image_row_pitch = image->row_pitch; - - offset = image->bpp*origin[0] + image->row_pitch*origin[1] + image->slice_pitch*origin[2]; - } - err = _cl_map_mem(mem, ptr, &mem_ptr, offset, 0, origin, region); - -error: - if (errcode_ret) - *errcode_ret = err; - return mem_ptr; //TODO: map and unmap first -} - -cl_int -clEnqueueUnmapMemObject(cl_command_queue command_queue, - cl_mem memobj, - void * mapped_ptr, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_MEM(memobj); - if (command_queue->ctx != memobj->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, memobj->ctx); - - data = &no_wait_data; - data->type = EnqueueUnmapMemObject; - data->mem_obj = memobj; - data->ptr = mapped_ptr; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_UNMAP_MEM_OBJECT) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? *event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueMigrateMemObjects(cl_command_queue command_queue, - cl_uint num_mem_objects, - const cl_mem * mem_objects, - cl_mem_migration_flags flags, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - /* So far, we just support 1 device and no subdevice. So all the command queues - belong to the small context. There is no need to migrate the mem objects by now. */ - cl_int err = CL_SUCCESS; - cl_uint i = 0; - enqueue_data *data, defer_enqueue_data = { 0 }; - - if (!flags & CL_MIGRATE_MEM_OBJECT_HOST) - CHECK_QUEUE(command_queue); - - if (num_mem_objects == 0 || mem_objects == NULL) { - err = CL_INVALID_VALUE; - goto error; - } - - if (flags && flags & ~(CL_MIGRATE_MEM_OBJECT_HOST | - CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED)) { - err = CL_INVALID_VALUE; - goto error; - } - - for (i = 0; i < num_mem_objects; i++) { - CHECK_MEM(mem_objects[i]); - if (mem_objects[i]->ctx != command_queue->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - } - - /* really nothing to do, fill the event. */ - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx); - data = &defer_enqueue_data; - data->type = EnqueueMigrateMemObj; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_READ_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? 
*event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueNDRangeKernel(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint work_dim, - const size_t * global_work_offset, - const size_t * global_work_size, - const size_t * local_work_size, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - size_t fixed_global_off[] = {0,0,0}; - size_t fixed_global_sz[] = {1,1,1}; - size_t fixed_local_sz[] = {1,1,1}; - cl_int err = CL_SUCCESS; - cl_uint i; - enqueue_data *data, no_wait_data = { 0 }; - - CHECK_QUEUE(command_queue); - CHECK_KERNEL(kernel); - - /* Check number of dimensions we have */ - if (UNLIKELY(work_dim == 0 || work_dim > 3)) { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } - - /* We need a work size per dimension */ - if (UNLIKELY(global_work_size == NULL)) { - err = CL_INVALID_GLOBAL_WORK_SIZE; - goto error; - } - - if (kernel->vme) { - if (work_dim != 2) { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } - if (local_work_size != NULL) { - err = CL_INVALID_WORK_GROUP_SIZE; - goto error; - } - } - - if (global_work_offset != NULL) - for (i = 0; i < work_dim; ++i) { - if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) { - err = CL_INVALID_GLOBAL_OFFSET; - goto error; - } - } - - /* Local sizes must be non-null and divide global sizes */ - if (local_work_size != NULL) - for (i = 0; i < work_dim; ++i) - if (UNLIKELY(local_work_size[i] == 0 || global_work_size[i] % local_work_size[i])) { - err = CL_INVALID_WORK_GROUP_SIZE; - goto error; - } - - /* Queue and kernel must share the same context */ - assert(kernel->program); - if (command_queue->ctx != kernel->program->ctx) { - err = CL_INVALID_CONTEXT; - goto error; - } - -#ifdef HAS_CMRT - if (kernel->cmrt_kernel != NULL) { - err = cmrt_enqueue(command_queue, kernel, global_work_size, local_work_size); - goto error; - } -#endif - - /* XXX No event right now */ - //FATAL_IF(num_events_in_wait_list > 0, "Events are not supported"); - //FATAL_IF(event_wait_list != NULL, "Events are not supported"); - //FATAL_IF(event != NULL, "Events are not supported"); - - if (local_work_size != NULL) { - for (i = 0; i < work_dim; ++i) - fixed_local_sz[i] = local_work_size[i]; - } else { - if (kernel->vme) { - fixed_local_sz[0] = 16; - fixed_local_sz[1] = 1; - } else { - uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large - size_t realGroupSize = 1; - for (i = 0; i< work_dim; i++) { - for (j = maxDimSize; j > 1; j--) { - if (global_work_size[i] % j == 0 && j <= maxGroupSize) { - fixed_local_sz[i] = j; - maxGroupSize = maxGroupSize /j; - maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize; - break; //choose next work_dim - } - } - realGroupSize *= fixed_local_sz[i]; - } - - //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop: - //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel - //it triggers the following message for many times. - //to avoid too many messages, only print it for the first time of the process. - //just use static variable since it doesn't matter to print a few times at multi-thread case. 
- static int warn_no_good_localsize = 1; - if (realGroupSize % 8 != 0 && warn_no_good_localsize) { - warn_no_good_localsize = 0; - DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide local_work_size[] explicitly, you can find good values with trial-and-error method."); - } - } - } - - if (kernel->vme) { - fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16; - fixed_global_sz[1] = (global_work_size[1]+15) / 16; - } else { - for (i = 0; i < work_dim; ++i) - fixed_global_sz[i] = global_work_size[i]; - } - if (global_work_offset != NULL) - for (i = 0; i < work_dim; ++i) - fixed_global_off[i] = global_work_offset[i]; - - if (kernel->compile_wg_sz[0] || kernel->compile_wg_sz[1] || kernel->compile_wg_sz[2]) { - if (fixed_local_sz[0] != kernel->compile_wg_sz[0] - || fixed_local_sz[1] != kernel->compile_wg_sz[1] - || fixed_local_sz[2] != kernel->compile_wg_sz[2]) - { - err = CL_INVALID_WORK_GROUP_SIZE; - goto error; - } - } - - /* Do device specific checks are enqueue the kernel */ - err = cl_command_queue_ND_range(command_queue, - kernel, - work_dim, - fixed_global_off, - fixed_global_sz, - fixed_local_sz); - if(err != CL_SUCCESS) - goto error; - - data = &no_wait_data; - data->type = EnqueueNDRangeKernel; - data->queue = command_queue; - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx); - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_NDRANGE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) { - if (event && (*event)->type != CL_COMMAND_USER - && (*event)->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(*event, CL_PROFILING_COMMAND_SUBMIT); - } - - err = cl_command_queue_flush(command_queue); - } - -error: - if(b_output_kernel_perf) - { - if(kernel->program->build_opts != NULL) - time_end(command_queue->ctx, cl_kernel_get_name(kernel), kernel->program->build_opts, command_queue); - else - time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue); - } - - return err; -} - -cl_int -clEnqueueTask(cl_command_queue command_queue, - cl_kernel kernel, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - const size_t global_size[3] = {1, 0, 0}; - const size_t local_size[3] = {1, 0, 0}; - - return clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_size, local_size, - num_events_in_wait_list, event_wait_list, event); -} - -cl_int -clEnqueueNativeKernel(cl_command_queue command_queue, - void (*user_func)(void *), - void * args, - size_t cb_args, - cl_uint num_mem_objects, - const cl_mem * mem_list, - const void ** args_mem_loc, - cl_uint num_events_in_wait_list, - const cl_event * event_wait_list, - cl_event * event) -{ - cl_int err = CL_SUCCESS; - void *new_args = NULL; - enqueue_data *data, no_wait_data = { 0 }; - cl_int i; - - if(user_func == NULL || - (args == NULL && cb_args > 0) || - (args == NULL && num_mem_objects ==0) || - (args != NULL && cb_args == 0) || - (num_mem_objects > 0 && (mem_list == NULL || args_mem_loc == NULL)) || - (num_mem_objects == 0 && (mem_list != NULL || args_mem_loc != NULL))) { - err = CL_INVALID_VALUE; - goto error; - } - - //Per spec, need copy args - if (cb_args) - { - new_args = malloc(cb_args); - if (!new_args) - { - err = CL_OUT_OF_HOST_MEMORY; - goto error; - } - memcpy(new_args, args, cb_args); - - for (i=0; i<num_mem_objects; ++i) - { - CHECK_MEM(mem_list[i]); - args_mem_loc[i] = new_args + (args_mem_loc[i] - args); //change to new args - } - } - - 
TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx); - - data = &no_wait_data; - data->type = EnqueueNativeKernel; - data->mem_list = mem_list; - data->ptr = new_args; - data->size = cb_args; - data->offset = (size_t)num_mem_objects; - data->const_ptr = args_mem_loc; - data->user_func = user_func; - - if(handle_events(command_queue, num_events_in_wait_list, event_wait_list, - event, data, CL_COMMAND_NATIVE_KERNEL) == CL_ENQUEUE_EXECUTE_IMM) { - err = cl_enqueue_handle(event ? *event : NULL, data); - if(event) cl_event_set_status(*event, CL_COMPLETE); - } - -error: - return err; -} - -cl_int -clEnqueueMarker(cl_command_queue command_queue, - cl_event *event) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE(command_queue); - if(event == NULL) { - err = CL_INVALID_VALUE; - goto error; - } - - cl_event_marker_with_wait_list(command_queue, 0, NULL, event); -error: - return err; -} - -cl_int -clEnqueueMarkerWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE(command_queue); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx); - - cl_event_marker_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event); -error: - return err; -} - -cl_int -clEnqueueWaitForEvents(cl_command_queue command_queue, - cl_uint num_events, - const cl_event * event_list) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE(command_queue); - err = clWaitForEvents(num_events, event_list); - -error: - return err; -} - -cl_int -clEnqueueBarrier(cl_command_queue command_queue) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE(command_queue); - - cl_event_barrier_with_wait_list(command_queue, 0, NULL, NULL); - -error: - return err; -} - -cl_int -clEnqueueBarrierWithWaitList(cl_command_queue command_queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE(command_queue); - - TRY(cl_event_check_waitlist, num_events_in_wait_list, event_wait_list, event, command_queue->ctx); - - cl_event_barrier_with_wait_list(command_queue, num_events_in_wait_list, event_wait_list, event); -error: - return err; -} - #define EXTFUNC(x) \ if (strcmp(#x, func_name) == 0) \ return (void *)x; diff --git a/src/cl_api_kernel.c b/src/cl_api_kernel.c index a1075d73..ef494e6e 100644 --- a/src/cl_api_kernel.c +++ b/src/cl_api_kernel.c @@ -130,10 +130,19 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, } realGroupSize *= fixed_local_sz[i]; } - if (realGroupSize % 8 != 0) + + //in a loop of conformance test (such as test_api repeated_setup_cleanup), in each loop: + //create a new context, a new command queue, and uses 'globalsize[0]=1000, localsize=NULL' to enqueu kernel + //it triggers the following message for many times. + //to avoid too many messages, only print it for the first time of the process. + //just use static variable since it doesn't matter to print a few times at multi-thread case. 
+ static int warn_no_good_localsize = 1; + if (realGroupSize % 8 != 0 && warn_no_good_localsize) { + warn_no_good_localsize = 0; DEBUGP(DL_WARNING, "unable to find good values for local_work_size[i], please provide\n" - " local_work_size[] explicitly, you can find good values with\n" - " trial-and-error method."); + " local_work_size[] explicitly, you can find good values with\n" + " trial-and-error method."); + } } } @@ -253,10 +262,10 @@ clEnqueueNativeKernel(cl_command_queue command_queue, //Per spec, need copy args if (cb_args) { - new_args = CL_MALLOC(cb_args); + new_args = cl_malloc(cb_args); if (num_mem_objects) { - new_args_mem_loc = CL_MALLOC(sizeof(void *) * num_mem_objects); - new_mem_list = CL_MALLOC(sizeof(cl_mem) * num_mem_objects); + new_args_mem_loc = cl_malloc(sizeof(void *) * num_mem_objects); + new_mem_list = cl_malloc(sizeof(cl_mem) * num_mem_objects); memcpy(new_mem_list, mem_list, sizeof(cl_mem) * num_mem_objects); } @@ -320,11 +329,11 @@ clEnqueueNativeKernel(cl_command_queue command_queue, if (err != CL_SUCCESS) { if (new_args) - CL_FREE(new_args); + cl_free(new_args); if (new_mem_list) - CL_FREE(new_mem_list); + cl_free(new_mem_list); if (new_args_mem_loc) - CL_FREE(new_args_mem_loc); + cl_free(new_args_mem_loc); } if (err == CL_SUCCESS && event) { diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 8d3c6b03..54a487c8 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -45,14 +45,16 @@ cl_command_queue_new(cl_context ctx) assert(ctx); TRY_ALLOC_NO_ERR (queue, CALLOC(struct _cl_command_queue)); CL_OBJECT_INIT_BASE(queue, CL_OBJECT_COMMAND_QUEUE_MAGIC); + cl_command_queue_init_enqueue(queue); - queue->cmrt_event = NULL; if ((queue->thread_data = cl_thread_data_create()) == NULL) { goto error; } /* Append the command queue in the list */ cl_context_add_queue(ctx, queue); + queue->ctx = ctx; + queue->cmrt_event = NULL; exit: return queue; @@ -69,6 +71,8 @@ cl_command_queue_delete(cl_command_queue queue) if (CL_OBJECT_DEC_REF(queue) > 1) return; + cl_command_queue_destroy_enqueue(queue); + #ifdef HAS_CMRT if (queue->cmrt_event != NULL) cmrt_destroy_event(queue); @@ -76,7 +80,7 @@ cl_command_queue_delete(cl_command_queue queue) // If there is a list of valid events, we need to give them // a chance to call the call-back function. 
-  cl_event_update_last_events(queue,1);
+  //cl_event_update_last_events(queue,1);
   cl_thread_data_destroy(queue);
   queue->thread_data = NULL;
@@ -112,10 +116,9 @@ set_image_info(char *curbe,
 }
 LOCAL cl_int
-cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu)
 {
   uint32_t i;
-  GET_QUEUE_THREAD_GPGPU(queue);
   for (i = 0; i < k->image_sz; i++) {
     int id = k->images[i].arg_idx;
@@ -149,9 +152,9 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 }
 LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
+cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k, cl_gpgpu gpgpu)
 {
-  GET_QUEUE_THREAD_GPGPU(queue);
+  //GET_QUEUE_THREAD_GPGPU(queue);
   /* Bind all user buffers (given by clSetKernelArg) */
   uint32_t i;
@@ -175,7 +178,8 @@ cl_command_queue_bind_surface(cl_command_queue queue, cl_kernel k)
   return CL_SUCCESS;
 }
-extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, uint32_t, const size_t *, const size_t *, const size_t *);
+extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, cl_event,
+                                             uint32_t, const size_t *, const size_t *, const size_t *);
 static cl_int
 cl_kernel_check_args(cl_kernel k)
@@ -190,6 +194,7 @@ cl_kernel_check_args(cl_kernel k)
 LOCAL cl_int
 cl_command_queue_ND_range(cl_command_queue queue,
                           cl_kernel k,
+                          cl_event event,
                           const uint32_t work_dim,
                           const size_t *global_wk_off,
                           const size_t *global_wk_sz,
@@ -203,8 +208,10 @@ cl_command_queue_ND_range(cl_command_queue queue,
   /* Check that the user did not forget any argument */
   TRY (cl_kernel_check_args, k);
+
   if (ver == 7 || ver == 75 || ver == 8 || ver == 9)
-    TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
+    TRY (cl_command_queue_ND_range_gen7, queue, k, event,
+         work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
@@ -213,7 +220,7 @@ error:
 }
 LOCAL int
-cl_command_queue_flush_gpgpu(cl_command_queue queue, cl_gpgpu gpgpu)
+cl_command_queue_flush_gpgpu(cl_gpgpu gpgpu)
 {
   void* printf_info = cl_gpgpu_get_printf_info(gpgpu);
   void* profiling_info;
@@ -246,15 +253,15 @@ cl_command_queue_flush(cl_command_queue queue)
 {
   int err;
   GET_QUEUE_THREAD_GPGPU(queue);
-  err = cl_command_queue_flush_gpgpu(queue, gpgpu);
+  err = cl_command_queue_flush_gpgpu(gpgpu);
   // We now keep a list of uncompleted events and check if they complete
   // every flush. This can make sure all events created have a chance to
   // update status, so the callback functions or references can be handled.
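// Editor's note: the cl_event_update_last_events() calls commented out in this file are
// superseded rather than lost: after this patch, event status is driven through the new
// enqueue machinery in cl_command_queue_enqueue.c (cl_command_queue_wait_flush() and
// cl_command_queue_wait_finish(), shown further below), so flush/finish no longer walk a
// per-queue last-event list.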
- cl_event_update_last_events(queue,0); + //cl_event_update_last_events(queue,0); cl_event current_event = get_current_event(queue); if (current_event && err == CL_SUCCESS) { - err = cl_event_flush(current_event); + //err = cl_event_flush(current_event); set_current_event(queue, NULL); } cl_invalid_thread_gpgpu(queue); @@ -265,7 +272,7 @@ LOCAL cl_int cl_command_queue_finish(cl_command_queue queue) { cl_gpgpu_sync(cl_get_thread_batch_buf(queue)); - cl_event_update_last_events(queue,1); + //cl_event_update_last_events(queue,1); return CL_SUCCESS; } @@ -337,72 +344,69 @@ cl_command_queue_remove_event(cl_command_queue queue, cl_event event) queue->wait_events_num -= 1; } -#define DEFAULT_WAIT_EVENTS_SIZE 16 LOCAL void cl_command_queue_insert_barrier_event(cl_command_queue queue, cl_event event) { - cl_int i=0; - cl_event *new_list; + cl_int i = 0; + + cl_event_add_ref(event); assert(queue != NULL); - if(queue->barrier_events == NULL) { - queue->barrier_events_size = DEFAULT_WAIT_EVENTS_SIZE; - TRY_ALLOC_NO_ERR (queue->barrier_events, CALLOC_ARRAY(cl_event, queue->barrier_events_size)); + CL_OBJECT_LOCK(queue); + + if (queue->barrier_events == NULL) { + queue->barrier_events_size = 4; + queue->barrier_events = cl_calloc(queue->barrier_events_size, sizeof(cl_event)); + assert(queue->barrier_events); } - for(i=0; i<queue->barrier_events_num; i++) { - if(queue->barrier_events[i] == event) - return; //is in the barrier_events, need to insert + for (i = 0; i<queue->barrier_events_num; i++) { + assert(queue->barrier_events[i] != event); } if(queue->barrier_events_num < queue->barrier_events_size) { queue->barrier_events[queue->barrier_events_num++] = event; + CL_OBJECT_UNLOCK(queue); return; } - //barrier_events_num == barrier_events_size, array is full + /* Array is full, double expand. */ queue->barrier_events_size *= 2; - TRY_ALLOC_NO_ERR (new_list, CALLOC_ARRAY(cl_event, queue->barrier_events_size)); - memcpy(new_list, queue->barrier_events, sizeof(cl_event)*queue->barrier_events_num); - cl_free(queue->barrier_events); - queue->barrier_events = new_list; - queue->barrier_events[queue->barrier_events_num++] = event; - return; + queue->barrier_events = cl_realloc(queue->barrier_events, + queue->barrier_events_size * sizeof(cl_event)); + assert(queue->barrier_events); -exit: + queue->barrier_events[queue->barrier_events_num++] = event; + CL_OBJECT_UNLOCK(queue); return; -error: - if(queue->barrier_events) - cl_free(queue->barrier_events); - queue->barrier_events = NULL; - queue->barrier_events_size = 0; - queue->barrier_events_num = 0; - goto exit; - } LOCAL void cl_command_queue_remove_barrier_event(cl_command_queue queue, cl_event event) { - cl_int i=0; + cl_int i = 0; + assert(queue != NULL); - if(queue->barrier_events_num == 0) - return; + CL_OBJECT_LOCK(queue); - for(i=0; i<queue->barrier_events_num; i++) { + assert(queue->barrier_events_num > 0); + assert(queue->barrier_events); + + for(i = 0; i < queue->barrier_events_num; i++) { if(queue->barrier_events[i] == event) break; } + assert(i < queue->barrier_events_num); // Must find it. - if(i == queue->barrier_events_num) - return; - - if(i == queue->barrier_events_num - 1) { + if(i == queue->barrier_events_num - 1) { // The last one. queue->barrier_events[i] = NULL; } else { - for(; i<queue->barrier_events_num-1; i++) { + for(; i < queue->barrier_events_num - 1; i++) { // Move forward. 
queue->barrier_events[i] = queue->barrier_events[i+1]; } } queue->barrier_events_num -= 1; + CL_OBJECT_UNLOCK(queue); + + cl_event_delete(event); } diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h index 34886f8e..470cafb7 100644 --- a/src/cl_command_queue.h +++ b/src/cl_command_queue.h @@ -81,6 +81,7 @@ extern void cl_command_queue_add_ref(cl_command_queue); /* Map ND range kernel from OCL API */ extern cl_int cl_command_queue_ND_range(cl_command_queue queue, cl_kernel ker, + cl_event event, const uint32_t work_dim, const size_t *global_work_offset, const size_t *global_work_size, @@ -93,16 +94,16 @@ extern cl_int cl_command_queue_set_report_buffer(cl_command_queue, cl_mem); extern cl_int cl_command_queue_flush(cl_command_queue); /* Flush for the specified gpgpu */ -extern int cl_command_queue_flush_gpgpu(cl_command_queue, cl_gpgpu); +extern int cl_command_queue_flush_gpgpu(cl_gpgpu); /* Wait for the completion of the command queue */ extern cl_int cl_command_queue_finish(cl_command_queue); /* Bind all the surfaces in the GPGPU state */ -extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel); +extern cl_int cl_command_queue_bind_surface(cl_command_queue, cl_kernel, cl_gpgpu); /* Bind all the image surfaces in the GPGPU state */ -extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel); +extern cl_int cl_command_queue_bind_image(cl_command_queue, cl_kernel, cl_gpgpu); /* Insert a user event to command's wait_events */ extern void cl_command_queue_insert_event(cl_command_queue, cl_event); diff --git a/src/cl_command_queue_enqueue.c b/src/cl_command_queue_enqueue.c index 1848d50c..7bc6dd3c 100644 --- a/src/cl_command_queue_enqueue.c +++ b/src/cl_command_queue_enqueue.c @@ -18,7 +18,7 @@ */ #include "cl_command_queue.h" -#include "cl_event_new.h" +#include "cl_event.h" #include "cl_alloc.h" #include <stdio.h> @@ -203,7 +203,7 @@ cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_nu } assert(event_num > 0); - enqueued_list = CL_CALLOC(event_num, sizeof(cl_event)); + enqueued_list = cl_calloc(event_num, sizeof(cl_event)); assert(enqueued_list); i = 0; @@ -265,7 +265,7 @@ cl_command_queue_wait_flush(cl_command_queue queue) cl_event_delete(enqueued_list[i]); } if (enqueued_list) - CL_FREE(enqueued_list); + cl_free(enqueued_list); return CL_SUCCESS; } @@ -315,7 +315,7 @@ cl_command_queue_wait_finish(cl_command_queue queue) cl_event_delete(enqueued_list[i]); } if (enqueued_list) - CL_FREE(enqueued_list); + cl_free(enqueued_list); return CL_SUCCESS; } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index b6a5920c..5ad3b8b4 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -23,6 +23,7 @@ #include "cl_kernel.h" #include "cl_device_id.h" #include "cl_mem.h" +#include "cl_event.h" #include "cl_utils.h" #include "cl_alloc.h" @@ -123,12 +124,12 @@ error: } static int -cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker) +cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker, cl_gpgpu gpgpu) { /* calculate constant buffer size * we need raw_size & aligned_size */ - GET_QUEUE_THREAD_GPGPU(queue); + //GET_QUEUE_THREAD_GPGPU(queue); int32_t arg; size_t offset = 0; uint32_t raw_size = 0, aligned_size =0; @@ -331,12 +332,14 @@ cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num LOCAL cl_int cl_command_queue_ND_range_gen7(cl_command_queue queue, cl_kernel ker, + cl_event event, const uint32_t work_dim, const size_t *global_wk_off, const size_t 
                               *global_wk_sz,
                               const size_t *local_wk_sz)
{
-  GET_QUEUE_THREAD_GPGPU(queue);
+  //GET_QUEUE_THREAD_GPGPU(queue);
+  cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
   cl_context ctx = queue->ctx;
   char *final_curbe = NULL;  /* Includes them and one sub-buffer per group */
   cl_gpgpu_kernel kernel;
@@ -403,9 +406,9 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   }
   /* Bind user buffers */
-  cl_command_queue_bind_surface(queue, ker);
+  cl_command_queue_bind_surface(queue, ker, gpgpu);
   /* Bind user images */
-  if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker) != CL_SUCCESS))
+  if(UNLIKELY(err = cl_command_queue_bind_image(queue, ker, gpgpu) != CL_SUCCESS))
     return err;
   /* Bind all samplers */
   if (ker->vme)
@@ -419,7 +422,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind a stack if needed */
   cl_bind_stack(gpgpu, ker);
-  if (cl_upload_constant_buffer(queue, ker) != 0)
+  if (cl_upload_constant_buffer(queue, ker, gpgpu) != 0)
     goto error;
   cl_gpgpu_states_setup(gpgpu, &kernel);
@@ -440,7 +443,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   batch_sz = cl_kernel_compute_batch_sz(ker);
   if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
     goto error;
-  cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
+  //cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
   cl_gpgpu_batch_start(gpgpu);
   /* Issue the GPGPU_WALKER command */
@@ -448,6 +451,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Close the batch buffer and submit it */
   cl_gpgpu_batch_end(gpgpu, 0);
+
+  event->exec_data.gpgpu = gpgpu;
+  event->exec_data.type = EnqueueNDRangeKernel;
+
   return CL_SUCCESS;
 error:
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index 54c0ffa9..ac29ebe4 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -16,93 +16,101 @@
  *
  * Author: Rong Yang <rong.r.yang@intel.com>
  */
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-#include <pthread.h>
+//#include "cl_image.h"
 #include "cl_enqueue.h"
-#include "cl_image.h"
 #include "cl_driver.h"
 #include "cl_event.h"
 #include "cl_command_queue.h"
 #include "cl_utils.h"
+#include "cl_alloc.h"
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <pthread.h>
-
-cl_int cl_enqueue_read_buffer(enqueue_data* data)
+static cl_int
+cl_enqueue_read_buffer(enqueue_data *data, cl_int status)
 {
   cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
+
+  if (status != CL_COMPLETE)
+    return err;
+
   assert(mem->type == CL_MEM_BUFFER_TYPE || mem->type == CL_MEM_SUBBUFFER_TYPE);
-  struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
+  struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem;
  //cl_buffer_get_subdata is sometimes very, very slow in the linux kernel, on skl and chv,
  //and it happens randomly. So temporarily disable it and use map/copy/unmap to read.
  //Should re-enable it after finding the root cause.
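// Editor's note: in the reworked cl_enqueue.c each handler now takes (data, status) and can
// be called at more than one status transition; a pure host-side transfer such as this read
// only acts on the CL_COMPLETE pass. A minimal sketch of the convention, names illustrative:
//
//   static cl_int cl_enqueue_example(enqueue_data *data, cl_int status)
//   {
//     if (status != CL_COMPLETE)
//       return CL_SUCCESS;  /* nothing to do at submit time */
//     /* ...perform the host-side work here... */
//     return CL_SUCCESS;
//   }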
if (0 && !mem->is_userptr) { if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset, - data->size, data->ptr) != 0) + data->size, data->ptr) != 0) err = CL_MAP_FAILURE; } else { - void* src_ptr = cl_mem_map_auto(mem, 0); + void *src_ptr = cl_mem_map_auto(mem, 0); if (src_ptr == NULL) err = CL_MAP_FAILURE; else { //sometimes, application invokes read buffer, instead of map buffer, even if userptr is enabled //memcpy is not necessary for this case - if (data->ptr != (char*)src_ptr + data->offset + buffer->sub_offset) - memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size); + if (data->ptr != (char *)src_ptr + data->offset + buffer->sub_offset) + memcpy(data->ptr, (char *)src_ptr + data->offset + buffer->sub_offset, data->size); cl_mem_unmap_auto(mem); } } return err; } -cl_int cl_enqueue_read_buffer_rect(enqueue_data* data) +static cl_int +cl_enqueue_read_buffer_rect(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; - void* src_ptr; - void* dst_ptr; + void *src_ptr; + void *dst_ptr; - const size_t* origin = data->origin; - const size_t* host_origin = data->host_origin; - const size_t* region = data->region; + const size_t *origin = data->origin; + const size_t *host_origin = data->host_origin; + const size_t *region = data->region; cl_mem mem = data->mem_obj; + + if (status != CL_COMPLETE) + return err; + assert(mem->type == CL_MEM_BUFFER_TYPE || mem->type == CL_MEM_SUBBUFFER_TYPE); - struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; + struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem; if (!(src_ptr = cl_mem_map_auto(mem, 0))) { err = CL_MAP_FAILURE; goto error; } - size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2]; - src_ptr = (char*)src_ptr + offset + buffer->sub_offset; - - offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2]; - dst_ptr = (char *)data->ptr + offset; - - if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch && - (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch))) - { - memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]); - } - else { - cl_uint y, z; - for (z = 0; z < region[2]; z++) { - const char* src = src_ptr; - char* dst = dst_ptr; - for (y = 0; y < region[1]; y++) { - memcpy(dst, src, region[0]); - src += data->row_pitch; - dst += data->host_row_pitch; - } - src_ptr = (char*)src_ptr + data->slice_pitch; - dst_ptr = (char*)dst_ptr + data->host_slice_pitch; - } - } + size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2]; + src_ptr = (char *)src_ptr + offset + buffer->sub_offset; + + offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2]; + dst_ptr = (char *)data->ptr + offset; + + if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch && + (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) { + memcpy(dst_ptr, src_ptr, region[2] == 1 ? 
data->row_pitch * region[1] : data->slice_pitch * region[2]); + } else { + cl_uint y, z; + for (z = 0; z < region[2]; z++) { + const char *src = src_ptr; + char *dst = dst_ptr; + for (y = 0; y < region[1]; y++) { + memcpy(dst, src, region[0]); + src += data->row_pitch; + dst += data->host_row_pitch; + } + src_ptr = (char *)src_ptr + data->slice_pitch; + dst_ptr = (char *)dst_ptr + data->host_slice_pitch; + } + } err = cl_mem_unmap_auto(mem); @@ -110,75 +118,80 @@ error: return err; } -cl_int cl_enqueue_write_buffer(enqueue_data *data) +static cl_int +cl_enqueue_write_buffer(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; cl_mem mem = data->mem_obj; assert(mem->type == CL_MEM_BUFFER_TYPE || mem->type == CL_MEM_SUBBUFFER_TYPE); - struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; + struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem; + + if (status != CL_COMPLETE) + return err; if (mem->is_userptr) { - void* dst_ptr = cl_mem_map_auto(mem, 1); + void *dst_ptr = cl_mem_map_auto(mem, 1); if (dst_ptr == NULL) err = CL_MAP_FAILURE; else { - memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size); + memcpy((char *)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size); cl_mem_unmap_auto(mem); } - } - else { + } else { if (cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset, - data->size, data->const_ptr) != 0) + data->size, data->const_ptr) != 0) err = CL_MAP_FAILURE; } return err; } -cl_int cl_enqueue_write_buffer_rect(enqueue_data *data) +static cl_int +cl_enqueue_write_buffer_rect(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; - void* src_ptr; - void* dst_ptr; + void *src_ptr; + void *dst_ptr; - const size_t* origin = data->origin; - const size_t* host_origin = data->host_origin; - const size_t* region = data->region; + const size_t *origin = data->origin; + const size_t *host_origin = data->host_origin; + const size_t *region = data->region; cl_mem mem = data->mem_obj; assert(mem->type == CL_MEM_BUFFER_TYPE || mem->type == CL_MEM_SUBBUFFER_TYPE); - struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; + struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem; + + if (status != CL_COMPLETE) + return err; if (!(dst_ptr = cl_mem_map_auto(mem, 1))) { err = CL_MAP_FAILURE; goto error; } - size_t offset = origin[0] + data->row_pitch*origin[1] + data->slice_pitch*origin[2]; + size_t offset = origin[0] + data->row_pitch * origin[1] + data->slice_pitch * origin[2]; dst_ptr = (char *)dst_ptr + offset + buffer->sub_offset; - offset = host_origin[0] + data->host_row_pitch*host_origin[1] + data->host_slice_pitch*host_origin[2]; - src_ptr = (char*)data->const_ptr + offset; + offset = host_origin[0] + data->host_row_pitch * host_origin[1] + data->host_slice_pitch * host_origin[2]; + src_ptr = (char *)data->const_ptr + offset; if (data->row_pitch == region[0] && data->row_pitch == data->host_row_pitch && - (region[2] == 1 || (data->slice_pitch == region[0]*region[1] && data->slice_pitch == data->host_slice_pitch))) - { - memcpy(dst_ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]); - } - else { + (region[2] == 1 || (data->slice_pitch == region[0] * region[1] && data->slice_pitch == data->host_slice_pitch))) { + memcpy(dst_ptr, src_ptr, region[2] == 1 ? 
data->row_pitch * region[1] : data->slice_pitch * region[2]); + } else { cl_uint y, z; for (z = 0; z < region[2]; z++) { - const char* src = src_ptr; - char* dst = dst_ptr; + const char *src = src_ptr; + char *dst = dst_ptr; for (y = 0; y < region[1]; y++) { memcpy(dst, src, region[0]); src += data->host_row_pitch; dst += data->row_pitch; } - src_ptr = (char*)src_ptr + data->host_slice_pitch; - dst_ptr = (char*)dst_ptr + data->slice_pitch; + src_ptr = (char *)src_ptr + data->host_slice_pitch; + dst_ptr = (char *)dst_ptr + data->slice_pitch; } } @@ -188,16 +201,19 @@ error: return err; } - -cl_int cl_enqueue_read_image(enqueue_data *data) +static cl_int +cl_enqueue_read_image(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; - void* src_ptr; + void *src_ptr; cl_mem mem = data->mem_obj; CHECK_IMAGE(mem, image); - const size_t* origin = data->origin; - const size_t* region = data->region; + const size_t *origin = data->origin; + const size_t *region = data->region; + + if (status != CL_COMPLETE) + return err; if (!(src_ptr = cl_mem_map_auto(mem, 0))) { err = CL_MAP_FAILURE; @@ -208,40 +224,42 @@ cl_int cl_enqueue_read_image(enqueue_data *data) src_ptr = (char*)src_ptr + offset; if (!origin[0] && region[0] == image->w && data->row_pitch == image->row_pitch && - (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) - { - memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch*region[1] : data->slice_pitch*region[2]); - } - else { + (region[2] == 1 || (!origin[1] && region[1] == image->h && data->slice_pitch == image->slice_pitch))) { + memcpy(data->ptr, src_ptr, region[2] == 1 ? data->row_pitch * region[1] : data->slice_pitch * region[2]); + } else { cl_uint y, z; for (z = 0; z < region[2]; z++) { - const char* src = src_ptr; - char* dst = data->ptr; + const char *src = src_ptr; + char *dst = data->ptr; for (y = 0; y < region[1]; y++) { - memcpy(dst, src, image->bpp*region[0]); + memcpy(dst, src, image->bpp * region[0]); src += image->row_pitch; dst += data->row_pitch; } - src_ptr = (char*)src_ptr + image->slice_pitch; - data->ptr = (char*)data->ptr + data->slice_pitch; + src_ptr = (char *)src_ptr + image->slice_pitch; + data->ptr = (char *)data->ptr + data->slice_pitch; } } - err = cl_mem_unmap_auto(mem); + err = cl_mem_unmap_auto(mem); error: return err; - } -cl_int cl_enqueue_write_image(enqueue_data *data) +static cl_int +cl_enqueue_write_image(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; - void* dst_ptr; + void *dst_ptr; cl_mem mem = data->mem_obj; + CHECK_IMAGE(mem, image); + if (status != CL_COMPLETE) + return err; + if (!(dst_ptr = cl_mem_map_auto(mem, 1))) { err = CL_MAP_FAILURE; goto error; @@ -255,45 +273,57 @@ cl_int cl_enqueue_write_image(enqueue_data *data) error: return err; - } -cl_int cl_enqueue_map_buffer(enqueue_data *data) +static cl_int +cl_enqueue_map_buffer(enqueue_data *data, cl_int status) { void *ptr = NULL; cl_int err = CL_SUCCESS; cl_mem mem = data->mem_obj; assert(mem->type == CL_MEM_BUFFER_TYPE || mem->type == CL_MEM_SUBBUFFER_TYPE); - struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; + struct _cl_mem_buffer *buffer = (struct _cl_mem_buffer *)mem; - if (mem->is_userptr) - ptr = cl_mem_map_auto(mem, data->write_map ? 
1 : 0); - else { - if(data->unsync_map == 1) - //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here - ptr = cl_mem_map_gtt(mem); - else + if (status == CL_SUBMITTED) { + if (buffer->base.is_userptr) { + ptr = buffer->base.host_ptr; + } else { + if ((ptr = cl_mem_map_gtt_unsync(&buffer->base)) == NULL) { + err = CL_MAP_FAILURE; + return err; + } + } + data->ptr = ptr; + } else if (status == CL_COMPLETE) { + if (mem->is_userptr) ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0); - } + else { + if (data->unsync_map == 1) + //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here + ptr = cl_mem_map_gtt(mem); + else + ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0); + } - if (ptr == NULL) { - err = CL_MAP_FAILURE; - goto error; - } - data->ptr = ptr; + if (ptr == NULL) { + err = CL_MAP_FAILURE; + return err; + } + data->ptr = ptr; - if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) { - assert(mem->host_ptr); - ptr = (char*)ptr + data->offset + buffer->sub_offset; - memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size); + if ((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) { + assert(mem->host_ptr); + ptr = (char *)ptr + data->offset + buffer->sub_offset; + memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size); + } } -error: return err; } -cl_int cl_enqueue_map_image(enqueue_data *data) +static cl_int +cl_enqueue_map_image(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; cl_mem mem = data->mem_obj; @@ -301,46 +331,59 @@ cl_int cl_enqueue_map_image(enqueue_data *data) size_t row_pitch = 0; CHECK_IMAGE(mem, image); - if(data->unsync_map == 1) - //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here - ptr = cl_mem_map_gtt(mem); - else - ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0); + if (status == CL_SUBMITTED) { + if ((ptr = cl_mem_map_gtt_unsync(mem)) == NULL) { + err = CL_MAP_FAILURE; + goto error; + } + data->ptr = ptr; + } else if (status == CL_COMPLETE) { + if (data->unsync_map == 1) + //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here + ptr = cl_mem_map_gtt(mem); + else + ptr = cl_mem_map_auto(mem, data->write_map ? 
1 : 0); + + if (ptr == NULL) { + err = CL_MAP_FAILURE; + goto error; + } - if (ptr == NULL) { - err = CL_MAP_FAILURE; - goto error; - } - data->ptr = (char*)ptr + image->offset; - if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) - row_pitch = image->slice_pitch; - else - row_pitch = image->row_pitch; - - if(mem->flags & CL_MEM_USE_HOST_PTR) { - assert(mem->host_ptr); - if (!mem->is_userptr) - //src and dst need add offset in function cl_mem_copy_image_region - cl_mem_copy_image_region(data->origin, data->region, - mem->host_ptr, image->host_row_pitch, image->host_slice_pitch, - data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE); + data->ptr = (char*)ptr + image->offset; + if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + row_pitch = image->slice_pitch; + else + row_pitch = image->row_pitch; + + if(mem->flags & CL_MEM_USE_HOST_PTR) { + assert(mem->host_ptr); + if (!mem->is_userptr) + //src and dst need add offset in function cl_mem_copy_image_region + cl_mem_copy_image_region(data->origin, data->region, + mem->host_ptr, image->host_row_pitch, image->host_slice_pitch, + data->ptr, row_pitch, image->slice_pitch, image, CL_TRUE, CL_TRUE); + } } error: return err; } -cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) +static cl_int +cl_enqueue_unmap_mem_object(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; int i, j; size_t mapped_size = 0; size_t origin[3], region[3]; - void * v_ptr = NULL; - void * mapped_ptr = data->ptr; + void *v_ptr = NULL; + void *mapped_ptr = data->ptr; cl_mem memobj = data->mem_obj; size_t row_pitch = 0; + if (status != CL_COMPLETE) + return err; + assert(memobj->mapped_ptr_sz >= memobj->map_ref); INVALID_VALUE_IF(!mapped_ptr); for (i = 0; i < memobj->mapped_ptr_sz; i++) { @@ -348,7 +391,7 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) memobj->mapped_ptr[i].ptr = NULL; mapped_size = memobj->mapped_ptr[i].size; v_ptr = memobj->mapped_ptr[i].v_ptr; - for(j=0; j<3; j++) { + for (j = 0; j < 3; j++) { region[j] = memobj->mapped_ptr[i].region[j]; origin[j] = memobj->mapped_ptr[i].origin[j]; memobj->mapped_ptr[i].region[j] = 0; @@ -364,10 +407,10 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) INVALID_VALUE_IF(i == memobj->mapped_ptr_sz); if (memobj->flags & CL_MEM_USE_HOST_PTR) { - if(memobj->type == CL_MEM_BUFFER_TYPE || - memobj->type == CL_MEM_SUBBUFFER_TYPE) { + if (memobj->type == CL_MEM_BUFFER_TYPE || + memobj->type == CL_MEM_SUBBUFFER_TYPE) { assert(mapped_ptr >= memobj->host_ptr && - mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size); + mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size); /* Sync the data. */ if (!memobj->is_userptr) memcpy(v_ptr, mapped_ptr, mapped_size); @@ -381,8 +424,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) if (!memobj->is_userptr) //v_ptr have added offset, host_ptr have not added offset. cl_mem_copy_image_region(origin, region, v_ptr, row_pitch, image->slice_pitch, - memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch, - image, CL_FALSE, CL_TRUE); + memobj->host_ptr, image->host_row_pitch, image->host_slice_pitch, + image, CL_FALSE, CL_TRUE); } } else { assert(v_ptr == mapped_ptr); @@ -391,24 +434,24 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) cl_mem_unmap_auto(memobj); /* shrink the mapped slot. 
*/ - if (memobj->mapped_ptr_sz/2 > memobj->map_ref) { + if (memobj->mapped_ptr_sz / 2 > memobj->map_ref) { int j = 0; cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc( - sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz/2)); + sizeof(cl_mapped_ptr) * (memobj->mapped_ptr_sz / 2)); if (!new_ptr) { /* Just do nothing. */ goto error; } - memset(new_ptr, 0, (memobj->mapped_ptr_sz/2) * sizeof(cl_mapped_ptr)); + memset(new_ptr, 0, (memobj->mapped_ptr_sz / 2) * sizeof(cl_mapped_ptr)); for (i = 0; i < memobj->mapped_ptr_sz; i++) { if (memobj->mapped_ptr[i].ptr) { new_ptr[j] = memobj->mapped_ptr[i]; j++; - assert(j < memobj->mapped_ptr_sz/2); + assert(j < memobj->mapped_ptr_sz / 2); } } - memobj->mapped_ptr_sz = memobj->mapped_ptr_sz/2; + memobj->mapped_ptr_sz = memobj->mapped_ptr_sz / 2; free(memobj->mapped_ptr); memobj->mapped_ptr = new_ptr; } @@ -417,7 +460,8 @@ error: return err; } -cl_int cl_enqueue_native_kernel(enqueue_data *data) +static cl_int +cl_enqueue_native_kernel(enqueue_data *data, cl_int status) { cl_int err = CL_SUCCESS; cl_uint num_mem_objects = (cl_uint)data->offset; @@ -425,18 +469,19 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data) const void **args_mem_loc = (const void **)data->const_ptr; cl_uint i; - for (i=0; i<num_mem_objects; ++i) - { - const cl_mem buffer = mem_list[i]; - CHECK_MEM(buffer); + if (status != CL_COMPLETE) + return err; + + for (i = 0; i < num_mem_objects; ++i) { + const cl_mem buffer = mem_list[i]; + CHECK_MEM(buffer); - *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0); + *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0); } data->user_func(data->ptr); - for (i=0; i<num_mem_objects; ++i) - { - cl_mem_unmap_auto(mem_list[i]); + for (i = 0; i < num_mem_objects; ++i) { + cl_mem_unmap_auto(mem_list[i]); } free(data->ptr); @@ -444,46 +489,115 @@ error: return err; } -cl_int cl_enqueue_handle(cl_event event, enqueue_data* data) +static cl_int +cl_enqueue_ndrange(enqueue_data *data, cl_int status) { - /* if need profiling, add the submit timestamp here. */ - if (event && event->type != CL_COMMAND_USER - && event->queue->props & CL_QUEUE_PROFILING_ENABLE) { - cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT); + cl_int err = CL_SUCCESS; + + if (status == CL_SUBMITTED) { + err = cl_command_queue_flush_gpgpu(data->gpgpu); + } else if (status == CL_COMPLETE) { + void *batch_buf = cl_gpgpu_ref_batch_buf(data->gpgpu); + cl_gpgpu_sync(batch_buf); + cl_gpgpu_unref_batch_buf(batch_buf); + /* Finished, we can release the gpgpu now. 
*/ + cl_gpgpu_delete(data->gpgpu); + data->gpgpu = NULL; } - switch(data->type) { - case EnqueueReadBuffer: - return cl_enqueue_read_buffer(data); - case EnqueueReadBufferRect: - return cl_enqueue_read_buffer_rect(data); - case EnqueueWriteBuffer: - return cl_enqueue_write_buffer(data); - case EnqueueWriteBufferRect: - return cl_enqueue_write_buffer_rect(data); - case EnqueueReadImage: - return cl_enqueue_read_image(data); - case EnqueueWriteImage: - return cl_enqueue_write_image(data); - case EnqueueMapBuffer: - return cl_enqueue_map_buffer(data); - case EnqueueMapImage: - return cl_enqueue_map_image(data); - case EnqueueUnmapMemObject: - return cl_enqueue_unmap_mem_object(data); - case EnqueueCopyBufferRect: - case EnqueueCopyBuffer: - case EnqueueCopyImage: - case EnqueueCopyBufferToImage: - case EnqueueCopyImageToBuffer: - case EnqueueNDRangeKernel: - case EnqueueFillBuffer: - case EnqueueFillImage: - return cl_event_flush(event); - case EnqueueNativeKernel: - return cl_enqueue_native_kernel(data); - case EnqueueMigrateMemObj: - default: - return CL_SUCCESS; + return err; +} + +static cl_int +cl_enqueue_marker_or_barrier(enqueue_data *data, cl_int status) +{ + return CL_COMPLETE; +} + +LOCAL void +cl_enqueue_delete(enqueue_data *data) +{ + if (data == NULL) + return; + + if (data->type == EnqueueCopyBufferRect || + data->type == EnqueueCopyBuffer || + data->type == EnqueueCopyImage || + data->type == EnqueueCopyBufferToImage || + data->type == EnqueueCopyImageToBuffer || + data->type == EnqueueNDRangeKernel || + data->type == EnqueueFillBuffer || + data->type == EnqueueFillImage) { + if (data->gpgpu) { + cl_gpgpu_delete(data->gpgpu); + data->gpgpu = NULL; + } + return; + } + + if (data->type == EnqueueNativeKernel) { + if (data->mem_list) { + cl_free((void*)data->mem_list); + data->mem_list = NULL; + } + if (data->ptr) { + cl_free((void*)data->ptr); + data->ptr = NULL; + } + if (data->const_ptr) { + cl_free((void*)data->const_ptr); + data->const_ptr = NULL; + } + } +} + +LOCAL cl_int +cl_enqueue_handle(enqueue_data *data, cl_int status) +{ + /* if need profiling, add the submit timestamp here. 
*/ + // if (event && event->event_type != CL_COMMAND_USER && + // event->queue->props & CL_QUEUE_PROFILING_ENABLE) { + // cl_event_get_timestamp(event, CL_PROFILING_COMMAND_SUBMIT); + // } + + switch (data->type) { + case EnqueueReturnSuccesss: + return CL_SUCCESS; + case EnqueueReadBuffer: + return cl_enqueue_read_buffer(data, status); + case EnqueueReadBufferRect: + return cl_enqueue_read_buffer_rect(data, status); + case EnqueueWriteBuffer: + return cl_enqueue_write_buffer(data, status); + case EnqueueWriteBufferRect: + return cl_enqueue_write_buffer_rect(data, status); + case EnqueueReadImage: + return cl_enqueue_read_image(data, status); + case EnqueueWriteImage: + return cl_enqueue_write_image(data, status); + case EnqueueMapBuffer: + return cl_enqueue_map_buffer(data, status); + case EnqueueMapImage: + return cl_enqueue_map_image(data, status); + case EnqueueUnmapMemObject: + return cl_enqueue_unmap_mem_object(data, status); + case EnqueueMarker: + case EnqueueBarrier: + return cl_enqueue_marker_or_barrier(data, status); + case EnqueueCopyBufferRect: + case EnqueueCopyBuffer: + case EnqueueCopyImage: + case EnqueueCopyBufferToImage: + case EnqueueCopyImageToBuffer: + case EnqueueNDRangeKernel: + case EnqueueFillBuffer: + case EnqueueFillImage: + //return cl_event_flush(event); + return cl_enqueue_ndrange(data, status); + case EnqueueNativeKernel: + return cl_enqueue_native_kernel(data, status); + case EnqueueMigrateMemObj: + default: + return CL_SUCCESS; } } diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h index 09305aff..f8fff9d3 100644 --- a/src/cl_enqueue.h +++ b/src/cl_enqueue.h @@ -24,7 +24,8 @@ #include "CL/cl.h" typedef enum { - EnqueueReadBuffer = 0, + EnqueueReturnSuccesss = 0, /* For some case, we have nothing to do, just return SUCCESS. 
*/ + EnqueueReadBuffer, EnqueueReadBufferRect, EnqueueWriteBuffer, EnqueueWriteBufferRect, @@ -49,26 +50,29 @@ typedef enum { } enqueue_type; typedef struct _enqueue_data { - enqueue_type type; /* Command type */ - cl_mem mem_obj; /* Enqueue's cl_mem */ - cl_command_queue queue; /* Command queue */ - size_t offset; /* Mem object's offset */ - size_t size; /* Size */ - size_t origin[3]; /* Origin */ - size_t host_origin[3]; /* Origin */ - size_t region[3]; /* Region */ - size_t row_pitch; /* Row pitch */ - size_t slice_pitch; /* Slice pitch */ - size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */ - size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */ - const void * const_ptr; /* Const ptr for memory read */ - void * ptr; /* Ptr for write and return value */ - const cl_mem* mem_list; /* mem_list of clEnqueueNativeKernel */ - uint8_t unsync_map; /* Indicate the clEnqueueMapBuffer/Image is unsync map */ - uint8_t write_map; /* Indicate if the clEnqueueMapBuffer is write enable */ - void (*user_func)(void *); /* pointer to a host-callable user function */ + enqueue_type type; /* Command type */ + cl_mem mem_obj; /* Enqueue's cl_mem */ + cl_command_queue queue; /* Command queue */ + size_t offset; /* Mem object's offset */ + size_t size; /* Size */ + size_t origin[3]; /* Origin */ + size_t host_origin[3]; /* Origin */ + size_t region[3]; /* Region */ + size_t row_pitch; /* Row pitch */ + size_t slice_pitch; /* Slice pitch */ + size_t host_row_pitch; /* Host row pitch, used in read/write buffer rect */ + size_t host_slice_pitch; /* Host slice pitch, used in read/write buffer rect */ + const void *const_ptr; /* Const ptr for memory read */ + void *ptr; /* Ptr for write and return value */ + const cl_mem *mem_list; /* mem_list of clEnqueueNativeKernel */ + uint8_t unsync_map; /* Indicate the clEnqueueMapBuffer/Image is unsync map */ + uint8_t write_map; /* Indicate if the clEnqueueMapBuffer is write enable */ + void (*user_func)(void *); /* pointer to a host-callable user function */ + cl_gpgpu gpgpu; } enqueue_data; /* Do real enqueue commands */ -cl_int cl_enqueue_handle(cl_event event, enqueue_data* data); +extern cl_int cl_enqueue_handle(enqueue_data *data, cl_int status); +extern void cl_enqueue_delete(enqueue_data *data); + #endif /* __CL_ENQUEUE_H__ */ diff --git a/src/cl_event.c b/src/cl_event.c index 6c7c2e0c..4acd619b 100644 --- a/src/cl_event.c +++ b/src/cl_event.c @@ -14,750 +14,615 @@ * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see <http://www.gnu.org/licenses/>. 
 *
- * Author: Rong Yang <rong.r.yang@intel.com>
 */
 #include "cl_event.h"
 #include "cl_context.h"
-#include "cl_utils.h"
-#include "cl_alloc.h"
-#include "cl_khr_icd.h"
-#include "cl_kernel.h"
 #include "cl_command_queue.h"
-
-#include <assert.h>
+#include "cl_alloc.h"
+#include <string.h>
 #include <stdio.h>
-void cl_event_update_last_events(cl_command_queue queue, int wait)
+LOCAL cl_int
+cl_event_get_timestamp(cl_event event, cl_profiling_info param_name)
 {
-  cl_event last_event = get_last_event(queue);
-  if(!last_event) return;
-  cl_event next, now;
-  now = last_event;
-  while(now){
-    next = now->last_next;//get next first in case set status maintain it
-    cl_event_update_status(now,wait);//update event status
-    now = next;
-  }
+  // TODO:
+  return CL_INVALID_VALUE;
 }
-void cl_event_insert_last_events(cl_command_queue queue,cl_event event)
+LOCAL cl_ulong
+cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp)
 {
-  if(!event) return;
-  cl_event last_event = get_last_event(queue);
-  if(last_event){
-    cl_event now = last_event;
-    while(now->last_next)
-      now = now->last_next;
-    now->last_next = event;
-    event->last_prev = now;
+  cl_ulong ret_val;
+
+  if (end_timestamp > start_timestamp) {
+    ret_val = end_timestamp - start_timestamp;
+  } else {
+    /*if the start timestamp is greater than the end timestamp then set the return value to max*/
+    ret_val = ((cl_ulong)1 << 32);
   }
-  else set_last_event(queue,event);
+
+  return ret_val;
 }
-static inline cl_bool
-cl_event_is_gpu_command_type(cl_command_type type)
+LOCAL cl_ulong
+cl_event_get_start_timestamp(cl_event event)
 {
-  switch(type) {
-    case CL_COMMAND_COPY_BUFFER:
-    case CL_COMMAND_FILL_BUFFER:
-    case CL_COMMAND_COPY_IMAGE:
-    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
-    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
-    case CL_COMMAND_COPY_BUFFER_RECT:
-    case CL_COMMAND_TASK:
-    case CL_COMMAND_NDRANGE_KERNEL:
-      return CL_TRUE;
-    default:
-      return CL_FALSE;
-  }
+  cl_ulong ret_val;
+
+  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[2]);
+
+  return ret_val;
 }
-int cl_event_flush(cl_event event)
+LOCAL cl_ulong
+cl_event_get_end_timestamp(cl_event event)
 {
-  int err = CL_SUCCESS;
-  if(!event) {
-    err = CL_INVALID_VALUE;
-    return err;
-  }
+  cl_ulong ret_val;
-  assert(event->gpgpu_event != NULL);
-  if (event->gpgpu) {
-    err = cl_command_queue_flush_gpgpu(event->queue, event->gpgpu);
-    cl_gpgpu_delete(event->gpgpu);
-    event->gpgpu = NULL;
-  }
-  cl_gpgpu_event_flush(event->gpgpu_event);
-  cl_event_insert_last_events(event->queue,event);
-  return err;
+  ret_val = cl_event_get_timestamp_delta(event->timestamp[0], event->timestamp[3]);
+
+  return ret_val;
 }
-cl_event cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type, cl_bool emplict)
+LOCAL void
+cl_event_add_ref(cl_event event)
 {
-  cl_event event = NULL;
-  GET_QUEUE_THREAD_GPGPU(queue);
+  assert(event);
+  CL_OBJECT_INC_REF(event);
+}
-  /* Allocate and initialize the structure itself */
-  TRY_ALLOC_NO_ERR (event, CALLOC(struct _cl_event));
-  CL_OBJECT_INIT_BASE(event, CL_OBJECT_EVENT_MAGIC);
+LOCAL cl_int
+cl_event_get_status(cl_event event)
+{
+  cl_int ret;
+
+  assert(event);
+  CL_OBJECT_LOCK(event);
+  ret = event->status;
+  CL_OBJECT_UNLOCK(event);
+  return ret;
+}
+
+static cl_event
+cl_event_new(cl_context ctx, cl_command_queue queue, cl_command_type type,
+             cl_uint num_events, cl_event *event_list)
+{
+  cl_event e = cl_calloc(1, sizeof(_cl_event));
+  if (e == NULL)
+    return NULL;
+
+  CL_OBJECT_INIT_BASE(e, CL_OBJECT_EVENT_MAGIC);
  /* Append the event in the
context event list */ - cl_context_add_event(ctx, event); - - /* Initialize all members and create GPGPU event object */ - event->queue = queue; - event->type = type; - event->gpgpu_event = NULL; - if(type == CL_COMMAND_USER) { - event->status = CL_SUBMITTED; + cl_context_add_event(ctx, e); + e->queue = queue; + + list_init(&e->callbacks); + list_init(&e->enqueue_node); + + assert(type >= CL_COMMAND_NDRANGE_KERNEL && type <= CL_COMMAND_FILL_IMAGE); + e->event_type = type; + if (type == CL_COMMAND_USER) { + e->status = CL_SUBMITTED; + } else { + e->status = CL_QUEUED; } - else { - event->status = CL_QUEUED; - if(cl_event_is_gpu_command_type(event->type)) - event->gpgpu_event = cl_gpgpu_event_new(gpgpu); + + if (type == CL_COMMAND_USER) { + assert(queue == NULL); } - cl_event_add_ref(event); //dec when complete - event->user_cb = NULL; - event->enqueue_cb = NULL; - event->waits_head = NULL; - event->emplict = emplict; - -exit: - return event; -error: - cl_event_delete(event); - event = NULL; - goto exit; + + e->depend_events = event_list; + e->depend_event_num = num_events; + return e; } -void cl_event_delete(cl_event event) +LOCAL void +cl_event_delete(cl_event event) { + int i; + cl_event_user_callback cb; + if (UNLIKELY(event == NULL)) return; - cl_event_update_status(event, 0); - if (CL_OBJECT_DEC_REF(event) > 1) return; - /* Call all user's callback if haven't execute */ - cl_event_call_callback(event, CL_COMPLETE, CL_TRUE); // CL_COMPLETE status will force all callbacks that are not executed to run + cl_enqueue_delete(&event->exec_data); - /* delete gpgpu event object */ - if(event->gpgpu_event) - cl_gpgpu_event_delete(event->gpgpu_event); + assert(list_empty(&event->enqueue_node)); - /* Remove it from the list */ - cl_context_remove_event(event->ctx, event); + if (event->depend_events) { + assert(event->depend_event_num); + for (i = 0; i < event->depend_event_num; i++) { + cl_event_delete(event->depend_events[i]); + } + cl_free(event->depend_events); + } - if (event->gpgpu) { - fprintf(stderr, "Warning: a event is deleted with a pending enqueued task.\n"); - cl_gpgpu_delete(event->gpgpu); - event->gpgpu = NULL; + /* Free all the callbacks. Last ref, no need to lock. */ + while (!list_empty(&event->callbacks)) { + cb = list_entry(event->callbacks.next, _cl_event_user_callback, node); + list_del(&cb->node); + cl_free(cb); } + /* Remove it from the list */ + assert(event->ctx); + cl_context_remove_event(event->ctx, event); + CL_OBJECT_DESTROY_BASE(event); cl_free(event); } -void cl_event_add_ref(cl_event event) +LOCAL cl_event +cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events, + const cl_event *event_list, cl_command_type type, cl_int *errcode_ret) { - assert(event); - CL_OBJECT_INC_REF(event); -} + cl_event e = NULL; + cl_event *depend_events = NULL; + cl_int err = CL_SUCCESS; + cl_uint total_events = 0; + int i; -cl_int cl_event_set_callback(cl_event event , - cl_int command_exec_callback_type, - EVENT_NOTIFY pfn_notify, - void* user_data) -{ - assert(event); - assert(pfn_notify); + assert(ctx); - cl_int err = CL_SUCCESS; - user_callback *cb; - TRY_ALLOC(cb, CALLOC(user_callback)); - - cb->pfn_notify = pfn_notify; - cb->user_data = user_data; - cb->status = command_exec_callback_type; - cb->executed = CL_FALSE; - - - // It is possible that the event enqueued is already completed. - // clEnqueueReadBuffer can be synchronous and when the callback - // is registered after, it still needs to get executed. 
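// Editor's note: the situation described above is the standard clSetEventCallback() contract;
// a host-side usage sketch for context (not part of this patch), where the callback must fire
// even if the event completed before registration:
//
//   void CL_CALLBACK on_complete(cl_event e, cl_int status, void *user_data)
//   {
//     /* ...react to completion... */
//   }
//   ...
//   clSetEventCallback(evt, CL_COMPLETE, on_complete, NULL);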
- CL_OBJECT_LOCK(event); // Thread safety required: operations on the event->status can be made from many different threads - if(event->status <= command_exec_callback_type) { - /* Call user callback */ - CL_OBJECT_UNLOCK(event); // pfn_notify can call clFunctions that use the event_lock and from here it's not required - cb->pfn_notify(event, event->status, cb->user_data); - cl_free(cb); - } else { - // Enqueue to callback list - cb->next = event->user_cb; - event->user_cb = cb; - CL_OBJECT_UNLOCK(event); - } + do { + if (event_list) + assert(num_events); -exit: - return err; -error: - err = CL_OUT_OF_HOST_MEMORY; - cl_free(cb); - goto exit; -}; - -cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event *event,cl_context ctx) -{ - cl_int err = CL_SUCCESS; - cl_int i; - /* check the event_wait_list and num_events_in_wait_list */ - if((event_wait_list == NULL) && - (num_events_in_wait_list > 0)) - goto error; - - if ((event_wait_list != NULL) && - (num_events_in_wait_list == 0)){ - goto error; - } + if (queue == NULL) { + assert(type == CL_COMMAND_USER); + assert(event_list == NULL); + assert(num_events == 0); - /* check the event and context */ - for(i=0; i<num_events_in_wait_list; i++) { - CHECK_EVENT(event_wait_list[i]); - if(event_wait_list[i]->status < CL_COMPLETE) { - err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; - goto exit; - } - if(event && event == &event_wait_list[i]) - goto error; - if(event_wait_list[i]->ctx != ctx) { - err = CL_INVALID_CONTEXT; - goto exit; - } - } + e = cl_event_new(ctx, queue, type, 0, NULL); + if (e == NULL) { + err = CL_OUT_OF_HOST_MEMORY; + break; + } + } else { + CL_OBJECT_LOCK(queue); + total_events = queue->barrier_events_num + num_events; + + if (total_events) { + depend_events = cl_calloc(total_events, sizeof(cl_event)); + if (depend_events == NULL) { + CL_OBJECT_UNLOCK(queue); + err = CL_OUT_OF_HOST_MEMORY; + break; + } + } -exit: - return err; -error: - err = CL_INVALID_EVENT_WAIT_LIST; //reset error - goto exit; -} + /* Add all the barrier events as depend events. 
*/ + for (i = 0; i < queue->barrier_events_num; i++) { + assert(CL_EVENT_IS_BARRIER(queue->barrier_events[i])); + cl_event_add_ref(queue->barrier_events[i]); + depend_events[num_events + i] = queue->barrier_events[i]; + } -cl_int cl_event_wait_events(cl_uint num_events_in_wait_list, const cl_event *event_wait_list, - cl_command_queue queue) -{ - cl_int i; + CL_OBJECT_UNLOCK(queue); - /* Check whether wait user events */ - for(i=0; i<num_events_in_wait_list; i++) { - if(event_wait_list[i]->status <= CL_COMPLETE) - continue; + for (i = 0; i < num_events; i++) { + assert(event_list[i]); + assert(event_list[i]->ctx == ctx); + assert(CL_OBJECT_IS_EVENT(event_list[i])); + cl_event_add_ref(event_list[i]); + depend_events[i] = event_list[i]; + } - /* Need wait on user event, return and do enqueue defer */ - if((event_wait_list[i]->type == CL_COMMAND_USER) || - (event_wait_list[i]->enqueue_cb && - (event_wait_list[i]->enqueue_cb->wait_user_events != NULL))){ - return CL_ENQUEUE_EXECUTE_DEFER; - } - } + if (depend_events) + assert(total_events); - if(queue && queue->barrier_events_num ) - return CL_ENQUEUE_EXECUTE_DEFER; + e = cl_event_new(ctx, queue, type, total_events, depend_events); + if (e == NULL) { + err = CL_OUT_OF_HOST_MEMORY; + break; + } + depend_events = NULL; + } + } while (0); - /* Non user events or all user event finished, wait all enqueue events finish */ - for(i=0; i<num_events_in_wait_list; i++) { - if(event_wait_list[i]->status <= CL_COMPLETE) - continue; + if (err != CL_SUCCESS) { + if (depend_events) { + for (i = 0; i < total_events; i++) { + cl_event_delete(depend_events[i]); + } + cl_free(depend_events); + } - //enqueue callback haven't finish, in another thread, wait - if(event_wait_list[i]->enqueue_cb != NULL) - return CL_ENQUEUE_EXECUTE_DEFER; - if(event_wait_list[i]->gpgpu_event) - cl_gpgpu_event_update_status(event_wait_list[i]->gpgpu_event, 1); - cl_event_set_status(event_wait_list[i], CL_COMPLETE); //Execute user's callback + // if set depend_events, must succeed. + assert(e->depend_events == NULL); + cl_event_delete(e); } - return CL_ENQUEUE_EXECUTE_IMM; + + if (errcode_ret) + *errcode_ret = err; + + return e; } -void cl_event_new_enqueue_callback(cl_event event, - enqueue_data *data, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list) +LOCAL cl_int +cl_event_set_callback(cl_event event, cl_int exec_type, cl_event_notify_cb pfn_notify, void *user_data) { - enqueue_callback *cb, *node; - user_event *user_events, *u_ev; - cl_command_queue queue = event ? 
event->queue : NULL; - cl_int i; cl_int err = CL_SUCCESS; + cl_event_user_callback cb; + cl_bool exec_imm = CL_FALSE; - /* Allocate and initialize the structure itself */ - TRY_ALLOC_NO_ERR (cb, CALLOC(enqueue_callback)); - cb->num_events = 0; - TRY_ALLOC_NO_ERR (cb->wait_list, CALLOC_ARRAY(cl_event, num_events_in_wait_list)); - for(i=0; i<num_events_in_wait_list; i++) { - //user event will insert to cb->wait_user_events, need not in wait list, avoid ref twice - if(event_wait_list[i]->type != CL_COMMAND_USER) { - cb->wait_list[cb->num_events++] = event_wait_list[i]; - cl_event_add_ref(event_wait_list[i]); //add defer enqueue's wait event reference - } - } - cb->event = event; - cb->next = NULL; - cb->wait_user_events = NULL; - - if(queue && queue->barrier_events_num > 0) { - for(i=0; i<queue->barrier_events_num; i++) { - /* Insert the enqueue_callback to user event list */ - node = queue->wait_events[i]->waits_head; - if(node == NULL) - queue->wait_events[i]->waits_head = cb; - else{ - while((node != cb) && node->next) - node = node->next; - if(node == cb) //wait on dup user event - continue; - node->next = cb; - } + assert(event); + assert(pfn_notify); - /* Insert the user event to enqueue_callback's wait_user_events */ - TRY(cl_event_insert_user_event, &cb->wait_user_events, queue->wait_events[i]); - cl_event_add_ref(queue->wait_events[i]); + do { + cb = cl_calloc(1, sizeof(_cl_event_user_callback)); + if (cb == NULL) { + err = CL_OUT_OF_HOST_MEMORY; + break; } - } - /* Find out all user events that in event_wait_list wait */ - for(i=0; i<num_events_in_wait_list; i++) { - if(event_wait_list[i]->status <= CL_COMPLETE) - continue; - - if(event_wait_list[i]->type == CL_COMMAND_USER) { - /* Insert the enqueue_callback to user event list */ - node = event_wait_list[i]->waits_head; - if(node == NULL) - event_wait_list[i]->waits_head = cb; - else { - while((node != cb) && node->next) - node = node->next; - if(node == cb) //wait on dup user event - continue; - node->next = cb; - } - /* Insert the user event to enqueue_callback's wait_user_events */ - TRY(cl_event_insert_user_event, &cb->wait_user_events, event_wait_list[i]); - cl_event_add_ref(event_wait_list[i]); - if(queue) - cl_command_queue_insert_event(queue, event_wait_list[i]); - if(queue && data->type == EnqueueBarrier){ - cl_command_queue_insert_barrier_event(queue, event_wait_list[i]); - } - } else if(event_wait_list[i]->enqueue_cb != NULL) { - user_events = event_wait_list[i]->enqueue_cb->wait_user_events; - while(user_events != NULL) { - /* Insert the enqueue_callback to user event's waits_tail */ - node = user_events->event->waits_head; - if(node == NULL) - event_wait_list[i]->waits_head = cb; - else{ - while((node != cb) && node->next) - node = node->next; - if(node == cb) { //wait on dup user event - user_events = user_events->next; - continue; - } - node->next = cb; - } - - /* Insert the user event to enqueue_callback's wait_user_events */ - TRY(cl_event_insert_user_event, &cb->wait_user_events, user_events->event); - cl_event_add_ref(user_events->event); - if(queue) - cl_command_queue_insert_event(event->queue, user_events->event); - if(queue && data->type == EnqueueBarrier){ - cl_command_queue_insert_barrier_event(event->queue, user_events->event); - } - user_events = user_events->next; - } - } - } - if(event != NULL && event->queue != NULL && event->gpgpu_event != NULL) { - event->gpgpu = cl_thread_gpgpu_take(event->queue); - data->ptr = (void *)event->gpgpu_event; - } - cb->data = *data; - if(event) - event->enqueue_cb = cb; 
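// Editor's note on the machinery being deleted here: the old deferred-enqueue bookkeeping
// tied three structures together (a sketch of the relationships, not compilable code):
//
//   user event U --waits_head--> enqueue_callback cb --next--> more callbacks
//   cb->wait_user_events            : user events this deferred command still waits on
//   cb->wait_list[0..num_events-1]  : non-user events, reference-held until dispatch
//
// When U completes, cl_event_set_status() walks U->waits_head and dispatches each cb whose
// wait_user_events list has drained; the new model replaces all of this with a per-event
// depend_events array plus queue-level notification.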
- -exit: - return; -error: - if(cb) { - while(cb->wait_user_events) { - u_ev = cb->wait_user_events; - cb->wait_user_events = cb->wait_user_events->next; - cl_event_delete(u_ev->event); - cl_free(u_ev); + list_init(&cb->node); + cb->pfn_notify = pfn_notify; + cb->user_data = user_data; + cb->status = exec_type; + cb->executed = CL_FALSE; + + CL_OBJECT_LOCK(event); + if (event->status > exec_type) { + list_add_tail(&cb->node, &event->callbacks); + cb = NULL; + } else { + /* The state has already OK, call it immediately. */ + exec_imm = CL_TRUE; } - for(i=0; i<cb->num_events; i++) { - if(cb->wait_list[i]) { - cl_event_delete(cb->wait_list[i]); - } - } - cl_free(cb); - } - goto exit; -} + CL_OBJECT_UNLOCK(event); -void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb) { - user_callback *user_cb = NULL; - user_callback *queue_cb = NULL; // For thread safety, we create a queue that holds user_callback's pfn_notify contents - user_callback *temp_cb = NULL; - user_cb = event->user_cb; - CL_OBJECT_LOCK(event); - while(user_cb) { - if(user_cb->status >= status - && user_cb->executed == CL_FALSE) { // Added check to not execute a callback when it was already handled - user_cb->executed = CL_TRUE; - temp_cb = cl_malloc(sizeof(user_callback)); - if(!temp_cb) { - break; // Out of memory - } - temp_cb->pfn_notify = user_cb->pfn_notify; // Minor struct copy to call ppfn_notify out of the pthread_mutex - temp_cb->user_data = user_cb->user_data; - if(free_cb) { - cl_free(user_cb); - } - if(!queue_cb) { - queue_cb = temp_cb; - queue_cb->next = NULL; - } else { // Enqueue First - temp_cb->next = queue_cb; - queue_cb = temp_cb; - } + if (exec_imm) { + cb->pfn_notify(event, event->status, cb->user_data); } - user_cb = user_cb->next; - } - CL_OBJECT_UNLOCK(event); - // Calling the callbacks outside of the event_lock is required because the callback can call cl_api functions and get deadlocked - while(queue_cb) { // For each callback queued, actually execute the callback - queue_cb->pfn_notify(event, event->status, queue_cb->user_data); - temp_cb = queue_cb; - queue_cb = queue_cb->next; - cl_free(temp_cb); - } + } while (0); + + if (cb) + cl_free(cb); + + return err; } -void cl_event_set_status(cl_event event, cl_int status) +LOCAL cl_int +cl_event_set_status(cl_event event, cl_int status) { - cl_int ret, i; - cl_event evt; + list_head tmp_callbacks; + list_head *n; + list_head *pos; + cl_bool notify_queue = CL_FALSE; + cl_event_user_callback cb; + + assert(event); CL_OBJECT_LOCK(event); - if(status >= event->status) { + if (event->status <= CL_COMPLETE) { // Already set to error or completed CL_OBJECT_UNLOCK(event); - return; + return CL_INVALID_OPERATION; } - if(event->status <= CL_COMPLETE) { - event->status = status; //have done enqueue before or doing in another thread - CL_OBJECT_UNLOCK(event); - return; + + if (CL_EVENT_IS_USER(event)) { + assert(event->status != CL_RUNNING && event->status != CL_QUEUED); + } else { + assert(event->queue); // Must belong to some queue. } - if(status <= CL_COMPLETE) { - if(event->enqueue_cb) { - if(status == CL_COMPLETE) { - cl_enqueue_handle(event, &event->enqueue_cb->data); - if(event->gpgpu_event) - cl_gpgpu_event_update_status(event->gpgpu_event, 1); //now set complet, need refine - } else { - if(event->gpgpu_event) { - // Error then cancel the enqueued event. - cl_gpgpu_delete(event->gpgpu); - event->gpgpu = NULL; - } - } + if (status >= event->status) { // Should never go back. 
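// Editor's note: this guard relies on OpenCL's encoding of execution status, where a smaller
// value means further along (CL_QUEUED=0x3, CL_SUBMITTED=0x2, CL_RUNNING=0x1, CL_COMPLETE=0x0,
// and negative values are error codes), so every legal transition must strictly decrease it.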
+ CL_OBJECT_UNLOCK(event); + return CL_INVALID_OPERATION; + } - event->status = status; //Change the event status after enqueue and befor unlock + event->status = status; + /* Call all the callbacks. */ + if (!list_empty(&event->callbacks)) { + do { + status = event->status; + list_init(&tmp_callbacks); + list_replace(&event->callbacks, &tmp_callbacks); + list_init(&event->callbacks); + /* Call all the callbacks without lock. */ CL_OBJECT_UNLOCK(event); - for(i=0; i<event->enqueue_cb->num_events; i++) - cl_event_delete(event->enqueue_cb->wait_list[i]); + + list_for_each_safe(pos, n, &tmp_callbacks) + { + cb = list_entry(pos, _cl_event_user_callback, node); + + assert(cb->executed == CL_FALSE); + + if (cb->status < status) + continue; + + list_del(&cb->node); + cb->executed = CL_TRUE; + cb->pfn_notify(event, status, cb->user_data); + cl_free(cb); + } + CL_OBJECT_LOCK(event); - if(event->enqueue_cb->wait_list) - cl_free(event->enqueue_cb->wait_list); - cl_free(event->enqueue_cb); - event->enqueue_cb = NULL; - } + // Set back the uncalled callbacks. + list_splice_tail(&tmp_callbacks, &event->callbacks); + + /* The status may have changed while unlocked; check again. */ + } while (status != event->status); } - if(event->status >= status) //maybe changed in other threads - event->status = status; + + /* Wake up all waiters on the status change. */ + CL_OBJECT_NOTIFY_COND(event); + + if (event->status <= CL_COMPLETE) { + notify_queue = CL_TRUE; + } + CL_OBJECT_UNLOCK(event); - /* Call user callback */ - cl_event_call_callback(event, status, CL_FALSE); + /* Need to notify all the command queues within the same context. */ + if (notify_queue) { + cl_command_queue *q_list = NULL; + cl_uint queue_num = 0; + int i = 0; + int cookie = 0; + + /* First, remove it from the queue's barrier list. */ + if (CL_EVENT_IS_BARRIER(event)) { + assert(event->queue); + cl_command_queue_remove_barrier_event(event->queue, event); + } - if(event->type == CL_COMMAND_USER) { - /* Check all defer enqueue */ - enqueue_callback *cb, *enqueue_cb = event->waits_head; - while(enqueue_cb) { - /* Remove this user event in enqueue_cb, update the header if needed. */ - cl_event_remove_user_event(&enqueue_cb->wait_user_events, event); - cl_event_delete(event); + /* Then, notify all the queues within the same context. */ + CL_OBJECT_LOCK(event->ctx); + do { + queue_num = event->ctx->queue_num; + cookie = event->ctx->queue_cookie; + + if (queue_num > 0) { + q_list = cl_calloc(queue_num, sizeof(cl_command_queue)); + assert(q_list); + i = 0; + list_for_each(pos, &event->ctx->queues) + { + q_list[i] = (cl_command_queue)(list_entry(pos, _cl_base_object, node)); + assert(i < queue_num); + i++; + } - /* Still wait on other user events */ - if(enqueue_cb->wait_user_events != NULL) { - enqueue_cb = enqueue_cb->next; - continue; - } + + CL_OBJECT_UNLOCK(event->ctx); // Update status without context lock. 
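The callback loop in cl_event_set_status is the usual detach-and-drain shape: snapshot the list under the lock, run user callbacks unlocked (they may re-enter cl_api functions), then re-check because a callback can change the status again; CL_OBJECT_NOTIFY_COND then wakes any blocked waiters. A condensed stand-alone version, assuming a plain singly linked list instead of the driver's list_head API:

#include <pthread.h>
#include <stdlib.h>

typedef void (*notify_fn)(void *user_data);
struct cb_node { struct cb_node *next; notify_fn fn; void *user_data; };
struct evt { pthread_mutex_t lock; pthread_cond_t cond; int status; struct cb_node *cb_list; };

void drain_callbacks(struct evt *e)
{
  int status;

  pthread_mutex_lock(&e->lock);
  do {
    status = e->status;
    struct cb_node *batch = e->cb_list;  /* detach the whole list */
    e->cb_list = NULL;
    pthread_mutex_unlock(&e->lock);      /* user code runs unlocked */

    while (batch != NULL) {
      struct cb_node *cb = batch;
      batch = batch->next;
      cb->fn(cb->user_data);
      free(cb);
    }

    pthread_mutex_lock(&e->lock);
    /* A callback may have moved the status or queued new callbacks. */
  } while (status != e->status || e->cb_list != NULL);

  pthread_cond_broadcast(&e->cond);      /* the CL_OBJECT_NOTIFY_COND analogue */
  pthread_mutex_unlock(&e->lock);
}
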
- //remove user event frome enqueue_cb's ctx - cl_command_queue_remove_event(enqueue_cb->event->queue, event); - cl_command_queue_remove_barrier_event(enqueue_cb->event->queue, event); - - /* All user events complete, now wait enqueue events */ - ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list, - enqueue_cb->event->queue); - assert(ret != CL_ENQUEUE_EXECUTE_DEFER); - ret = ~ret; - cb = enqueue_cb; - enqueue_cb = enqueue_cb->next; - - /* Call the pending operation */ - evt = cb->event; - /* TODO: if this event wait on several events, one event's - status is error, the others is complete, what's the status - of this event? Can't find the description in OpenCL spec. - Simply update to latest finish wait event.*/ - cl_event_set_status(cb->event, status); - if(evt->emplict == CL_FALSE) { - cl_event_delete(evt); + for (i = 0; i < queue_num; i++) { + cl_command_queue_notify(q_list[i]); + } + + CL_OBJECT_LOCK(event->ctx); // Lock again. + } else { + /* No queue? Just do nothing. */ } - } - event->waits_head = NULL; - } - if(event->status <= CL_COMPLETE){ - /* Maintain the last_list when event completed*/ - if (event->last_prev) - event->last_prev->last_next = event->last_next; - if (event->last_next) - event->last_next->last_prev = event->last_prev; - if(event->queue && get_last_event(event->queue) == event) - set_last_event(event->queue, event->last_next); - event->last_prev = NULL; - event->last_next = NULL; - cl_event_delete(event); + } while (cookie != event->ctx->queue_cookie); // Some queue may be added when we unlock. + CL_OBJECT_UNLOCK(event->ctx); + + if (q_list) + cl_free(q_list); } + + return CL_SUCCESS; } -void cl_event_update_status(cl_event event, int wait) +LOCAL cl_int +cl_event_wait_for_event_ready(const cl_event event) { - if(event->status <= CL_COMPLETE) - return; - if((event->gpgpu_event) && - (cl_gpgpu_event_update_status(event->gpgpu_event, wait) == command_complete)) - cl_event_set_status(event, CL_COMPLETE); + assert(CL_OBJECT_IS_EVENT(event)); + return cl_event_wait_for_events_list(event->depend_event_num, event->depend_events); } -cl_int cl_event_marker_with_wait_list(cl_command_queue queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event* event) +LOCAL cl_int +cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list) { - enqueue_data data = { 0 }; + int i; cl_event e; + cl_int ret = CL_SUCCESS; - e = cl_event_new(queue->ctx, queue, CL_COMMAND_MARKER, CL_TRUE); - if(e == NULL) - return CL_OUT_OF_HOST_MEMORY; - - if(event != NULL ){ - *event = e; - } + for (i = 0; i < num_events; i++) { + e = event_list[i]; + assert(e); + assert(CL_OBJECT_IS_EVENT(e)); -//enqueues a marker command which waits for either a list of events to complete, or if the list is -//empty it waits for all commands previously enqueued in command_queue to complete before it completes. - if(num_events_in_wait_list > 0){ - if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) { - data.type = EnqueueMarker; - cl_event_new_enqueue_callback(event?*event:NULL, &data, num_events_in_wait_list, event_wait_list); - return CL_SUCCESS; + CL_OBJECT_LOCK(e); + while (e->status > CL_COMPLETE) { + CL_OBJECT_WAIT_ON_COND(e); } - } else if(queue->wait_events_num > 0) { - data.type = EnqueueMarker; - cl_event_new_enqueue_callback(event?*event:NULL, &data, queue->wait_events_num, queue->wait_events); - return CL_SUCCESS; + /* Iff some error happened, return the error. 
*/ + if (e->status < CL_COMPLETE) { + ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; + } + CL_OBJECT_UNLOCK(e); } - cl_event_update_last_events(queue,1); - - cl_event_set_status(e, CL_COMPLETE); - return CL_SUCCESS; + return ret; } -cl_int cl_event_barrier_with_wait_list(cl_command_queue queue, - cl_uint num_events_in_wait_list, - const cl_event *event_wait_list, - cl_event* event) +LOCAL cl_int +cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event *event, cl_context ctx) { - enqueue_data data = { 0 }; - cl_event e; - - e = cl_event_new(queue->ctx, queue, CL_COMMAND_BARRIER, CL_TRUE); - if(e == NULL) - return CL_OUT_OF_HOST_MEMORY; + cl_int err = CL_SUCCESS; + cl_int i; - if(event != NULL ){ - *event = e; - } -//enqueues a barrier command which waits for either a list of events to complete, or if the list is -//empty it waits for all commands previously enqueued in command_queue to complete before it completes. - if(num_events_in_wait_list > 0){ - if(cl_event_wait_events(num_events_in_wait_list, event_wait_list, queue) == CL_ENQUEUE_EXECUTE_DEFER) { - data.type = EnqueueBarrier; - cl_event_new_enqueue_callback(e, &data, num_events_in_wait_list, event_wait_list); - return CL_SUCCESS; + do { + /* check the event_wait_list and num_events_in_wait_list */ + if ((event_wait_list == NULL) && (num_events_in_wait_list > 0)) { + err = CL_INVALID_EVENT_WAIT_LIST; + break; } - } else if(queue->wait_events_num > 0) { - data.type = EnqueueBarrier; - cl_event_new_enqueue_callback(e, &data, queue->wait_events_num, queue->wait_events); - return CL_SUCCESS; - } - cl_event_update_last_events(queue,1); + if ((event_wait_list != NULL) && (num_events_in_wait_list == 0)) { + err = CL_INVALID_EVENT_WAIT_LIST; + break; + } - cl_event_set_status(e, CL_COMPLETE); - return CL_SUCCESS; -} + /* check the event and context */ + for (i = 0; i < num_events_in_wait_list; i++) { + if (event_wait_list[i] == NULL || !CL_OBJECT_IS_EVENT(event_wait_list[i])) { + err = CL_INVALID_EVENT; + break; + } -cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time) -{ - struct timespec ts; + if (cl_event_get_status(event_wait_list[i]) < CL_COMPLETE) { + err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST; + break; + } - if(clock_gettime(CLOCK_MONOTONIC_RAW,&ts) != 0){ - printf("CPU Timmer error\n"); - return CL_FALSE; - } - *cpu_time = (1000000000.0) * (cl_ulong) ts.tv_sec + (cl_ulong) ts.tv_nsec; + if (event == event_wait_list + i) { /* A pointer into the wait list itself */ + err = CL_INVALID_EVENT_WAIT_LIST; + break; + } - return CL_SUCCESS; -} + /* check that all events belong to the same context. */ + if (ctx == NULL) { + ctx = event_wait_list[i]->ctx; + } + if (event_wait_list[i]->ctx != ctx) { + err = CL_INVALID_CONTEXT; + break; + } + } -cl_int cl_event_get_queued_cpu_timestamp(cl_event event) -{ - cl_int ret_val; + if (err != CL_SUCCESS) + break; - ret_val = cl_event_get_cpu_timestamp(&event->queued_timestamp); + } while (0); - return ret_val; + return err; }
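cl_event_check_waitlist above centralizes the wait-list validation shared by the clEnqueue* entry points. The same rules, condensed into a stand-alone checker (the accessor callbacks stand in for dereferencing a real cl_event; the real code also verifies the object magic and reference count):

#include <stddef.h>

#define OK 0
#define ERR_INVALID_CONTEXT (-34)    /* CL_INVALID_CONTEXT */
#define ERR_EXEC_STATUS (-14)        /* CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST */
#define ERR_INVALID_WAIT_LIST (-57)  /* CL_INVALID_EVENT_WAIT_LIST */
#define ERR_INVALID_EVENT (-58)      /* CL_INVALID_EVENT */

int check_waitlist(unsigned num, void *const *list, void *const *out_event, void *ctx,
                   int (*status_of)(void *), void *(*ctx_of)(void *))
{
  unsigned i;

  /* List pointer and count must agree: both empty or both present. */
  if ((list == NULL) != (num == 0))
    return ERR_INVALID_WAIT_LIST;

  for (i = 0; i < num; i++) {
    if (list[i] == NULL)             /* not a live event object */
      return ERR_INVALID_EVENT;
    if (status_of(list[i]) < 0)      /* an errored event poisons the wait */
      return ERR_EXEC_STATUS;
    if (out_event == list + i)       /* out-event aliases a wait-list slot */
      return ERR_INVALID_WAIT_LIST;
    if (ctx == NULL)
      ctx = ctx_of(list[i]);
    if (ctx_of(list[i]) != ctx)      /* all events must share one context */
      return ERR_INVALID_CONTEXT;
  }
  return OK;
}
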
-cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp) +LOCAL void +cl_event_exec(cl_event event, cl_int exec_status) { - cl_ulong ret_val; + /* We are MT-safe here; no one else should call this on the same event + at the same time, so no lock is needed. */ + cl_int ret = CL_SUCCESS; + cl_int status = cl_event_get_status(event); + cl_int depend_status; - if(end_timestamp > start_timestamp){ - ret_val = end_timestamp - start_timestamp; - } - else { - /*if start time stamp is greater than end timstamp then set ret value to max*/ - ret_val = ((cl_ulong) 1 << 32); + if (status < CL_COMPLETE || status <= exec_status) { + return; } - return ret_val; -} - -cl_ulong cl_event_get_start_timestamp(cl_event event) -{ - cl_ulong ret_val; + depend_status = cl_event_is_ready(event); + assert(depend_status <= CL_COMPLETE); + if (depend_status < CL_COMPLETE) { // An error happened; cancel the exec. + ret = cl_event_set_status(event, depend_status); + return; + } - ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[2]); + /* Dispatch the appropriate action based on the event type. */ + ret = cl_enqueue_handle(&event->exec_data, exec_status); - return ret_val; + if (ret != CL_SUCCESS) { + assert(ret < 0); + DEBUGP(DL_WARNING, "Exec event %p error, type is %d, error status is %d", + event, event->event_type, ret); + ret = cl_event_set_status(event, ret); + assert(ret == CL_SUCCESS); + } else { + ret = cl_event_set_status(event, exec_status); + assert(ret == CL_SUCCESS); + } } -cl_ulong cl_event_get_end_timestamp(cl_event event) +/* 0 means ready, >0 means not ready, <0 means error. */ +LOCAL cl_int +cl_event_is_ready(cl_event event) { - cl_ulong ret_val; - - ret_val = cl_event_get_timestamp_delta(event->timestamp[0],event->timestamp[3]); + int i; + int status; - return ret_val; -} + for (i = 0; i < event->depend_event_num; i++) { + status = cl_event_get_status(event->depend_events[i]); -cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name) -{ - cl_ulong ret_val = 0; - GET_QUEUE_THREAD_GPGPU(event->queue); - - if (!event->gpgpu_event) { - cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val); - event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; - return CL_SUCCESS; + if (status != CL_COMPLETE) { + return status; + } } - if(param_name == CL_PROFILING_COMMAND_SUBMIT || - param_name == CL_PROFILING_COMMAND_QUEUED) { - cl_gpgpu_event_get_gpu_cur_timestamp(gpgpu, &ret_val); - event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; - return CL_SUCCESS; - } else if(param_name == CL_PROFILING_COMMAND_START) { - cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 0, &ret_val); - event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; - return CL_SUCCESS; - } else if (param_name == CL_PROFILING_COMMAND_END) { - cl_gpgpu_event_get_exec_timestamp(gpgpu, event->gpgpu_event, 1, &ret_val); - event->timestamp[param_name - CL_PROFILING_COMMAND_QUEUED] = ret_val; - return CL_SUCCESS; - } - return CL_INVALID_VALUE; + return CL_COMPLETE; }
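cl_event_is_ready's convention (negative for error, 0 for ready, positive for still pending) lets cl_event_exec fold a whole dependency list in one scan, propagating the first non-complete status it finds. Distilled:

/* dep_status holds the status of each dependency; 0 == CL_COMPLETE. */
int deps_ready(const int *dep_status, unsigned n)
{
  unsigned i;
  for (i = 0; i < n; i++) {
    if (dep_status[i] != 0)
      return dep_status[i];  /* >0: still pending, <0: propagate the error */
  }
  return 0;                  /* all dependencies complete: ready to run */
}
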
-cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event) +LOCAL cl_event +cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_bool is_barrier, cl_int *error) { - user_event * u_iter = *p_u_ev; - user_event * u_ev; - - while(u_iter) - { - if(u_iter->event == event) - return CL_SUCCESS; - u_iter = u_iter->next; - } + cl_event e = NULL; + cl_int err = CL_SUCCESS; + cl_command_type type = CL_COMMAND_MARKER; + enqueue_type eq_type = EnqueueMarker; - TRY_ALLOC_NO_ERR (u_ev, CALLOC(user_event)); - u_ev->event = event; - u_ev->next = *p_u_ev; - *p_u_ev = u_ev; + if (is_barrier) { + type = CL_COMMAND_BARRIER; + eq_type = EnqueueBarrier; + } + if (event_wait_list) { + assert(num_events_in_wait_list > 0); - return CL_SUCCESS; -error: - return CL_FALSE; -} + e = cl_event_create(queue->ctx, queue, num_events_in_wait_list, + event_wait_list, type, &err); + if (err != CL_SUCCESS) { + *error = err; + return NULL; + } + } else { /* The marker depends on all events in the queue now. */ + cl_command_queue_enqueue_worker worker = &queue->worker; + cl_uint i; + cl_uint event_num; + cl_event *depend_events; + + CL_OBJECT_LOCK(queue); + + /* First, wait for the command queue to retire all events currently executing. */ + while (1) { + if (worker->quit) { // Has the queue already been destroyed? + CL_OBJECT_UNLOCK(queue); + *error = CL_INVALID_COMMAND_QUEUE; + return NULL; + } -cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event) -{ - user_event * u_iter = *p_u_ev; - user_event * u_prev = *p_u_ev; - - while(u_iter){ - if(u_iter->event == event ){ - if(u_iter == *p_u_ev){ - *p_u_ev = u_iter->next; - }else{ - u_prev->next = u_iter->next; + if (worker->in_exec_status != CL_COMPLETE) { + CL_OBJECT_WAIT_ON_COND(queue); + continue; } - cl_free(u_iter); + break; } - u_prev = u_iter; - u_iter = u_iter->next; + + event_num = 0; + depend_events = NULL; + if (!list_empty(&worker->enqueued_events)) { + depend_events = cl_command_queue_record_in_queue_events(queue, &event_num); + } + + CL_OBJECT_UNLOCK(queue); + + e = cl_event_create(queue->ctx, queue, event_num, depend_events, type, &err); + + for (i = 0; i < event_num; i++) { // unref the temporary refs + cl_event_delete(depend_events[i]); + } + if (depend_events) + cl_free(depend_events); + + if (err != CL_SUCCESS) { + *error = err; + return NULL; + } } - return CL_SUCCESS; + e->exec_data.type = eq_type; + *error = CL_SUCCESS; + return e; }
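From the API side, the two branches of cl_event_create_marker_or_barrier match the two documented behaviors of clEnqueueMarkerWithWaitList and clEnqueueBarrierWithWaitList: an explicit dependency list, or an empty list meaning everything enqueued so far. A host-side usage sketch (queue, ev_a and ev_b are assumed to come from earlier enqueues):

#include <CL/cl.h>

cl_int add_markers(cl_command_queue queue, cl_event ev_a, cl_event ev_b)
{
  cl_event deps[2] = { ev_a, ev_b };
  cl_event m1, m2;
  cl_int err;

  /* Explicit wait list: m1 completes once ev_a and ev_b complete. */
  err = clEnqueueMarkerWithWaitList(queue, 2, deps, &m1);
  if (err != CL_SUCCESS)
    return err;

  /* Empty wait list: m2 depends on everything enqueued on queue so far,
   * which is the snapshot branch above. */
  err = clEnqueueMarkerWithWaitList(queue, 0, NULL, &m2);

  clReleaseEvent(m1);
  if (err == CL_SUCCESS)
    clReleaseEvent(m2);
  return err;
}
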
diff --git a/src/cl_event.h b/src/cl_event.h index 85cd53e2..f67299c4 100644 --- a/src/cl_event.h +++ b/src/cl_event.h @@ -14,111 +14,75 @@ * You should have received a copy of the GNU Lesser General Public * License along with this library. If not, see <http://www.gnu.org/licenses/>. * - * Author: Benjamin Segovia <benjamin.segovia@intel.com> */ -#ifndef __CL_EVENT_H__ -#define __CL_EVENT_H__ +#ifndef __CL_EVENT_H_ +#define __CL_EVENT_H_ #include <semaphore.h> #include "cl_base_object.h" -#include "cl_driver.h" #include "cl_enqueue.h" #include "CL/cl.h" -#define CL_ENQUEUE_EXECUTE_IMM 0 -#define CL_ENQUEUE_EXECUTE_DEFER 1 +typedef void(CL_CALLBACK *cl_event_notify_cb)(cl_event event, cl_int event_command_exec_status, void *user_data); -typedef struct _user_event { - cl_event event; /* The user event */ - struct _user_event* next; /* Next user event in list */ -} user_event; +typedef struct _cl_event_user_callback { + cl_int status; /* The execution status */ + cl_bool executed; /* Indicates whether the callback has been called or not */ + cl_event_notify_cb pfn_notify; /* Callback function */ + void *user_data; /* Callback user data */ + list_head node; /* Event callback list node */ +} _cl_event_user_callback; -typedef struct _enqueue_callback { - cl_event event; /* The event relative this enqueue callback */ - enqueue_data data; /* Hold all enqueue callback's infomation */ - cl_uint num_events; /* num events in wait list */ - cl_event* wait_list; /* All event wait list this callback wait on */ - user_event* wait_user_events; /* The head of user event list the callback wait on */ - struct _enqueue_callback* next; /* The next enqueue callback in wait list */ -} enqueue_callback; +typedef _cl_event_user_callback *cl_event_user_callback; -typedef void (CL_CALLBACK *EVENT_NOTIFY)(cl_event event, cl_int event_command_exec_status, void *user_data); -typedef struct _user_callback { - cl_int status; /* The execution status */ - cl_bool executed; /* Indicat the callback function been called or not */ - EVENT_NOTIFY pfn_notify; /* Callback function */ - void* user_data; /* Callback user data */ - struct _user_callback* next; /* Next event callback in list */ -} user_callback; -struct _cl_event { - _cl_base_object base; - cl_context ctx; /* The context associated with event */ - cl_command_queue queue; /* The command queue associated with event */ - cl_command_type type; /* The command type associated with event */ - cl_int status; /* The execution status */ - cl_gpgpu gpgpu; /* Current gpgpu, owned by this structure. */ - cl_gpgpu_event gpgpu_event; /* The event object communicate with hardware */ - user_callback* user_cb; /* The event callback functions */ - enqueue_callback* enqueue_cb; /* This event's enqueue */ - enqueue_callback* waits_head; /* The head of enqueues list wait on this event */ - cl_bool emplict; /* Identify this event whether created by api emplict*/ - cl_ulong timestamp[4];/* The time stamps for profiling. */ - cl_ulong queued_timestamp; - cl_event last_next, last_prev;/* We need a list to monitor untouchable api event*/ -}; +typedef struct _cl_event { + _cl_base_object base; + cl_context ctx; /* The context associated with event */ + cl_command_queue queue; /* The command queue associated with event */ + cl_command_type event_type; /* Event type. */ + cl_bool is_barrier; /* Is this event a barrier */ + cl_int status; /* The execution status */ + cl_event *depend_events; /* The events that must complete before this one. */ + cl_uint depend_event_num; /* The number of depend events. */ + list_head callbacks; /* The event callback functions */ + list_head enqueue_node; /* The node in the enqueue list. */ + cl_ulong timestamp[4]; /* The time stamps for profiling. */ + cl_ulong queued_timestamp; + enqueue_data exec_data; /* Context for executing this event. 
*/ +} _cl_event; #define CL_OBJECT_EVENT_MAGIC 0x8324a9f810ebf90fLL -#define CL_OBJECT_IS_EVENT(obj) (((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC) +#define CL_OBJECT_IS_EVENT(obj) ((obj && \ + ((cl_base_object)obj)->magic == CL_OBJECT_EVENT_MAGIC && \ + CL_OBJECT_GET_REF(obj) >= 1)) + +#define CL_EVENT_IS_MARKER(E) (E->event_type == CL_COMMAND_MARKER) +#define CL_EVENT_IS_BARRIER(E) (E->event_type == CL_COMMAND_BARRIER) +#define CL_EVENT_IS_USER(E) (E->event_type == CL_COMMAND_USER) /* Create a new event object */ -cl_event cl_event_new(cl_context, cl_command_queue, cl_command_type, cl_bool); -/* Unref the object and delete it if no more reference on it */ -void cl_event_delete(cl_event); -/* Add one more reference to this object */ -void cl_event_add_ref(cl_event); -/* Register a user callback function for specific commond execution status */ -cl_int cl_event_set_callback(cl_event, cl_int, EVENT_NOTIFY, void *); -/* Execute the event's callback if the event's status supersedes the callback's status. Free the callback if specified */ -void cl_event_call_callback(cl_event event, cl_int status, cl_bool free_cb); -/* Check events wait list for enqueue commonds */ -cl_int cl_event_check_waitlist(cl_uint, const cl_event *, cl_event *, cl_context); -/* Wait the all events in wait list complete */ -cl_int cl_event_wait_events(cl_uint, const cl_event *, cl_command_queue); -/* New a enqueue suspend task */ -void cl_event_new_enqueue_callback(cl_event, enqueue_data *, cl_uint, const cl_event *); -/* Set the event status and call all callbacks */ -void cl_event_set_status(cl_event, cl_int); -/* Check and update event status */ -void cl_event_update_status(cl_event, cl_int); -/* Create the marker event */ -cl_int cl_event_marker_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*); -/* Create the barrier event */ -cl_int cl_event_barrier_with_wait_list(cl_command_queue, cl_uint, const cl_event *, cl_event*); -/* Get the cpu time */ -cl_ulong cl_event_get_cpu_timestamp(cl_ulong *cpu_time); -/*Get the cpu time for queued*/ -cl_int cl_event_get_queued_cpu_timestamp(cl_event event); -/*get timestamp delate between end and start*/ -cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp,cl_ulong end_timestamp); -/*Get start time stamp*/ -cl_ulong cl_event_get_start_timestamp(cl_event event); -/*Get end time stamp*/ -cl_ulong cl_event_get_end_timestamp(cl_event event); -/* Do the event profiling */ -cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name); -/* insert the user event */ -cl_int cl_event_insert_user_event(user_event** p_u_ev, cl_event event); -/* remove the user event */ -cl_int cl_event_remove_user_event(user_event** p_u_ev, cl_event event); -/* flush the event's pending gpgpu batch buffer and notify driver this gpgpu event has been flushed. 
*/ -cl_int cl_event_flush(cl_event event); -/* monitor or block wait all events in the last_event list */ -void cl_event_update_last_events(cl_command_queue queuet, int wait); -/* insert the event into the last_event list in queue */ -void cl_event_insert_last_events(cl_command_queue queue, cl_event event); +extern cl_event cl_event_create(cl_context ctx, cl_command_queue queue, cl_uint num_events, + const cl_event *event_list, cl_command_type type, cl_int *errcode_ret); +extern cl_int cl_event_check_waitlist(cl_uint num_events_in_wait_list, const cl_event *event_wait_list, + cl_event* event, cl_context ctx); +extern void cl_event_exec(cl_event event, cl_int exec_status); +/* 0 means ready, >0 means not ready, <0 means error. */ +extern cl_int cl_event_is_ready(cl_event event); +extern cl_int cl_event_get_status(cl_event event); +extern void cl_event_add_ref(cl_event event); +extern void cl_event_delete(cl_event event); +extern cl_int cl_event_set_status(cl_event event, cl_int status); +extern cl_int cl_event_set_callback(cl_event event, cl_int exec_type, + cl_event_notify_cb pfn_notify, void *user_data); +extern cl_int cl_event_wait_for_events_list(cl_uint num_events, const cl_event *event_list); +extern cl_int cl_event_wait_for_event_ready(cl_event event); +extern cl_ulong cl_event_get_timestamp_delta(cl_ulong start_timestamp, cl_ulong end_timestamp); +extern cl_ulong cl_event_get_start_timestamp(cl_event event); +extern cl_ulong cl_event_get_end_timestamp(cl_event event); +extern cl_int cl_event_get_timestamp(cl_event event, cl_profiling_info param_name); +extern cl_event cl_event_create_marker_or_barrier(cl_command_queue queue, cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, cl_bool is_barrier, + cl_int* error); #endif /* __CL_EVENT_H__ */ - diff --git a/src/cl_mem.c b/src/cl_mem.c index 06a4d5a6..333ffc95 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -28,6 +28,7 @@ #include "cl_kernel.h" #include "cl_command_queue.h" #include "cl_cmrt.h" +#include "cl_enqueue.h" #include "CL/cl.h" #include "CL/cl_intel.h" @@ -1264,7 +1265,7 @@ cl_mem_add_ref(cl_mem mem) #define LOCAL_SZ_2 4 LOCAL cl_int -cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, +cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, size_t src_offset, size_t dst_offset, size_t cb) { cl_int ret = CL_SUCCESS; @@ -1317,7 +1318,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_kernel_set_arg(ker, 2, sizeof(cl_mem), &dst_buf); cl_kernel_set_arg(ker, 3, sizeof(int), &dw_dst_offset); cl_kernel_set_arg(ker, 4, sizeof(int), &cb); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } @@ -1358,7 +1359,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_kernel_set_arg(ker, 4, sizeof(int), &dw_num); cl_kernel_set_arg(ker, 5, sizeof(int), &first_mask); cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } @@ -1388,7 +1389,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_kernel_set_arg(ker, 6, sizeof(int), &last_mask); cl_kernel_set_arg(ker, 7, sizeof(int), &shift); cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); - ret = 
cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } @@ -1420,7 +1421,7 @@ cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_kernel_set_arg(ker, 7, sizeof(int), &shift); cl_kernel_set_arg(ker, 8, sizeof(int), &dw_mask); cl_kernel_set_arg(ker, 9, sizeof(int), &src_less); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } @@ -1493,13 +1494,13 @@ cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image cl_kernel_set_arg(ker, 6, sizeof(cl_int), &origin[1]); cl_kernel_set_arg(ker, 7, sizeof(cl_int), &origin[2]); - ret = cl_command_queue_ND_range(queue, ker, 3, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, NULL, 3, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } LOCAL cl_int -cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size, +cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size, cl_mem buffer, size_t offset, size_t size) { cl_int ret = CL_SUCCESS; @@ -1596,13 +1597,13 @@ cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size, if (is_128) cl_kernel_set_arg(ker, 4, pattern_size, pattern1); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, e, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } LOCAL cl_int -cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, +cl_mem_copy_buffer_rect(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, const size_t *src_origin, const size_t *dst_origin, const size_t *region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch) { @@ -1617,7 +1618,7 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0]; cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0]; cl_int size = region[0]*region[1]*region[2]; - ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size); + ret = cl_mem_copy(queue, NULL, src_buf, dst_buf,src_offset, dst_offset, size); return ret; } @@ -1669,14 +1670,15 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_row_pitch); cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_slice_pitch); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); return ret; } LOCAL cl_int -cl_mem_kernel_copy_image(cl_command_queue queue, struct _cl_mem_image* src_image, struct _cl_mem_image* dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region) { +cl_mem_kernel_copy_image(cl_command_queue queue, cl_event event, struct _cl_mem_image* src_image, + struct _cl_mem_image* dst_image, const size_t *src_origin, + const size_t *dst_origin, const size_t *region) { cl_int ret; cl_kernel ker = NULL; size_t global_off[] = {0,0,0}; @@ -1817,7 +1819,7 @@ cl_mem_kernel_copy_image(cl_command_queue 
queue, struct _cl_mem_image* src_image cl_kernel_set_arg(ker, 9, sizeof(cl_int), &dst_origin[1]); cl_kernel_set_arg(ker, 10, sizeof(cl_int), &dst_origin[2]); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); fail: @@ -1830,7 +1832,7 @@ fail: } LOCAL cl_int -cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_mem buffer, +cl_mem_copy_image_to_buffer(cl_command_queue queue, cl_event event, struct _cl_mem_image* image, cl_mem buffer, const size_t *src_origin, const size_t dst_offset, const size_t *region) { cl_int ret; cl_kernel ker = NULL; @@ -1919,7 +1921,7 @@ cl_mem_copy_image_to_buffer(cl_command_queue queue, struct _cl_mem_image* image, cl_kernel_set_arg(ker, 7, sizeof(cl_int), &src_origin[2]); cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_dst_offset); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); fail: @@ -1933,7 +1935,7 @@ fail: LOCAL cl_int -cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_mem_image* image, +cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_event event, cl_mem buffer, struct _cl_mem_image* image, const size_t src_offset, const size_t *dst_origin, const size_t *region) { cl_int ret; cl_kernel ker = NULL; @@ -2019,7 +2021,7 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me cl_kernel_set_arg(ker, 7, sizeof(cl_int), &dst_origin[2]); cl_kernel_set_arg(ker, 8, sizeof(cl_int), &kn_src_offset); - ret = cl_command_queue_ND_range(queue, ker, 1, global_off, global_sz, local_sz); + ret = cl_command_queue_ND_range(queue, ker, event, 1, global_off, global_sz, local_sz); cl_kernel_delete(ker); image->intel_fmt = intel_fmt; @@ -2308,3 +2310,83 @@ error: mem = NULL; goto exit; } + +LOCAL cl_int +cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset, + size_t size, const size_t *origin, const size_t *region) +{ + // TODO: Need to add MT safe logic. + + cl_int slot = -1; + int err = CL_SUCCESS; + size_t sub_offset = 0; + + if(mem->type == CL_MEM_SUBBUFFER_TYPE) { + struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; + sub_offset = buffer->sub_offset; + } + + ptr = (char*)ptr + offset + sub_offset; + if(mem->flags & CL_MEM_USE_HOST_PTR) { + assert(mem->host_ptr); + //only calc ptr here, will do memcpy in enqueue + *mem_ptr = (char *)mem->host_ptr + offset + sub_offset; + } else { + *mem_ptr = ptr; + } + /* Record the mapped address. 
*/ + if (!mem->mapped_ptr_sz) { + mem->mapped_ptr_sz = 16; + mem->mapped_ptr = (cl_mapped_ptr *)malloc( + sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz); + if (!mem->mapped_ptr) { + cl_mem_unmap_auto(mem); + err = CL_OUT_OF_HOST_MEMORY; + goto error; + } + memset(mem->mapped_ptr, 0, mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); + slot = 0; + } else { + int i = 0; + for (; i < mem->mapped_ptr_sz; i++) { + if (mem->mapped_ptr[i].ptr == NULL) { + slot = i; + break; + } + } + if (i == mem->mapped_ptr_sz) { + cl_mapped_ptr *new_ptr = (cl_mapped_ptr *)malloc( + sizeof(cl_mapped_ptr) * mem->mapped_ptr_sz * 2); + if (!new_ptr) { + cl_mem_unmap_auto(mem); + err = CL_OUT_OF_HOST_MEMORY; + goto error; + } + memset(new_ptr, 0, 2 * mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); + memcpy(new_ptr, mem->mapped_ptr, + mem->mapped_ptr_sz * sizeof(cl_mapped_ptr)); + slot = mem->mapped_ptr_sz; + mem->mapped_ptr_sz *= 2; + free(mem->mapped_ptr); + mem->mapped_ptr = new_ptr; + } + } + assert(slot != -1); + mem->mapped_ptr[slot].ptr = *mem_ptr; + mem->mapped_ptr[slot].v_ptr = ptr; + mem->mapped_ptr[slot].size = size; + if(origin) { + assert(region); + mem->mapped_ptr[slot].origin[0] = origin[0]; + mem->mapped_ptr[slot].origin[1] = origin[1]; + mem->mapped_ptr[slot].origin[2] = origin[2]; + mem->mapped_ptr[slot].region[0] = region[0]; + mem->mapped_ptr[slot].region[1] = region[1]; + mem->mapped_ptr[slot].region[2] = region[2]; + } + mem->map_ref++; +error: + if (err != CL_SUCCESS) + *mem_ptr = NULL; + return err; +} diff --git a/src/cl_mem.h b/src/cl_mem.h index 9bb5c473..82f30f6a 100644 --- a/src/cl_mem.h +++ b/src/cl_mem.h @@ -101,7 +101,17 @@ typedef struct _cl_mem { } _cl_mem; #define CL_OBJECT_MEM_MAGIC 0x381a27b9ee6504dfLL -#define CL_OBJECT_IS_MEM(obj) (((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC) +#define CL_OBJECT_IS_MEM(obj) ((obj && \ + ((cl_base_object)obj)->magic == CL_OBJECT_MEM_MAGIC && \ + CL_OBJECT_GET_REF(obj) >= 1)) +#define CL_OBJECT_IS_IMAGE(mem) ((mem && \ + ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC && \ + CL_OBJECT_GET_REF(mem) >= 1 && \ + mem->type >= CL_MEM_IMAGE_TYPE)) +#define CL_OBJECT_IS_BUFFER(mem) ((mem && \ + ((cl_base_object)mem)->magic == CL_OBJECT_MEM_MAGIC && \ + CL_OBJECT_GET_REF(mem) >= 1 && \ + mem->type < CL_MEM_IMAGE_TYPE)) struct _cl_mem_image { _cl_mem base; @@ -221,30 +231,30 @@ extern void cl_mem_gl_delete(struct _cl_mem_gl_image *); extern void cl_mem_add_ref(cl_mem); /* api clEnqueueCopyBuffer help function */ -extern cl_int cl_mem_copy(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf, +extern cl_int cl_mem_copy(cl_command_queue queue, cl_event event, cl_mem src_buf, cl_mem dst_buf, size_t src_offset, size_t dst_offset, size_t cb); -extern cl_int cl_mem_fill(cl_command_queue queue, const void * pattern, size_t pattern_size, +extern cl_int cl_mem_fill(cl_command_queue queue, cl_event e, const void * pattern, size_t pattern_size, cl_mem buffer, size_t offset, size_t size); extern cl_int cl_image_fill(cl_command_queue queue, const void * pattern, struct _cl_mem_image*, const size_t *, const size_t *); /* api clEnqueueCopyBufferRect help function */ -extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_mem, cl_mem, +extern cl_int cl_mem_copy_buffer_rect(cl_command_queue, cl_event event, cl_mem, cl_mem, const size_t *, const size_t *, const size_t *, size_t, size_t, size_t, size_t); /* api clEnqueueCopyImage help function */ -extern cl_int cl_mem_kernel_copy_image(cl_command_queue, struct _cl_mem_image*, struct _cl_mem_image*, - const size_t *, 
const size_t *, const size_t *); /* api clEnqueueCopyImageToBuffer help function */ -extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, struct _cl_mem_image*, cl_mem, +extern cl_int cl_mem_copy_image_to_buffer(cl_command_queue, cl_event, struct _cl_mem_image*, cl_mem, const size_t *, const size_t, const size_t *); /* api clEnqueueCopyBufferToImage help function */ -extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_mem_image*, +extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_event, cl_mem, struct _cl_mem_image*, const size_t, const size_t *, const size_t *); /* Directly map a memory object */ @@ -318,5 +328,8 @@ extern cl_mem cl_mem_new_image_from_fd(cl_context ctx, size_t row_pitch, cl_int *errcode); +extern cl_int cl_mem_record_map_mem(cl_mem mem, void *ptr, void **mem_ptr, size_t offset, + size_t size, const size_t *origin, const size_t *region); + #endif /* __CL_MEM_H__ */
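The mapped_ptr bookkeeping added in cl_mem_record_map_mem is a doubling array with linear slot reuse: start at 16 entries, scan for a free slot, and double the capacity when full. The growth logic distilled into a stand-alone helper (illustrative names, not the driver's):

#include <stdlib.h>
#include <string.h>

struct slot { void *ptr; };
struct slot_table { struct slot *v; int cap; };

/* Returns a free slot index, growing the table as needed; -1 on OOM. */
int acquire_slot(struct slot_table *t)
{
  int i;

  if (t->cap == 0) {                       /* first use: start with 16 */
    t->v = calloc(16, sizeof(*t->v));
    if (t->v == NULL)
      return -1;
    t->cap = 16;
    return 0;
  }

  for (i = 0; i < t->cap; i++)             /* reuse a released slot */
    if (t->v[i].ptr == NULL)
      return i;

  /* Table full: double it, keeping the old entries. */
  struct slot *nv = calloc(2 * t->cap, sizeof(*nv));
  if (nv == NULL)
    return -1;
  memcpy(nv, t->v, t->cap * sizeof(*nv));
  free(t->v);
  t->v = nv;
  i = t->cap;
  t->cap *= 2;
  return i;
}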