diff options
author | Zhigang Gong <zhigang.gong@intel.com> | 2015-11-12 16:47:04 +0800 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2015-11-25 11:45:13 +0800 |
commit | 06b0298cb481f936f84e9e9af8fae0763574d3fd (patch) | |
tree | 31f2ec7fbca0d3be53d019cd690f4f30ab68b2f2 | |
parent | 5d5aa6085ea0724ecea7471f4b710d8f21e206bc (diff) |
runtime: set CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE to kernel's SIMD_WIDTH.
It makes sense to set CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE to the
corresponding SIMD size. Then it provides a way for intel's OCL application
to get SIMD width at runtime and make some SIMD width dependant optimization
possible.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
Reviewed-by: Ruiling Song <ruiling.song@intel.com>
-rw-r--r-- | src/cl_api.c | 3 | ||||
-rw-r--r-- | src/cl_command_queue_gen7.c | 2 | ||||
-rw-r--r-- | src/cl_device_id.c | 11 | ||||
-rw-r--r-- | src/cl_device_id.h | 2 | ||||
-rw-r--r-- | src/cl_gt_device.h | 1 |
5 files changed, 13 insertions, 6 deletions
diff --git a/src/cl_api.c b/src/cl_api.c index ddd39cfc..d8ccd7e4 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -3022,6 +3022,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, err = cl_command_queue_flush(command_queue); } +error: if(b_output_kernel_perf) { if(kernel->program->build_opts != NULL) @@ -3029,7 +3030,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue, else time_end(command_queue->ctx, cl_kernel_get_name(kernel), "", command_queue); } -error: + return err; } diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index e5198cd5..1f827050 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -359,7 +359,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, /* Compute the number of HW threads we need */ if(UNLIKELY(err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz) != CL_SUCCESS)) { - fprintf(stderr, "Beignet: Work group size exceed Kerne's work group size.\n"); + fprintf(stderr, "Beignet: Work group size exceed Kernel's work group size.\n"); return err; } kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz; diff --git a/src/cl_device_id.c b/src/cl_device_id.c index deb2fad8..5debf06a 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -988,7 +988,16 @@ cl_get_kernel_workgroup_info(cl_kernel kernel, return CL_SUCCESS; } } - DECL_FIELD(PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device->preferred_wg_sz_mul) + case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE: + { + if (param_value && param_value_size < sizeof(size_t)) + return CL_INVALID_VALUE; + if (param_value_size_ret != NULL) + *param_value_size_ret = sizeof(size_t); + if (param_value) + *(size_t*)param_value = interp_kernel_get_simd_width(kernel->opaque); + return CL_SUCCESS; + } case CL_KERNEL_LOCAL_MEM_SIZE: { size_t local_mem_sz = interp_kernel_get_slm_size(kernel->opaque) + kernel->local_mem_sz; diff --git a/src/cl_device_id.h b/src/cl_device_id.h index e9717352..ea1030de 100644 --- a/src/cl_device_id.h +++ b/src/cl_device_id.h @@ -111,8 +111,6 @@ struct _cl_device_id { size_t driver_version_sz; size_t spir_versions_sz; size_t built_in_kernels_sz; - /* Kernel specific info that we're assigning statically */ - size_t preferred_wg_sz_mul; /* SubDevice specific info */ cl_device_id parent_device; cl_uint partition_max_sub_device; diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h index d8089c23..97cb94a4 100644 --- a/src/cl_gt_device.h +++ b/src/cl_gt_device.h @@ -39,7 +39,6 @@ .native_vector_width_float = 4, .native_vector_width_double = 2, .native_vector_width_half = 8, -.preferred_wg_sz_mul = 16, .address_bits = 32, .image_support = CL_TRUE, .max_read_image_args = BTI_MAX_READ_IMAGE_ARGS, |