diff options
-rw-r--r-- | CMake/FindGBE.cmake | 36 | ||||
-rw-r--r-- | CMakeLists.txt | 8 | ||||
-rw-r--r-- | include/CL/cl.h | 2 | ||||
-rw-r--r-- | include/CL/cl_intel.h | 12 | ||||
-rw-r--r-- | src/CMakeLists.txt | 6 | ||||
-rw-r--r-- | src/cl_api.c | 14 | ||||
-rw-r--r-- | src/cl_command_queue.c | 179 | ||||
-rw-r--r-- | src/cl_command_queue_gen6.c | 230 | ||||
-rw-r--r-- | src/cl_command_queue_gen7.c | 6 | ||||
-rw-r--r-- | src/cl_device_id.c | 55 | ||||
-rw-r--r-- | src/cl_gen6_device.h | 30 | ||||
-rw-r--r-- | src/cl_kernel.c | 776 | ||||
-rw-r--r-- | src/cl_kernel.h | 289 | ||||
-rw-r--r-- | src/cl_program.c | 174 | ||||
-rw-r--r-- | src/cl_program.h | 8 |
15 files changed, 83 insertions, 1742 deletions
diff --git a/CMake/FindGBE.cmake b/CMake/FindGBE.cmake new file mode 100644 index 00000000..46704838 --- /dev/null +++ b/CMake/FindGBE.cmake @@ -0,0 +1,36 @@ +# +# Try to find X library and include path. +# Once done this will define +# +# GBE_FOUND +# GBE_INCLUDE_PATH +# GBE_LIBRARY +# + +FIND_PATH(GBE_INCLUDE_PATH gen/program.h + ~/include/ + /usr/include/ + /usr/local/include/ + /sw/include/ + /opt/local/include/ + DOC "The directory where gen/program.h resides") +FIND_LIBRARY(GBE_LIBRARY + NAMES GBE gbe + PATHS + ~/lib/ + /usr/lib64 + /usr/lib + /usr/local/lib64 + /usr/local/lib + /sw/lib + /opt/local/lib + DOC "The GBE library") + +IF(GBE_INCLUDE_PATH) + SET(GBE_FOUND 1 CACHE STRING "Set to 1 if GBE is found, 0 otherwise") +ELSE(GBE_INCLUDE_PATH) + SET(GBE_FOUND 0 CACHE STRING "Set to 1 if GBE is found, 0 otherwise") +ENDIF(GBE_INCLUDE_PATH) + +MARK_AS_ADVANCED(GBE_FOUND) + diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e974c67..e6d9fee0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,14 @@ ELSE(XFIXES_FOUND) MESSAGE(STATUS "Looking for Xfixes - not found") ENDIF(XFIXES_FOUND) +# Gen-backend (compiler) +INCLUDE(CMake/FindGBE.cmake) +IF(GBE_FOUND) + MESSAGE(STATUS "Looking for Gen-Backend - found") +ELSE(GBE_FOUND) + MESSAGE(STATUS "Looking for Gen-Backend - not found") +ENDIF(GBE_FOUND) + # the run-time itself ADD_SUBDIRECTORY(src) diff --git a/include/CL/cl.h b/include/CL/cl.h index ddb18ece..8201afc0 100644 --- a/include/CL/cl.h +++ b/include/CL/cl.h @@ -625,7 +625,7 @@ clGetSamplerInfo(cl_sampler /* sampler */, size_t /* param_value_size */, void * /* param_value */, size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; - + /* Program Object APIs */ extern CL_API_ENTRY cl_program CL_API_CALL clCreateProgramWithSource(cl_context /* context */, diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h index 34f37288..9239bb56 100644 --- a/include/CL/cl_intel.h +++ b/include/CL/cl_intel.h @@ -46,16 +46,16 @@ clIntelPinBuffer(cl_mem); extern CL_API_ENTRY cl_int CL_API_CALL clIntelUnpinBuffer(cl_mem); -/* Set the buffer where to report the performance counters. If NULL, nothing - * will be report - */ -extern CL_API_ENTRY cl_int CL_API_CALL -clIntelSetReportBuffer(cl_command_queue, cl_mem); - /* Get the generation of the Gen device (used to load the proper binary) */ extern CL_API_ENTRY cl_int CL_API_CALL clIntelGetGenVersion(cl_device_id device, cl_int *ver); +/* Create a program from a LLVM source file */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithLLVM(cl_context /* context */, + const char * /* file */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + #ifdef __cplusplus } #endif diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7837dd12..39392434 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,7 @@ INCLUDE_DIRECTORIES( ${CMAKE_CURRENT_SOURCE_DIR} ${DRM_INCLUDE_PATH} + ${GBE_INCLUDE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/../include) SET(OPENCL_SRC @@ -16,7 +17,6 @@ SET(OPENCL_SRC cl_device_id.c cl_context.c cl_command_queue.c - cl_command_queue_gen6.c cl_command_queue_gen7.c intel/intel_gpgpu.c intel/intel_batchbuffer.c @@ -26,6 +26,7 @@ SET(OPENCL_SRC ADD_LIBRARY(cl SHARED ${OPENCL_SRC}) TARGET_LINK_LIBRARIES(cl + ${GBE_LIBRARY} ${DRM_LIBRARY} ${DRM_INTEL_LIBRARY} ${XLIB_LIBRARY} @@ -39,7 +40,6 @@ ADD_LIBRARY(cl_test STATIC tests/cl_file_map.c) TARGET_LINK_LIBRARIES(cl_test cl) -ADD_EXECUTABLE(cl_inject cl_inject.c tests/cl_file_map.c) ADD_EXECUTABLE(test_write_only tests/test_write_only.c) ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c) ADD_EXECUTABLE(test_copy_image tests/test_copy_image.c) @@ -52,7 +52,6 @@ ADD_EXECUTABLE(test_local_memory tests/test_local_memory.c) ADD_EXECUTABLE(test_private_memory tests/test_private_memory.c) ADD_EXECUTABLE(test_constant_memory tests/test_constant_memory.c) ADD_EXECUTABLE(test_memory_leak tests/test_memory_leak.c) -ADD_EXECUTABLE(test_perf_report tests/test_perf_report.c) ADD_EXECUTABLE(mandelbrot tests/mandelbrot.c) ADD_EXECUTABLE(mersenneTwister tests/mersenneTwister.c) ADD_EXECUTABLE(blackscholes tests/blackscholes.c) @@ -76,7 +75,6 @@ TARGET_LINK_LIBRARIES(test_private_memory cl_test m) TARGET_LINK_LIBRARIES(test_constant_memory cl_test m) TARGET_LINK_LIBRARIES(test_memory_leak cl_test m) TARGET_LINK_LIBRARIES(test_write_only cl_test m) -TARGET_LINK_LIBRARIES(test_perf_report cl_test m) TARGET_LINK_LIBRARIES(mandelbrot cl_test m) TARGET_LINK_LIBRARIES(mersenneTwister cl_test m) TARGET_LINK_LIBRARIES(blackscholes cl_test m) diff --git a/src/cl_api.c b/src/cl_api.c index 248b2ef5..c808b977 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -1166,20 +1166,6 @@ error: } cl_int -clIntelSetReportBuffer(cl_command_queue queue, cl_mem mem) -{ - cl_int err = CL_SUCCESS; - CHECK_QUEUE (queue); - if (mem != NULL && mem->magic != CL_MAGIC_MEM_HEADER) { - err = CL_INVALID_MEM; - goto error; - } - err = cl_command_queue_set_report_buffer(queue, mem); -error: - return err; -} - -cl_int clIntelGetGenVersion(cl_device_id device, cl_int *ver) { return cl_device_get_version(device, ver); diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 9c1dab39..76170a56 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -103,168 +103,6 @@ cl_command_queue_add_ref(cl_command_queue queue) atomic_inc(&queue->ref_n); } -static void -cl_kernel_copy_image_parameters(cl_kernel k, cl_mem mem, int index, char *curbe) -{ - cl_curbe_patch_info_t *info = NULL; - uint64_t key; - assert(curbe && mem && mem->is_image); - - key = cl_curbe_key(DATA_PARAMETER_IMAGE_WIDTH, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(curbe+info->offsets[0], &mem->w, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_IMAGE_HEIGHT, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(curbe+info->offsets[0], &mem->h, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_IMAGE_DEPTH, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(curbe+info->offsets[0], &mem->depth, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(curbe+info->offsets[0], &mem->fmt.image_channel_data_type, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_IMAGE_CHANNEL_ORDER, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(curbe+info->offsets[0], &mem->fmt.image_channel_order, sizeof(uint32_t)); -} - -LOCAL cl_int -cl_command_queue_bind_surface(cl_command_queue queue, - cl_kernel k, - char *curbe, - drm_intel_bo **local, - drm_intel_bo **priv, - drm_intel_bo **scratch, - uint32_t local_sz) -{ - cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - cl_mem mem = NULL; - drm_intel_bo *bo = NULL, *sync_bo = NULL; - const size_t max_thread = ctx->device->max_compute_unit; - cl_int err = CL_SUCCESS; - uint32_t i, index; - - /* Bind user defined surface */ - for (i = 0; i < k->arg_info_n; ++i) { - assert(k->arg_info[i].offset % SURFACE_SZ == 0); - index = k->arg_info[i].offset / SURFACE_SZ; - mem = (cl_mem) k->args[k->arg_info[i].arg_index]; - assert(index != MAX_SURFACES - 1); - CHECK_MEM(mem); - bo = mem->bo; - assert(bo); - if (mem->is_image) { - const int32_t w = mem->w, h = mem->h, pitch = mem->pitch; - const uint32_t fmt = mem->intel_fmt; - gpgpu_tiling_t tiling = GPGPU_NO_TILE; - if (mem->tiling == CL_TILE_X) - tiling = GPGPU_TILE_X; - else if (mem->tiling == CL_TILE_Y) - tiling = GPGPU_TILE_Y; - gpgpu_bind_image2D(gpgpu, index, bo, fmt, w, h, pitch, tiling); - - /* Copy the image parameters (width, height) in the constant buffer if the - * user requests them - */ - cl_kernel_copy_image_parameters(k, mem, index, curbe); - } else - gpgpu_bind_buf(gpgpu, index, bo, cc_llc_l3); - } - - /* Allocate the constant surface (if any) */ - if (k->const_bo) { - assert(k->const_bo_index != MAX_SURFACES - 1); - gpgpu_bind_buf(gpgpu, k->const_bo_index, - k->const_bo, - cc_llc_l3); - } - - /* Allocate local surface needed for SLM and bind it */ - if (local && local_sz != 0) { - const size_t sz = 16 * local_sz; /* XXX 16 == maximum barrier number */ - assert(k->patch.local_surf.offset % SURFACE_SZ == 0); - index = k->patch.local_surf.offset / SURFACE_SZ; - assert(index != MAX_SURFACES - 1); - *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64); - gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3); - } - else if (local) - *local = NULL; - - /* Allocate private surface and bind it */ - if (priv && k->patch.private_surf.size != 0) { - const size_t sz = max_thread * - k->patch.private_surf.size * - k->patch.exec_env.largest_compiled_simd_sz; - // assert(k->patch.exec_env.largest_compiled_simd_sz == 16); - assert(k->patch.private_surf.offset % SURFACE_SZ == 0); - index = k->patch.private_surf.offset / SURFACE_SZ; - assert(index != MAX_SURFACES - 1); - *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64); - gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3); - } - else if(priv) - *priv = NULL; - - /* Allocate scratch surface and bind it */ - if (scratch && k->patch.scratch.size != 0) { - const size_t sz = max_thread * /* XXX is it given per lane ??? */ - k->patch.scratch.size * - k->patch.exec_env.largest_compiled_simd_sz; - // assert(k->patch.exec_env.largest_compiled_simd_sz == 16); - assert(k->patch.scratch.offset % SURFACE_SZ == 0); - assert(index != MAX_SURFACES - 1); - index = k->patch.scratch.offset / SURFACE_SZ; - *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64); - gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3); - } - else if (scratch) - *scratch = NULL; - - /* Now bind a bo used for synchronization */ - sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64); - gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = sync_bo; - -error: - assert(err == CL_SUCCESS); /* Cannot fail here */ - return err; -} - -LOCAL cl_int -cl_kernel_check_args(cl_kernel k) -{ - uint32_t i; - for (i = 0; i < k->arg_n; ++i) - if (k->is_provided[i] == CL_FALSE) - return CL_INVALID_KERNEL_ARGS; - return CL_SUCCESS; -} - -LOCAL cl_int -cl_command_queue_set_report_buffer(cl_command_queue queue, cl_mem mem) -{ - cl_int err = CL_SUCCESS; - if (queue->perf != NULL) { - cl_mem_delete(queue->perf); - queue->perf = NULL; - } - if (mem != NULL) { - if (drm_intel_bo_get_size(mem->bo) < 1024) { /* 1K for the performance counters is enough */ - err = CL_INVALID_BUFFER_SIZE; - goto error; - } - cl_mem_add_ref(mem); - queue->perf = mem; - } - -error: - return err; -} - #if USE_FULSIM extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*); extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*); @@ -303,8 +141,9 @@ static const size_t chunk_sz = 8192u; static cl_int cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k) { - cl_mem mem = NULL; cl_int err = CL_SUCCESS; +#if 0 + cl_mem mem = NULL; int i; size_t j; @@ -323,6 +162,7 @@ cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k) aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder); } error: +#endif return err; } @@ -345,6 +185,7 @@ struct bmphdr { /* raw b, g, r data here, dword aligned per scan line */ }; +#if 0 static int* cl_read_bmp(const char *filename, int *width, int *height) { @@ -426,16 +267,17 @@ cl_read_dump(const char *name, size_t *size) *size = sz; return dump; } +#endif static cl_int cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k) { + cl_int err = CL_SUCCESS; +#if 0 cl_mem mem = NULL; char *from = NULL, *to = NULL; size_t size, j, chunk_n, chunk_remainder; - cl_int err = CL_SUCCESS; int i, curr = 0; - /* Bind user defined surface */ for (i = 0; i < k->arg_info_n; ++i) { if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER) @@ -475,11 +317,12 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k) cl_mem_unmap(mem); } error: +#endif return err; + } #endif /* USE_FULSIM */ -extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, const size_t*, const size_t*, const size_t*); extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *); LOCAL cl_int @@ -501,9 +344,7 @@ cl_command_queue_ND_range(cl_command_queue queue, drm_intel_bufmgr_gem_set_aubfile(bufmgr, file); #endif /* USE_FULSIM */ - if (ver == 6) - TRY (cl_command_queue_ND_range_gen6, queue, k, global_wk_off, global_wk_sz, local_wk_sz); - else if (ver == 7 || ver == 75) + if (ver == 7 || ver == 75) TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz); else FATAL ("Unknown Gen Device"); diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c deleted file mode 100644 index a08ff410..00000000 --- a/src/cl_command_queue_gen6.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see <http://www.gnu.org/licenses/>. - * - * Author: Benjamin Segovia <benjamin.segovia@intel.com> - */ - -#include "cl_command_queue.h" -#include "cl_context.h" -#include "cl_program.h" -#include "cl_kernel.h" -#include "cl_device_id.h" -#include "cl_mem.h" -#include "cl_utils.h" -#include "cl_alloc.h" - -#ifdef _PLASMA -#include "plasma/plasma_export.h" -#else -#include "intel_bufmgr.h" -#include "intel/intel_gpgpu.h" -#endif - -#include <assert.h> -#include <stdio.h> -#include <string.h> - -/* Header used by kernels */ -typedef struct cl_inline_header { - uint32_t grp_n[3]; - uint32_t local_sz[3]; - uint32_t exec_mask; - uint32_t local_mem_sz; -} cl_inline_header_t; - -/* ID inside the work group */ -typedef struct cl_local_id { - uint16_t data[16]; -} cl_local_id_t; - -static INLINE size_t -cl_kernel_compute_batch_sz(cl_kernel k, size_t wk_grp_n, size_t thread_n) -{ - size_t sz = 256; /* upper bound of the complete prelude */ - size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */ - media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */ - media_obj_sz += 3 * sizeof(cl_local_id_t);/* for each dimension */ - if (k->patch.exec_env.has_barriers) - media_obj_sz += 4 * 4; /* one barrier update per object */ - sz += media_obj_sz * wk_grp_n * thread_n; - return sz; -} - -static INLINE void -cl_command_queue_enqueue_wk_grp(cl_command_queue queue, - cl_local_id_t **ids, - const cl_inline_header_t *header, - uint32_t thread_n, - uint32_t barrierID) -{ - intel_gpgpu_t *gpgpu = queue->gpgpu; - uint32_t i; - for (i = 0; i < thread_n; ++i) { - const size_t sz = sizeof(cl_inline_header_t) + 3*sizeof(cl_local_id_t); - char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz); - size_t offset = 0; - assert(data); - *((cl_inline_header_t *) (data + offset)) = *header; - offset += sizeof(cl_inline_header_t); - *((cl_local_id_t *) (data + offset)) = ids[0][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[1][i]; - offset += sizeof(cl_local_id_t); - *((cl_local_id_t *) (data + offset)) = ids[2][i]; - } -} - -LOCAL cl_int -cl_command_queue_ND_range_gen6(cl_command_queue queue, - cl_kernel ker, - const size_t *global_wk_off, - const size_t *global_wk_sz, - const size_t *local_wk_sz) -{ - cl_context ctx = queue->ctx; - intel_gpgpu_t *gpgpu = queue->gpgpu; - drm_intel_bo *slm_bo = NULL, *private_bo = NULL, *scratch_bo = NULL; - char *curbe = NULL; /* constant buffer */ - const size_t cst_sz = ker->patch.curbe.sz; - size_t wk_grp_sz, wk_grp_n, batch_sz; - uint32_t grp_end[3], offset[3], thread_n; /* per work group */ - uint32_t i, j, k, curr; - uint32_t barrierID = 0; - cl_inline_header_t header; - cl_local_id_t *ids[3] = {NULL,NULL,NULL}; - cl_int err = CL_SUCCESS; - - /* Allocate 16 kernels (one per barrier) */ - genx_gpgpu_kernel_t kernels[16]; - for (i = 0; i < 16; ++i) { - kernels[i].name = "OCL kernel"; - kernels[i].grf_blocks = 128; - kernels[i].cst_sz = cst_sz; - kernels[i].bin = NULL, - kernels[i].size = 0, - kernels[i].bo = ker->bo; - kernels[i].barrierID = i; - kernels[i].use_barrier = 0; /* unused in gen6 */ - kernels[i].thread_n = 0; /* unused in gen6 */ - } - - /* All arguments must have been set */ - TRY (cl_kernel_check_args, ker); - - /* Check that the local work sizes are OK */ - TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &wk_grp_sz); - - /* Directly from the user defined values */ - header.local_sz[0] = local_wk_sz[0]; - header.local_sz[1] = local_wk_sz[1]; - header.local_sz[2] = local_wk_sz[2]; - offset[0] = header.grp_n[0] = 0; - offset[1] = header.grp_n[1] = 0; - offset[2] = header.grp_n[2] = 0; - header.exec_mask = ~0; - - /* offsets are evenly divided by the local sizes */ - offset[0] = global_wk_off[0] / local_wk_sz[0]; - offset[1] = global_wk_off[1] / local_wk_sz[1]; - offset[2] = global_wk_off[2] / local_wk_sz[2]; - - /* Compute the local size per wg and the offsets for each local buffer */ - header.local_mem_sz = cl_kernel_local_memory_sz(ker); - - /* Create the constant buffer */ - if (cst_sz > 0) { - assert(ker->cst_buffer); - curbe = cl_kernel_create_cst_buffer(ker, global_wk_off, global_wk_sz, local_wk_sz, 0, 0); - } - - /* Only if we want to monitor performance for this kernel */ - if (queue->perf) - gpgpu_set_perf_counters(gpgpu, queue->perf->bo); - - /* Setup the kernel */ - gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32); - if (queue->last_batch != NULL) - drm_intel_bo_unreference(queue->last_batch); - queue->last_batch = NULL; - cl_command_queue_bind_surface(queue, - ker, - curbe, - &slm_bo, - &private_bo, - &scratch_bo, - header.local_mem_sz); - - /* Upload the __constant samplers if any */ - const void *samplers = ker->dynamic_heap + ker->patch.sampler_state.offset; - const uint32_t sampler_n = ker->patch.sampler_state.count; - gpgpu_upload_samplers(gpgpu, samplers, sampler_n); - - gpgpu_states_setup(gpgpu, kernels, 16); - - /* Fill the constant buffer */ - if (cst_sz > 0) { - gpgpu_upload_constants(gpgpu, curbe, cst_sz); - cl_free(curbe); - } - - wk_grp_n = 1; - for (i = 0; i < 3; ++i) { - TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wk_grp_sz*sizeof(uint16_t))); - grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i]; - wk_grp_n *= grp_end[i]-offset[i]; - } - thread_n = wk_grp_sz / 16; - batch_sz = cl_kernel_compute_batch_sz(ker, wk_grp_n, thread_n); - - /* Start a new batch buffer */ - gpgpu_batch_reset(gpgpu, batch_sz); - gpgpu_batch_start(gpgpu); - - /* Push all media objects. We implement three paths to make it (a bit) faster. - * Local IDs are shared from work group to work group. We allocate once the - * buffers and reuse them - */ - curr = 0; - for (k = 0; k < local_wk_sz[2]; ++k) - for (j = 0; j < local_wk_sz[1]; ++j) - for (i = 0; i < local_wk_sz[0]; ++i, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - ((uint16_t*) ids[2])[curr] = k; - } - for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) - for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1]) - for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) { - if (ker->patch.exec_env.has_barriers) - gpgpu_update_barrier(gpgpu, barrierID, thread_n); - cl_command_queue_enqueue_wk_grp(queue, ids, &header, thread_n, barrierID); - barrierID = (barrierID + 1) % 16; - } - - gpgpu_batch_end(gpgpu, 0); - gpgpu_flush(gpgpu); - - if (slm_bo) drm_intel_bo_unreference(slm_bo); - if (private_bo) drm_intel_bo_unreference(private_bo); - if (scratch_bo) drm_intel_bo_unreference(scratch_bo); - -error: - cl_free(ids[0]); - cl_free(ids[1]); - cl_free(ids[2]); - return err; -} - diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 35f7f5ef..45970265 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -100,13 +100,15 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue, const size_t *global_wk_sz, const size_t *local_wk_sz) { +#if 0 cl_context ctx = queue->ctx; intel_gpgpu_t *gpgpu = queue->gpgpu; drm_intel_bo *private_bo = NULL, *scratch_bo = NULL; char *curbe = NULL; /* Does not include per-thread local IDs */ char *final_curbe = NULL; /* Includes them */ genx_gpgpu_kernel_t kernel; - const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz; + //const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz; + const size_t simd_sz = 16; size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz; size_t i, thread_n, id_offset; cl_int err = CL_SUCCESS; @@ -176,5 +178,7 @@ error: cl_free(final_curbe); cl_free(curbe); return err; +#endif + return CL_SUCCESS; } diff --git a/src/cl_device_id.c b/src/cl_device_id.c index f7492eb3..e4457bee 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -33,30 +33,6 @@ #include <stdio.h> #include <string.h> -static struct _cl_device_id intel_snb_gt2_device = { - .max_compute_unit = 60, - .max_work_item_sizes = {512, 512, 512}, - .max_work_group_size = 512, - .max_clock_frequency = 1350, - /* Does not really belong here, but for now this seems the most - * natural place to put it */ - .wg_sz = 512, - .compile_wg_sz = {0}, - -#include "cl_gen6_device.h" -}; - -static struct _cl_device_id intel_snb_gt1_device = { - .max_compute_unit = 24, - .max_work_item_sizes = {256, 256, 256}, - .max_work_group_size = 256, - .max_clock_frequency = 1000, - .wg_sz = 256, - .compile_wg_sz = {0}, - -#include "cl_gen6_device.h" -}; - static struct _cl_device_id intel_ivb_gt2_device = { .max_compute_unit = 128, .max_work_item_sizes = {512, 512, 512}, @@ -119,21 +95,6 @@ cl_get_gt_device(void) intel_ivb_gt2_device.platform = intel_platform; ret = &intel_ivb_gt2_device; } - else if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 || - device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 || - device_id == PCI_CHIP_SANDYBRIDGE_S_GT) { - intel_snb_gt1_device.vendor_id = device_id; - intel_snb_gt1_device.platform = intel_platform; - ret = &intel_snb_gt1_device; - } - else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2 || - device_id == PCI_CHIP_SANDYBRIDGE_M_GT2 || - device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS || - device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) { - intel_snb_gt2_device.vendor_id = device_id; - intel_snb_gt2_device.platform = intel_platform; - ret = &intel_snb_gt2_device; - } return ret; } @@ -196,9 +157,7 @@ cl_get_device_info(cl_device_id device, void * param_value, size_t * param_value_size_ret) { - if (UNLIKELY(device != &intel_snb_gt1_device && - device != &intel_snb_gt2_device && - device != &intel_ivb_gt1_device && + if (UNLIKELY(device != &intel_ivb_gt1_device && device != &intel_ivb_gt2_device && device != &intel_hsw_device)) return CL_INVALID_DEVICE; @@ -272,17 +231,13 @@ cl_get_device_info(cl_device_id device, LOCAL cl_int cl_device_get_version(cl_device_id device, cl_int *ver) { - if (UNLIKELY(device != &intel_snb_gt1_device && - device != &intel_snb_gt2_device && - device != &intel_ivb_gt1_device && + if (UNLIKELY(device != &intel_ivb_gt1_device && device != &intel_ivb_gt2_device && device != &intel_hsw_device)) return CL_INVALID_DEVICE; if (ver == NULL) return CL_SUCCESS; - if (device == &intel_snb_gt1_device || device == &intel_snb_gt2_device) - *ver = 6; - else if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device) + if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device) *ver = 7; else *ver = 75; @@ -308,9 +263,7 @@ cl_get_kernel_workgroup_info(cl_device_id device, void* param_value, size_t* param_value_size_ret) { - if (UNLIKELY(device != &intel_snb_gt1_device && - device != &intel_snb_gt2_device && - device != &intel_ivb_gt1_device && + if (UNLIKELY(device != &intel_ivb_gt1_device && device != &intel_ivb_gt2_device)) return CL_INVALID_DEVICE; if (UNLIKELY(param_value == NULL)) diff --git a/src/cl_gen6_device.h b/src/cl_gen6_device.h deleted file mode 100644 index b09121fd..00000000 --- a/src/cl_gen6_device.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library. If not, see <http://www.gnu.org/licenses/>. - * - * Author: Benjamin Segovia <benjamin.segovia@intel.com> - */ - -/* Common fields for both SNB devices (either GT1 or GT2) - */ -.max_parameter_size = 256, -.global_mem_cache_line_size = 128, /* XXX */ -.global_mem_cache_size = 8 << 10, /* XXX */ -.local_mem_type = CL_GLOBAL, -.local_mem_size = 16 << 10, -.gfx_id = IGFX_GEN6_CORE, - -#include "cl_gt_device.h" - diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 6668328d..69302119 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -40,49 +40,6 @@ #include <stdint.h> #include <assert.h> -static void -cl_arg_list_destroy(cl_arg_info_t *arg_info) -{ - cl_arg_info_t *next_arg_info = NULL; - while (arg_info) { - next_arg_info = arg_info->next; - cl_free(arg_info); - arg_info = next_arg_info; - } -} - -static void -cl_curbe_list_destroy(cl_curbe_patch_info_t *curbe_info) -{ - cl_curbe_patch_info_t *next_curbe_info = NULL; - while (curbe_info) { - next_curbe_info = curbe_info->next; - cl_free(curbe_info); - curbe_info = next_curbe_info; - } -} - -/* Header for all internal objects (cl_mem_object, cl_kernel_object, ...) */ -typedef struct cl_object_header { - uint64_t magic; - volatile int ref_n; -} cl_object_header_t; - -static void -cl_kernel_release_args(cl_kernel k) -{ - uint32_t i; - assert(k->args); - for (i = 0; i < k->arg_n; ++i) { - cl_object_header_t *header = (cl_object_header_t *) k->args[i]; - if (header == NULL) - continue; - FATAL_IF (header->magic != CL_MAGIC_MEM_HEADER, - "A non memory object was set as an argument"); - cl_mem_delete((cl_mem)header); - } -} - LOCAL void cl_kernel_delete(cl_kernel k) { @@ -92,24 +49,6 @@ cl_kernel_delete(cl_kernel k) /* We are not done with the kernel */ if (atomic_dec(&k->ref_n) > 1) return; - /* User may have set some OCL object as arguments. As we referenced them when - * we set them, we release all their references here - */ - if (k->args) cl_kernel_release_args(k); - - /* Free the chain lists (may also be arrays) */ - cl_arg_list_destroy(k->arg_info); - cl_curbe_list_destroy(k->curbe_info); - - /* Free the CURBE data */ - cl_free(k->cst_buffer); - - /* Free the argument array */ - cl_free(k->args); - - /* Free the array to track argument setting */ - cl_free(k->is_provided); - /* Release one reference on all bos we own */ if (k->bo) drm_intel_bo_unreference(k->bo); if (k->const_bo) drm_intel_bo_unreference(k->const_bo); @@ -117,7 +56,6 @@ cl_kernel_delete(cl_kernel k) /* This will be true for kernels created by clCreateKernel */ if (k->ref_its_program) cl_program_delete(k->program); - cl_free(k->name); k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */ cl_free(k); } @@ -144,725 +82,11 @@ cl_kernel_add_ref(cl_kernel k) atomic_inc(&k->ref_n); } -static void -cl_kernel_chain_arg(cl_kernel k, cl_arg_info_t *arg_info) -{ - cl_arg_info_t *next = k->arg_info; - arg_info->next = next; - k->arg_info = arg_info; -} - -static void -cl_kernel_chain_curbe(cl_kernel k, cl_curbe_patch_info_t *curbe_info) -{ - cl_curbe_patch_info_t *next = k->curbe_info; - curbe_info->next = next; - k->curbe_info = curbe_info; -} - -static INLINE cl_curbe_patch_info_t* -cl_kernel_get_curbe_info_list(cl_kernel k, uint64_t key) -{ - cl_curbe_patch_info_t *curbe_info = k->curbe_info; - while (curbe_info) { - if (curbe_info->key == key) break; - curbe_info = curbe_info->next; - } - return curbe_info; -} - -static INLINE cl_curbe_patch_info_t* -cl_kernel_new_curbe_info(cl_kernel k, cl_patch_data_parameter_buffer_t *data) -{ - cl_curbe_patch_info_t *curbe = NULL; - - TRY_ALLOC_NO_ERR (curbe, CALLOC(cl_curbe_patch_info_t)); - curbe->type = data->type; - curbe->arg_index = data->index; - curbe->offsets[0] = data->offset; - curbe->sz = data->data_sz; - curbe->src_offset = data->src_offset; - curbe->is_local = CL_FALSE; - curbe->last = 0; - cl_kernel_chain_curbe(k, curbe); - -exit: - return curbe; -error: - cl_free(curbe); - curbe = NULL; - goto exit; -} - -static int -cl_arg_cmp(const void *a, const void *b) -{ - const cl_arg_info_t *arg0 = (const cl_arg_info_t *) a; - const cl_arg_info_t *arg1 = (const cl_arg_info_t *) b; - return arg0->arg_index > arg1->arg_index; -} - -static int -cl_curbe_cmp(const void *a, const void *b) -{ - const cl_curbe_patch_info_t *curbe0 = (const cl_curbe_patch_info_t *) a; - const cl_curbe_patch_info_t *curbe1 = (const cl_curbe_patch_info_t *) b; - return curbe0->key > curbe1->key; -} - -static cl_int -cl_kernel_sort_arg_list(cl_kernel k) -{ - cl_arg_info_t *arg_info = NULL; - cl_arg_info_t *array = NULL; - cl_int arg_info_n = 0; - cl_int err = CL_SUCCESS; - - /* How many arguments do we have? */ - arg_info = k->arg_info; - while (arg_info) { - arg_info_n++; - arg_info = arg_info->next; - } - - /* Now fill the array with the unsorted arguments */ - TRY_ALLOC (array, CALLOC_ARRAY(cl_arg_info_t, arg_info_n)); - arg_info = k->arg_info; - arg_info_n = 0; - while (arg_info) { - array[arg_info_n++] = *arg_info; - array->next = NULL; - arg_info = arg_info->next; - } - - /* Sort the argument list array */ - qsort(array, arg_info_n, sizeof(cl_arg_info_t), cl_arg_cmp); - - /* Replace the list by the array */ - cl_arg_list_destroy(k->arg_info); - k->arg_info = array; - k->arg_info_n = arg_info_n; - -exit: - return err; -error: - cl_free(array); - goto exit; -} - -static cl_int -cl_kernel_sort_curbe_info_list(cl_kernel k) -{ - cl_curbe_patch_info_t *curbe_info = NULL; - cl_curbe_patch_info_t *array = NULL; - cl_int curbe_info_n = 0; - cl_int err = CL_SUCCESS; - - /* How many curbe info do we have? */ - curbe_info = k->curbe_info; - while (curbe_info) { - curbe_info_n++; - curbe_info = curbe_info->next; - } - - /* Now fill the array with the unsorted curbe info */ - TRY_ALLOC (array, CALLOC_ARRAY(cl_curbe_patch_info_t, curbe_info_n)); - curbe_info = k->curbe_info; - curbe_info_n = 0; - while (curbe_info) { - array[curbe_info_n++] = *curbe_info; - array->next = NULL; - curbe_info = curbe_info->next; - } - - /* Sort the curbe list array */ - qsort(array, curbe_info_n, sizeof(cl_curbe_patch_info_t), cl_curbe_cmp); - - /* Replace the list by the array */ - cl_curbe_list_destroy(k->curbe_info); - k->curbe_info = array; - k->curbe_info_n = curbe_info_n; - k->curbe_info->next = NULL; - -exit: - return err; -error: - cl_free(array); - goto exit; -} - -#define ASSOC_ITEM(ENUM,TYPE,FIELD) \ - case JOIN(PATCH_TOKEN_, ENUM): \ - info->FIELD = *(JOIN(JOIN(cl_patch_,TYPE),_t)*) patch; \ - break; - -static cl_int -cl_kernel_allocate_inline_buffer(cl_kernel k, - cl_patch_alloc_surf_with_init_t *init, - const char **patch, - size_t *read) -{ - drm_intel_bufmgr *bufmgr = NULL; - const size_t sz = init->sz; - cl_int err = CL_SUCCESS; - - FATAL_IF (init->offset % SURFACE_SZ, "Bad alignment for inline buffer offset"); - FATAL_IF (k->const_bo != NULL, "inline buffer already declared"); - assert(k->program && k->program->ctx); - bufmgr = cl_context_get_intel_bufmgr(k->program->ctx); - TRY_ALLOC (k->const_bo, drm_intel_bo_alloc(bufmgr, - "Inline buffer", - sz, - 64)); - drm_intel_bo_subdata(k->const_bo, 0, sz, &init->data); - k->const_bo_index = init->offset / SURFACE_SZ; - *read += sz; - *patch += sz; - -error: - return err; -} - -static cl_int -cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) -{ - cl_kernel_patch_info_t *info = &k->patch; - cl_curbe_patch_info_t *curbe_info = NULL; - cl_arg_info_t *arg_info = NULL; - uint64_t curbe_key; - size_t read = 0; - cl_int err = CL_SUCCESS; - - while (read < sz) { - const cl_patch_item_header_t *item = (const cl_patch_item_header_t *) patch; - switch (item->token) { - case PATCH_TOKEN_MEDIA_VFE_STATE: break; - ASSOC_ITEM (MEDIA_INTERFACE_DESCRIPTOR_LOAD, interface_desc_load, idrt); - ASSOC_ITEM (STATE_SIP, sip, sip); - ASSOC_ITEM (MEDIA_CURBE_LOAD, curbe_load, curbe); - ASSOC_ITEM (SAMPLER_STATE_ARRAY, sampler_state_array, sampler_state); - ASSOC_ITEM (INTERFACE_DESCRIPTOR_DATA, interface_desc_data, surf_desc); - ASSOC_ITEM (BINDING_TABLE_STATE, binding_table_state, binding_table); - ASSOC_ITEM (ALLOCATE_SCRATCH_SURFACE, alloc_scratch_surf, scratch); - ASSOC_ITEM (ALLOCATE_PRIVATE_MEMORY, alloc_private_memory_surf, private_surf); - ASSOC_ITEM (ALLOCATE_LOCAL_SURFACE, alloc_local_surf, local_surf); - ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env); - ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload); - - case PATCH_TOKEN_DATA_PARAMETER_STREAM: - info->curbe.sz = *(uint32_t *) (patch + sizeof(cl_patch_item_header_t)); - info->curbe.offset = 0; - break; - case PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT: - case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT: - case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT: - { - - TRY_ALLOC (arg_info, CALLOC(cl_arg_info_t)); - if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT) { - cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; - arg_info->arg_index = from->index; - arg_info->offset = from->offset; - arg_info->type = OCLRT_ARG_TYPE_BUFFER; - } - else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT) { - cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; - arg_info->arg_index = from->index; - arg_info->offset = from->offset; - arg_info->type = OCLRT_ARG_TYPE_CONST; - } -#if USE_OLD_COMPILER - else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) { - cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch; - arg_info->arg_index = from->index; - arg_info->offset = from->offset; - arg_info->type = OCLRT_ARG_TYPE_IMAGE; - } -#else - else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) { - cl_image_memory_object_arg_t *from = (cl_image_memory_object_arg_t *) patch; - arg_info->arg_index = from->index; - arg_info->offset = from->offset; - arg_info->type = OCLRT_ARG_TYPE_IMAGE; - } -#endif - else - assert(0); - - arg_info->sz = sizeof(cl_mem); - arg_info->is_patched = CL_FALSE; - - /* Chain the argument to the next arguments */ - cl_kernel_chain_arg(k, arg_info); - k->arg_n = MAX(k->arg_n, arg_info->arg_index); - k->arg_info_n++; - } - break; - - case PATCH_TOKEN_ALLOCATE_SURFACE_WITH_INITIALIZATION: - { - cl_patch_alloc_surf_with_init_t *from = (cl_patch_alloc_surf_with_init_t *) patch; - TRY (cl_kernel_allocate_inline_buffer, k, from, &patch, &read); - } - break; - - case PATCH_TOKEN_DATA_PARAMETER_BUFFER: - { - cl_patch_data_parameter_buffer_t *data = (cl_patch_data_parameter_buffer_t *) patch; - switch (data->type) - { - case DATA_PARAMETER_KERNEL_ARGUMENT: - case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES: - case DATA_PARAMETER_LOCAL_WORK_SIZE: - case DATA_PARAMETER_GLOBAL_WORK_SIZE: - case DATA_PARAMETER_GLOBAL_WORK_OFFSET: - case DATA_PARAMETER_NUM_WORK_GROUPS: - case DATA_PARAMETER_WORK_DIMENSIONS: - case DATA_PARAMETER_IMAGE_WIDTH: - case DATA_PARAMETER_IMAGE_HEIGHT: - case DATA_PARAMETER_IMAGE_DEPTH: - case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE: - case DATA_PARAMETER_IMAGE_CHANNEL_ORDER: - case DATA_PARAMETER_NUM_HARDWARE_THREADS: - { -#if USE_OLD_COMPILER == 0 - if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - curbe_key = cl_curbe_key(data->type, data->index, 0); - else -#endif - curbe_key = cl_curbe_key(data->type, data->index, data->src_offset); - curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key); - if (curbe_info != NULL) - curbe_info->offsets[++curbe_info->last] = data->offset; - else - TRY_ALLOC (curbe_info, cl_kernel_new_curbe_info(k, data)); - curbe_info->key = curbe_key; - curbe_info->is_patched = CL_FALSE; - curbe_info = NULL; - k->arg_n = MAX(k->arg_n, data->index); - - /* We will need to allocate a local surface */ - if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - k->has_local_buffer = CL_TRUE; - break; - } - default: NOT_IMPLEMENTED; - } - } - break; - default: - FATAL("Undefined item in patch list"); - break; - } - patch += item->size; - read += item->size; - } - - if (k->patch.local_surf.sz != 0) - k->has_local_buffer = CL_TRUE; - - /* k->arg_n was the offset of the last argument. Turn it into an argument - * number - */ - k->arg_n++; - - /* Transform the argument and the curbe info lists into sorted arrays */ - if (k->arg_info) - TRY (cl_kernel_sort_arg_list, k); - if (k->curbe_info) - TRY (cl_kernel_sort_curbe_info_list, k); - -error: - return err; -} - -#undef ASSOC_ITEM - -LOCAL int -cl_kernel_setup(cl_kernel k, const char *ker) -{ - drm_intel_bufmgr *bufmgr = NULL; - int err = 0; - - /* Kernel instruction */ - FATAL_IF (k->kernel_heap_sz == 0, "No instruction found for this kernel"); - k->kernel_heap = ker; - ker += k->kernel_heap_sz; - - /* No general heap */ - FATAL_IF (k->general_heap_sz, "General heap unsupported"); - - /* Dynamic heap */ - if (k->dynamic_heap_sz) { - k->dynamic_heap = ker; - ker += k->dynamic_heap_sz; - } - - /* Surface state heap */ - if (k->surface_heap_sz) { - k->surface_heap = ker; - ker += k->surface_heap_sz; - } - - /* Patch list */ - if (k->patch_list_sz) { - k->patch_list = ker; - ker += k->patch_list_sz; - } - - /* Read all the patch elements */ - TRY (cl_kernel_setup_patch_list, k, k->patch_list, k->patch_list_sz); - - /* Create the kernel in GPU memory */ - assert(k->program && k->program->ctx); - bufmgr = cl_context_get_intel_bufmgr(k->program->ctx); - assert(bufmgr); - TRY_ALLOC (k->bo, drm_intel_bo_alloc(bufmgr, - "OCL kernel", - k->kernel_heap_sz, - 64)); - drm_intel_bo_subdata(k->bo, 0, k->kernel_heap_sz, k->kernel_heap); - - /* We have some restrictions on the compiled binary for SNB */ - FATAL_IF (k->program->ctx->ver == 6 && - k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size"); - FATAL_IF (k->program->ctx->ver == 6 && - k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size"); - FATAL_IF (k->program->ctx->ver > 6 && - k->patch.exec_env.largest_compiled_simd_sz == 32, "Unsupported SIMD size"); - -error: - return err; -} - -LOCAL cl_kernel -cl_kernel_dup(cl_kernel from) -{ - cl_kernel to = NULL; - size_t name_sz = 0; - size_t cst_buffer_sz = 0; - - assert(from); - TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel)); - *to = *from; /* most fields do not belong to the kernel but the program */ - to->ref_n = 1; - name_sz = strlen(from->name) + 1; /* zero terminated */ - TRY_ALLOC_NO_ERR (to->name, CALLOC_ARRAY(char, name_sz)); - memcpy(to->name, from->name, name_sz); - - /* Duplicate the argument info list */ - if (from->arg_info != NULL) { - assert(from->arg_info_n != 0); - assert(from->arg_info->next == NULL); - TRY_ALLOC_NO_ERR (to->arg_info, CALLOC_ARRAY(cl_arg_info_t, from->arg_info_n)); - memcpy(to->arg_info, from->arg_info, sizeof(cl_arg_info_t) * from->arg_info_n); - } - - /* Duplicate the curbe info */ - if (from->curbe_info != NULL) { - assert(from->curbe_info_n != 0); - assert(from->curbe_info->next == NULL); - TRY_ALLOC_NO_ERR (to->curbe_info, - CALLOC_ARRAY(cl_curbe_patch_info_t, from->curbe_info_n)); - memcpy(to->curbe_info, - from->curbe_info, - sizeof(cl_curbe_patch_info_t) * from->curbe_info_n); - } - - /* This kernel (used outside the internal code) will need to see its CURBE - * updated when setting arguments - */ - cst_buffer_sz = ALIGN(to->patch.curbe.sz, 32); - if (cst_buffer_sz) - TRY_ALLOC_NO_ERR (to->cst_buffer, cl_malloc(cst_buffer_sz)); - - /* We store for each argument the buffer currently set */ - TRY_ALLOC_NO_ERR (to->args, CALLOC_ARRAY(void*, to->arg_n)); - - /* We track here that all arguments are provided by the user */ - TRY_ALLOC_NO_ERR (to->is_provided, CALLOC_ARRAY(uint8_t, to->arg_n)); - - /* Retain the bos */ - if (from->bo) drm_intel_bo_reference(from->bo); - if (from->const_bo) drm_intel_bo_reference(from->const_bo); - - /* We retain the program destruction since this kernel (user allocated) - * depends on the program for some of its pointers - */ - assert(from->program); - cl_program_add_ref(from->program); - to->ref_its_program = CL_TRUE; - -exit: - return to; -error: - cl_free(to->arg_info); - cl_free(to->curbe_info); - to->arg_info = NULL; - to->curbe_info = NULL; - cl_kernel_delete(to); - to = NULL; - goto exit; -} - -/* arg_info / curbe_info are sorted. Just use a dichotomic search */ -#define DECL_DICHO_SEARCH(FN, TYPE, KEY_TYPE, FIELD, SUB_FIELD) \ -LOCAL TYPE* \ -JOIN(cl_kernel_get_,FN)(cl_kernel k, KEY_TYPE index) \ -{ \ - uint32_t begin = 0, end = k->JOIN(FN,_n); \ - \ - while (end > begin) { \ - uint32_t mid = (begin + end) / 2; \ - if (k->FIELD[mid].SUB_FIELD == index) \ - return k->FIELD + mid; \ - else if (k->FIELD[mid].SUB_FIELD > index) \ - end = mid; \ - else \ - begin = mid + 1; \ - } \ - \ - return NULL; \ -} - -DECL_DICHO_SEARCH(arg_info, cl_arg_info_t, uint32_t, arg_info, arg_index) -DECL_DICHO_SEARCH(curbe_info, cl_curbe_patch_info_t, uint64_t, curbe_info, key) - -#undef DECL_DICHO_SEARCH - -/* Set the given value (typically a function parameter) - * in the constant buffer - */ -static cl_int -cl_kernel_set_curbe_entry(cl_kernel k, - uint32_t index, - size_t sz, - const void *value) -{ - cl_curbe_patch_info_t *info = NULL; - uint64_t key; - cl_int err = CL_SUCCESS; - uint32_t i; - - /* Case 1: regular kernel argument (int, float ...) */ - key = cl_curbe_key(DATA_PARAMETER_KERNEL_ARGUMENT, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) { - - /* User must give a value for these arguments */ - if (value == NULL) { - err = CL_INVALID_ARG_VALUE; - goto error; - } - - /* Sizes must match */ - if (UNLIKELY(sz > info->sz)) { - err = CL_INVALID_ARG_SIZE; - goto error; - } - - /* Patch all locations */ - assert(k->cst_buffer); - for (i = 0; i <= info->last; ++i) { - assert(sz + info->offsets[i] <= k->patch.curbe.sz); - memcpy(k->cst_buffer + info->offsets[i], value, sz); - } - - /* We are done */ - goto exit; - } - - /* Case 2: Local buffer size */ - key = cl_curbe_key(DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES, index, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) { - info->sz = sz; - goto exit; - } - - /* Problem. We were not able to find anything */ - err = CL_INVALID_ARG_INDEX; - -exit: -error: - return err; -} - LOCAL cl_int cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value) { - const cl_arg_info_t *arg_info = NULL; - cl_mem *mem = NULL; cl_int err = CL_SUCCESS; - /* Not a valid argument if exce*/ - assert(k); - if (UNLIKELY(index >= k->arg_n)) { - err = CL_INVALID_ARG_VALUE; - goto error; - } - - /* Is it a buffer / image / sampler to set */ - if ((arg_info = cl_kernel_get_arg_info(k, index)) != NULL) { - switch (arg_info->type) { - case OCLRT_ARG_TYPE_CONST: - case OCLRT_ARG_TYPE_IMAGE: - case OCLRT_ARG_TYPE_BUFFER: - { - /* Check the buffer consistency */ - FATAL_IF(value == NULL, "Unsupported NULL value for buffer (TBD)"); - if (UNLIKELY(sz != sizeof(void*))) { - err = CL_INVALID_ARG_SIZE; - goto error; - } - mem = (cl_mem*) value; - FATAL_IF (mem == NULL, "Buffer cannot be NULL"); - CHECK_MEM((*mem)); - - /* The kernel holds a reference on it now */ - cl_mem_add_ref(*mem); - cl_mem_delete(k->args[index]); - k->args[index] = *mem; - } - k->is_provided[index] = CL_TRUE; - goto exit; - default: NOT_IMPLEMENTED; - } - } - - TRY (cl_kernel_set_curbe_entry, k, index, sz, value); - k->is_provided[index] = CL_TRUE; - -exit: -error: - return err; -} - -static INLINE int32_t -cl_kernel_get_first_local(cl_kernel k) -{ - int32_t i; - for (i = 0; i < (int32_t) k->curbe_info_n; ++i) - if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - return i; - return k->curbe_info_n; -} - -LOCAL uint32_t -cl_kernel_local_memory_sz(cl_kernel k) -{ - int32_t i; - uint32_t local_mem_sz = 0; - - if (k->has_local_buffer) { - - /* Look for all local surfaces offset to set */ - i = cl_kernel_get_first_local(k); - - /* Now, set the offsets for all local surfaces */ - for (; i < (int32_t) k->curbe_info_n; ++i) { - cl_curbe_patch_info_t *info = k->curbe_info + i; - const size_t offset = local_mem_sz; - if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES) - break; - assert(info->last == 0); - assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz); - memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t)); - local_mem_sz += info->sz; - } - local_mem_sz += k->patch.local_surf.sz; - } - return local_mem_sz; -} - -LOCAL char* -cl_kernel_create_cst_buffer(cl_kernel k, - const size_t *global_wk_off, - const size_t *global_wk_sz, - const size_t *local_wk_sz, - cl_uint wk_dim, - cl_uint thread_n) -{ - cl_curbe_patch_info_t *info = NULL; - const size_t sz = k->patch.curbe.sz; - uint64_t key = 0; - char *data = NULL; - - TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1)); - memcpy(data, k->cst_buffer, sz); - - /* Global work group offset */ - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t)); - - /* Global work group size */ - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); - - /* Local work group size */ - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); - key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); - - /* HW thread number (Gen7+) */ - key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0); - if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t)); - -exit: - return data; -error: - cl_free(data); - data = NULL; - goto exit; -} - -LOCAL cl_int -cl_kernel_work_group_sz(cl_kernel ker, - const size_t *local_wk_sz, - uint32_t wk_dim, - size_t *wk_grp_sz) -{ - cl_int err = CL_SUCCESS; - size_t sz = 0; - cl_uint i; - - for (i = 0; i < wk_dim; ++i) - if ((&ker->patch.exec_env.required_wgr_sz_x)[i] && - (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - sz = local_wk_sz[0]; - for (i = 1; i < wk_dim; ++i) - sz *= local_wk_sz[i]; - FATAL_IF (sz % 16, "Work group size must be a multiple of 16"); - if (sz > ker->program->ctx->device->max_work_group_size) { - err = CL_INVALID_WORK_ITEM_SIZE; - goto error; - } - -error: - if (wk_grp_sz) - *wk_grp_sz = sz; return err; } diff --git a/src/cl_kernel.h b/src/cl_kernel.h index c27cff6b..f474879a 100644 --- a/src/cl_kernel.h +++ b/src/cl_kernel.h @@ -23,277 +23,20 @@ #include "cl_defs.h" #include "cl_internals.h" #include "CL/cl.h" +#include "gen/program.h" #include <stdint.h> #include <stdlib.h> -/***************************************************************************/ -/* XXX Structures extracted from the WINDOWS CODE BASE */ -/***************************************************************************/ - -// Some fields went from 1 to 4 bytes with the new compiler -#if USE_OLD_COMPILER -typedef uint8_t cl_compiler_boolean_t; -#else -typedef uint32_t cl_compiler_boolean_t; -#endif /* USE_OLD_COMPILER */ - -typedef struct cl_program_header { - uint32_t magic; - uint32_t version; - uint32_t device; - uint32_t ker_n; -} cl_program_header_t; - -typedef struct cl_arg_info { - uint32_t arg_index; - uint32_t type; - cl_compiler_boolean_t is_null; - uint32_t offset; - uint32_t sz; - void *obj; - cl_compiler_boolean_t is_patched; - struct cl_arg_info *next; -} cl_arg_info_t; - -typedef struct cl_curbe_patch_info { - uint64_t key; - uint32_t last; - uint32_t offsets[OCLRT_CURBE_MAX_OFFSETS]; - uint32_t type; - uint32_t arg_index; - uint32_t sz; - uint32_t src_offset; - cl_compiler_boolean_t is_patched; - cl_compiler_boolean_t is_local; - struct cl_curbe_patch_info *next; -} cl_curbe_patch_info_t; - -typedef struct cl_kernel_header { - uint32_t check_sum; - uint32_t kernel_name_sz; - uint32_t patch_list_sz; -} cl_kernel_header_t; - -typedef struct cl_kernel_header75 { - cl_kernel_header_t header; - uint32_t kernel_heap_sz; - uint32_t general_state_heap_sz; - uint32_t dynamic_state_heap_sz; - uint32_t surface_state_heap_sz; -} cl_kernel_header75_t; - -typedef struct cl_kernel_header7 { - cl_kernel_header_t header; - uint32_t kernel_heap_sz; - uint32_t general_state_heap_sz; - uint32_t dynamic_state_heap_sz; - uint32_t surface_state_heap_sz; -} cl_kernel_header7_t; - -typedef struct cl_kernel_header6 { - cl_kernel_header_t header; - uint32_t kernel_heap_sz; - uint32_t general_state_heap_sz; - uint32_t dynamic_state_heap_sz; - uint32_t surface_state_heap_sz; - uint32_t indirect_object__heap_sz; -} cl_kernel_header6_t; - -typedef struct cl_patch_item_header { - uint32_t token; - uint32_t size; -} cl_patch_item_header_t; - -typedef struct cl_global_memory_object_arg { - cl_patch_item_header_t header; - uint32_t index; - uint32_t offset; -} cl_global_memory_object_arg_t; - -#if USE_OLD_COMPILER == 0 -typedef struct cl_image_memory_object_arg { - cl_patch_item_header_t header; - uint32_t index; - uint32_t image_type; - uint32_t offset; -} cl_image_memory_object_arg_t; -#endif - -typedef struct cl_patch_constant_memory_object_arg { - uint32_t index; - uint32_t offset; -} cl_patch_constant_memory_object_arg_t; - -typedef struct cl_patch_sampler_kernel_arg { - cl_patch_item_header_t header; - uint32_t index; - uint32_t offset; -} cl_patch_sampler_kernel_arg_t; - -typedef struct cl_patch_data_parameter_buffer { - cl_patch_item_header_t header; - uint32_t type; - uint32_t index; - uint32_t offset; - uint32_t data_sz; - uint32_t src_offset; -} cl_patch_data_parameter_buffer_t; - -typedef struct cl_patch_data_parameter_stream { - cl_patch_item_header_t header; - uint32_t data_parameter_stream_sz; -} cl_patch_data_parameter_stream_t; - -typedef struct cl_patch_sip { - cl_patch_item_header_t header; - uint32_t sip_offset; -} cl_patch_sip_t; - -typedef struct cl_patch_sampler_state_array { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t count; - uint32_t border_color_offset; -} cl_patch_sampler_state_array_t; - -typedef struct cl_patch_binding_table_state { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t count; - uint32_t surface_state_offset; -} cl_patch_binding_table_state_t; - -typedef struct cl_patch_alloc_scratch_surf { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t size; -} cl_patch_alloc_scratch_surf_t; - -typedef struct cl_patch_alloc_private_memory_surf { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t size; -} cl_patch_alloc_private_memory_surf_t; - -typedef struct cl_patch_alloc_system_thread_surf { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t sz; -} cl_patch_alloc_system_thread_surf_t; - -typedef struct cl_patch_alloc_surf_with_init { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t sz; - char* data; -} cl_patch_alloc_surf_with_init_t; - -typedef struct cl_patch_alloc_local_surf { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t sz; -} cl_patch_alloc_local_surf_t; - -typedef struct cl_patch_thread_payload { - cl_patch_item_header_t header; - uint8_t header_present; - uint8_t local_idx_present; - uint8_t local_idy_present; - uint8_t local_idz_present; -} cl_patch_thread_payload_t; - -typedef struct cl_patch_exec_env { - cl_patch_item_header_t header; - uint32_t required_wgr_sz_x; - uint32_t required_wgr_sz_y; - uint32_t required_wgr_sz_z; - uint32_t largest_compiled_simd_sz; - uint8_t has_barriers; - uint8_t compiled_simd8; - uint8_t compiled_simd16; - uint8_t compiled_simd32; -} cl_patch_exec_env_t; - -typedef struct cl_patch_vfe_state { - cl_patch_item_header_t header; - uint32_t scratch_offset; -} cl_patch_vfe_state_t; - -typedef struct cl_patch_curbe_load { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t sz; -} cl_patch_curbe_load_t; - -typedef struct cl_patch_interface_desc_load { - cl_patch_item_header_t header; - uint32_t offset; -} cl_patch_interface_desc_load_t; - -typedef struct cl_patch_interface_desc_data { - cl_patch_item_header_t header; - uint32_t offset; - uint32_t sampler_state_offset; - uint32_t kernel_offset; - uint32_t binding_table_offset; -} cl_patch_interface_desc_data_t; - -typedef struct cl_kernel_patch_info { - cl_patch_sip_t sip; - cl_patch_sampler_state_array_t sampler_state; - cl_patch_binding_table_state_t binding_table; - cl_patch_alloc_scratch_surf_t scratch; - cl_patch_alloc_private_memory_surf_t private_surf; - cl_patch_alloc_system_thread_surf_t sys_thread_surf; - cl_patch_alloc_surf_with_init_t surf_with_init; - cl_patch_alloc_local_surf_t local_surf; - cl_patch_thread_payload_t thread_payload; - cl_patch_exec_env_t exec_env; - cl_patch_vfe_state_t vfe; - cl_patch_curbe_load_t curbe; - cl_patch_interface_desc_load_t idrt; - cl_patch_interface_desc_data_t surf_desc; -} cl_kernel_patch_info_t; - struct _cl_kernel { uint64_t magic; /* To identify it as a kernel */ volatile int ref_n; /* We reference count this object */ struct _drm_intel_bo *bo; /* The code itself */ struct _drm_intel_bo *const_bo;/* Buffer for all __constants values in the OCL program */ cl_program program; /* Owns this structure (and pointers) */ - cl_arg_info_t *arg_info; /* List of arguments */ - cl_curbe_patch_info_t *curbe_info; /* List of patch locations for the curbe */ - char *name; /* User defined name */ - char *cst_buffer; /* (user provided) NDrange kernel parameters */ - void **args; /* (user provided) arguments which are cl_mem / cl_image / cl_sampler */ - uint8_t *is_provided; /* Tell us if all arguments have been provided by the user */ - const char *patch_list; /* Defines where the data are in the heaps */ - const char *kernel_heap; /* Contains instructions */ - const char *general_heap; /* Contains scratch space */ - const char *surface_heap; /* Contains surface state and binding table */ - const char *dynamic_heap; /* Contains IDRT and sampler states */ - size_t patch_list_sz; /* Total size of the patch list */ - size_t kernel_heap_sz; /* Size of the kernel heap */ - size_t general_heap_sz; /* Should be 0 */ - size_t surface_heap_sz; /* Size of the surface state heap */ - size_t dynamic_heap_sz; /* Size of the dynamic heap */ - cl_kernel_patch_info_t patch; /* Got from the patch list */ - uint32_t arg_info_n; /* Number of argument info */ - uint32_t curbe_info_n; /* Number of curbe info */ - uint32_t arg_n; /* Number of arguments in the function */ - uint32_t const_bo_index; /* Index in the binding table for const_bo */ - uint8_t has_local_buffer; /* Is there any __local * as function argument? */ uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */ }; -/* Size of the surface state as encoded in the binary blob */ -#if USE_OLD_COMPILER -#define SURFACE_SZ 32 -#else -#define SURFACE_SZ 64 -#endif - /* Allocate an empty kernel */ extern cl_kernel cl_kernel_new(void); @@ -308,42 +51,12 @@ extern cl_kernel cl_kernel_dup(cl_kernel); /* Add one more reference on the kernel object */ extern void cl_kernel_add_ref(cl_kernel); -/* Setup a kernel from a binary blob */ -extern int cl_kernel_setup(cl_kernel, const char*); - /* Set the argument before kernel execution */ extern int cl_kernel_set_arg(cl_kernel, uint32_t arg_index, size_t arg_size, const void *arg_value); -/* Check that all arguments are set before running the kernel */ -extern cl_int cl_kernel_check_args(cl_kernel); - -/* Get the size of shared local memory bound to the kernel */ -extern uint32_t cl_kernel_local_memory_sz(cl_kernel); - -/* Return a curbe entry if it exists. NULL otherwise */ -extern cl_curbe_patch_info_t *cl_kernel_get_curbe_info(cl_kernel, uint64_t); - -/* To look up the sorted curbe array */ -static inline uint64_t -cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset) -{ - return ((uint64_t) type << 48) | - ((uint64_t) index << 32) | - (uint64_t) src_offset; -} - -/* Allocate, fill and return the CURBE */ -extern char* -cl_kernel_create_cst_buffer(cl_kernel k, - const size_t *global_wk_off, - const size_t *global_wk_sz, - const size_t *local_wk_sz, - cl_uint wk_dim, - cl_uint thread_n); - /* Compute and check the work group size from the user provided local size */ extern cl_int cl_kernel_work_group_sz(cl_kernel ker, diff --git a/src/cl_program.c b/src/cl_program.c index 59161854..cb93d2ca 100644 --- a/src/cl_program.c +++ b/src/cl_program.c @@ -32,97 +32,6 @@ #include <string.h> #include <assert.h> -#if USE_OLD_COMPILER -static const int icbe_ver = 1001; -#else -static const int icbe_ver = 1002; -#endif - -#define DECL_LOAD_HEADER(GEN) \ -static const char* \ -JOIN(cl_kernel_load_header,GEN)(cl_kernel ker, \ - const char *header, \ - size_t *name_sz, \ - size_t *ker_sz) \ -{ \ - const JOIN(JOIN(cl_kernel_header,GEN),_t) *h = \ - (const JOIN(JOIN(cl_kernel_header,GEN),_t) *) header; \ - *ker_sz = *name_sz = h->header.kernel_name_sz; \ - *ker_sz += ker->patch_list_sz = h->header.patch_list_sz; \ - *ker_sz += ker->kernel_heap_sz = h->kernel_heap_sz; \ - *ker_sz += ker->general_heap_sz = h->general_state_heap_sz; \ - *ker_sz += ker->surface_heap_sz = h->surface_state_heap_sz; \ - *ker_sz += ker->dynamic_heap_sz = h->dynamic_state_heap_sz; \ - return header + sizeof(JOIN(JOIN(cl_kernel_header,GEN),_t)); \ -} - -DECL_LOAD_HEADER(6) -DECL_LOAD_HEADER(7) -DECL_LOAD_HEADER(75) - -#undef DECL_LOAD_HEADER - -static int -cl_program_decode(cl_program p) -{ - cl_program_header_t *header = (cl_program_header_t *) p->bin; - const char *ker = NULL, *bin = NULL; - size_t ker_sz = 0, name_sz = 0; - int i, err = 0; - - /* Check binary consistency */ - assert(p->ctx && p->ctx->device); - FATAL_IF (header->magic != 0x494e5443, "Bad file format for the program\n"); - FATAL_IF (header->device != p->ctx->device->gfx_id, "File not compiled for this device\n"); - FATAL_IF (header->version != icbe_ver, "Uncompatible compiler\n"); - FATAL_IF ((p->ker_n = header->ker_n) == 0, "No kernel found in the program\n"); - - /* Allocate the kernel array */ - TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n)); - - /* Load all kernels */ - ker = bin = p->bin + sizeof(cl_program_header_t); - for (i = 0; i < header->ker_n; ++i) { - - /* Format changes from generation to generation */ - TRY_ALLOC (p->ker[i], cl_kernel_new()); - switch (header->device) { - case IGFX_GEN7_5_CORE: - ker = cl_kernel_load_header75(p->ker[i], ker, &name_sz, &ker_sz); - break; - case IGFX_GEN7_CORE: - ker = cl_kernel_load_header7(p->ker[i], ker, &name_sz, &ker_sz); - break; - case IGFX_GEN6_CORE: - ker = cl_kernel_load_header6(p->ker[i], ker, &name_sz, &ker_sz); - break; - default: - FATAL ("Unsupported platform"); - break; - } - - /* Set the kernel name */ - TRY_ALLOC (p->ker[i]->name, CALLOC_ARRAY(char, name_sz)); - memcpy(p->ker[i]->name, ker, name_sz); - name_sz = ALIGN(name_sz, 4); - - /* Points to the kernel code */ - ker += name_sz; - - /* Initialize the kernel */ - p->ker[i]->program = p; - TRY (cl_kernel_setup, p->ker[i], ker); - - /* Pointer to the next kernel to setup */ - ker += (ker_sz - name_sz); - } - -exit: - return err; -error: - goto exit; -} - LOCAL void cl_program_delete(cl_program p) { @@ -172,23 +81,10 @@ cl_program_new(cl_context ctx, const char *data, size_t sz) p->magic = CL_MAGIC_PROGRAM_HEADER; p->ctx = ctx; - /* Decode the binary blob */ - TRY_NO_ERR (cl_program_decode, p); - - /* Append the command queue in the list */ - pthread_mutex_lock(&ctx->program_lock); - p->next = ctx->programs; - if (ctx->programs != NULL) - ctx->programs->prev = p; - ctx->programs = p; - pthread_mutex_unlock(&ctx->program_lock); - cl_context_add_ref(ctx); - exit: return p; error: cl_program_delete(p); - p = NULL; goto exit; } @@ -208,78 +104,12 @@ cl_program_create_from_binary(cl_context ctx, cl_int * binary_status, cl_int * errcode_ret) { - cl_program program = NULL; - cl_int err = CL_SUCCESS; - - assert(ctx); - INVALID_DEVICE_IF (num_devices != 1); - INVALID_DEVICE_IF (devices == NULL); - INVALID_DEVICE_IF (devices[0] != ctx->device); - INVALID_VALUE_IF (binaries == NULL); - INVALID_VALUE_IF (lengths == NULL); - - if (binaries[0] == NULL) { - err = CL_INVALID_VALUE; - if (binary_status) - binary_status[0] = CL_INVALID_VALUE; - goto error; - } - - if (lengths[0] == 0) { - err = CL_INVALID_VALUE; - if (binary_status) - binary_status[0] = CL_INVALID_VALUE; - goto error; - } - - TRY_ALLOC (program, cl_program_new(ctx, (const char *) binaries[0], lengths[0])); - -exit: - if (errcode_ret) - *errcode_ret = err; - return program; -error: - cl_program_delete(program); - program = NULL; - goto exit; + return NULL; } LOCAL cl_kernel cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret) { - cl_kernel from = NULL, to = NULL; - cl_int err = CL_SUCCESS; - uint32_t i = 0; - - if (UNLIKELY(name == NULL)) { - err = CL_INVALID_KERNEL_NAME; - goto error; - } - - /* Find the program first */ - for (i = 0; i < p->ker_n; ++i) { - assert(p->ker[i] && p->ker[i]->name); - if (strcmp(p->ker[i]->name, name) == 0) { - from = p->ker[i]; - break; - } - } - - /* We were not able to find this named kernel */ - if (UNLIKELY(from == NULL)) { - err = CL_INVALID_KERNEL_NAME; - goto error; - } - - TRY_ALLOC(to, cl_kernel_dup(from)); - -exit: - if (errcode_ret) - *errcode_ret = err; - return to; -error: - cl_kernel_delete(to); - to = NULL; - goto exit; + return NULL; } diff --git a/src/cl_program.h b/src/cl_program.h index bf4b7ba2..53e182de 100644 --- a/src/cl_program.h +++ b/src/cl_program.h @@ -62,5 +62,13 @@ cl_program_create_from_binary(cl_context context, cl_int * binary_status, cl_int * errcode_ret); +/* Directly create a program from a LLVM source file */ +extern cl_program +cl_program_create_from_llvm(cl_context context, + cl_uint num_devices, + const cl_device_id * devices, + const char * fileName, + cl_int * errcode_ret); + #endif /* __CL_PROGRAM_H__ */ |