summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--CMake/FindGBE.cmake36
-rw-r--r--CMakeLists.txt8
-rw-r--r--include/CL/cl.h2
-rw-r--r--include/CL/cl_intel.h12
-rw-r--r--src/CMakeLists.txt6
-rw-r--r--src/cl_api.c14
-rw-r--r--src/cl_command_queue.c179
-rw-r--r--src/cl_command_queue_gen6.c230
-rw-r--r--src/cl_command_queue_gen7.c6
-rw-r--r--src/cl_device_id.c55
-rw-r--r--src/cl_gen6_device.h30
-rw-r--r--src/cl_kernel.c776
-rw-r--r--src/cl_kernel.h289
-rw-r--r--src/cl_program.c174
-rw-r--r--src/cl_program.h8
15 files changed, 83 insertions, 1742 deletions
diff --git a/CMake/FindGBE.cmake b/CMake/FindGBE.cmake
new file mode 100644
index 00000000..46704838
--- /dev/null
+++ b/CMake/FindGBE.cmake
@@ -0,0 +1,36 @@
+#
+# Try to find the GBE (Gen backend) library and include path.
+# Once done this will define
+#
+# GBE_FOUND        - 1 if both the header and the library were found
+# GBE_INCLUDE_PATH - directory where gen/program.h resides
+# GBE_LIBRARY      - the GBE library to link against
+#
+
+FIND_PATH(GBE_INCLUDE_PATH gen/program.h
+ ~/include/
+ /usr/include/
+ /usr/local/include/
+ /sw/include/
+ /opt/local/include/
+ DOC "The directory where gen/program.h resides")
+FIND_LIBRARY(GBE_LIBRARY
+ NAMES GBE gbe
+ PATHS
+ ~/lib/
+ /usr/lib64
+ /usr/lib
+ /usr/local/lib64
+ /usr/local/lib
+ /sw/lib
+ /opt/local/lib
+ DOC "The GBE library")
+# GBE is usable only when both the header directory and the library are found
+IF(GBE_INCLUDE_PATH AND GBE_LIBRARY)
+ SET(GBE_FOUND 1 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
+ELSE(GBE_INCLUDE_PATH AND GBE_LIBRARY)
+ SET(GBE_FOUND 0 CACHE STRING "Set to 1 if GBE is found, 0 otherwise")
+ENDIF(GBE_INCLUDE_PATH AND GBE_LIBRARY)
+
+MARK_AS_ADVANCED(GBE_FOUND)
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5e974c67..e6d9fee0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,14 @@ ELSE(XFIXES_FOUND)
MESSAGE(STATUS "Looking for Xfixes - not found")
ENDIF(XFIXES_FOUND)
+# Gen-backend (compiler)
+INCLUDE(CMake/FindGBE.cmake)
+IF(GBE_FOUND)
+ MESSAGE(STATUS "Looking for Gen-Backend - found")
+ELSE(GBE_FOUND)
+ MESSAGE(STATUS "Looking for Gen-Backend - not found")
+ENDIF(GBE_FOUND)
+
# the run-time itself
ADD_SUBDIRECTORY(src)
diff --git a/include/CL/cl.h b/include/CL/cl.h
index ddb18ece..8201afc0 100644
--- a/include/CL/cl.h
+++ b/include/CL/cl.h
@@ -625,7 +625,7 @@ clGetSamplerInfo(cl_sampler /* sampler */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
-
+
/* Program Object APIs */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithSource(cl_context /* context */,
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index 34f37288..9239bb56 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -46,16 +46,16 @@ clIntelPinBuffer(cl_mem);
extern CL_API_ENTRY cl_int CL_API_CALL
clIntelUnpinBuffer(cl_mem);
-/* Set the buffer where to report the performance counters. If NULL, nothing
- * will be report
- */
-extern CL_API_ENTRY cl_int CL_API_CALL
-clIntelSetReportBuffer(cl_command_queue, cl_mem);
-
/* Get the generation of the Gen device (used to load the proper binary) */
extern CL_API_ENTRY cl_int CL_API_CALL
clIntelGetGenVersion(cl_device_id device, cl_int *ver);
+/* Create a program from an LLVM source file */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithLLVM(cl_context /* context */,
+ const char * /* file */,
+ cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
#ifdef __cplusplus
}
#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 7837dd12..39392434 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,6 +1,7 @@
INCLUDE_DIRECTORIES(
${CMAKE_CURRENT_SOURCE_DIR}
${DRM_INCLUDE_PATH}
+ ${GBE_INCLUDE_PATH}
${CMAKE_CURRENT_SOURCE_DIR}/../include)
SET(OPENCL_SRC
@@ -16,7 +17,6 @@ SET(OPENCL_SRC
cl_device_id.c
cl_context.c
cl_command_queue.c
- cl_command_queue_gen6.c
cl_command_queue_gen7.c
intel/intel_gpgpu.c
intel/intel_batchbuffer.c
@@ -26,6 +26,7 @@ SET(OPENCL_SRC
ADD_LIBRARY(cl SHARED ${OPENCL_SRC})
TARGET_LINK_LIBRARIES(cl
+ ${GBE_LIBRARY}
${DRM_LIBRARY}
${DRM_INTEL_LIBRARY}
${XLIB_LIBRARY}
@@ -39,7 +40,6 @@ ADD_LIBRARY(cl_test STATIC
tests/cl_file_map.c)
TARGET_LINK_LIBRARIES(cl_test cl)
-ADD_EXECUTABLE(cl_inject cl_inject.c tests/cl_file_map.c)
ADD_EXECUTABLE(test_write_only tests/test_write_only.c)
ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c)
ADD_EXECUTABLE(test_copy_image tests/test_copy_image.c)
@@ -52,7 +52,6 @@ ADD_EXECUTABLE(test_local_memory tests/test_local_memory.c)
ADD_EXECUTABLE(test_private_memory tests/test_private_memory.c)
ADD_EXECUTABLE(test_constant_memory tests/test_constant_memory.c)
ADD_EXECUTABLE(test_memory_leak tests/test_memory_leak.c)
-ADD_EXECUTABLE(test_perf_report tests/test_perf_report.c)
ADD_EXECUTABLE(mandelbrot tests/mandelbrot.c)
ADD_EXECUTABLE(mersenneTwister tests/mersenneTwister.c)
ADD_EXECUTABLE(blackscholes tests/blackscholes.c)
@@ -76,7 +75,6 @@ TARGET_LINK_LIBRARIES(test_private_memory cl_test m)
TARGET_LINK_LIBRARIES(test_constant_memory cl_test m)
TARGET_LINK_LIBRARIES(test_memory_leak cl_test m)
TARGET_LINK_LIBRARIES(test_write_only cl_test m)
-TARGET_LINK_LIBRARIES(test_perf_report cl_test m)
TARGET_LINK_LIBRARIES(mandelbrot cl_test m)
TARGET_LINK_LIBRARIES(mersenneTwister cl_test m)
TARGET_LINK_LIBRARIES(blackscholes cl_test m)
diff --git a/src/cl_api.c b/src/cl_api.c
index 248b2ef5..c808b977 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1166,20 +1166,6 @@ error:
}
cl_int
-clIntelSetReportBuffer(cl_command_queue queue, cl_mem mem)
-{
- cl_int err = CL_SUCCESS;
- CHECK_QUEUE (queue);
- if (mem != NULL && mem->magic != CL_MAGIC_MEM_HEADER) {
- err = CL_INVALID_MEM;
- goto error;
- }
- err = cl_command_queue_set_report_buffer(queue, mem);
-error:
- return err;
-}
-
-cl_int
clIntelGetGenVersion(cl_device_id device, cl_int *ver)
{
return cl_device_get_version(device, ver);
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 9c1dab39..76170a56 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -103,168 +103,6 @@ cl_command_queue_add_ref(cl_command_queue queue)
atomic_inc(&queue->ref_n);
}
-static void
-cl_kernel_copy_image_parameters(cl_kernel k, cl_mem mem, int index, char *curbe)
-{
- cl_curbe_patch_info_t *info = NULL;
- uint64_t key;
- assert(curbe && mem && mem->is_image);
-
- key = cl_curbe_key(DATA_PARAMETER_IMAGE_WIDTH, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(curbe+info->offsets[0], &mem->w, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_IMAGE_HEIGHT, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(curbe+info->offsets[0], &mem->h, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_IMAGE_DEPTH, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(curbe+info->offsets[0], &mem->depth, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(curbe+info->offsets[0], &mem->fmt.image_channel_data_type, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_IMAGE_CHANNEL_ORDER, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(curbe+info->offsets[0], &mem->fmt.image_channel_order, sizeof(uint32_t));
-}
-
-LOCAL cl_int
-cl_command_queue_bind_surface(cl_command_queue queue,
- cl_kernel k,
- char *curbe,
- drm_intel_bo **local,
- drm_intel_bo **priv,
- drm_intel_bo **scratch,
- uint32_t local_sz)
-{
- cl_context ctx = queue->ctx;
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
- cl_mem mem = NULL;
- drm_intel_bo *bo = NULL, *sync_bo = NULL;
- const size_t max_thread = ctx->device->max_compute_unit;
- cl_int err = CL_SUCCESS;
- uint32_t i, index;
-
- /* Bind user defined surface */
- for (i = 0; i < k->arg_info_n; ++i) {
- assert(k->arg_info[i].offset % SURFACE_SZ == 0);
- index = k->arg_info[i].offset / SURFACE_SZ;
- mem = (cl_mem) k->args[k->arg_info[i].arg_index];
- assert(index != MAX_SURFACES - 1);
- CHECK_MEM(mem);
- bo = mem->bo;
- assert(bo);
- if (mem->is_image) {
- const int32_t w = mem->w, h = mem->h, pitch = mem->pitch;
- const uint32_t fmt = mem->intel_fmt;
- gpgpu_tiling_t tiling = GPGPU_NO_TILE;
- if (mem->tiling == CL_TILE_X)
- tiling = GPGPU_TILE_X;
- else if (mem->tiling == CL_TILE_Y)
- tiling = GPGPU_TILE_Y;
- gpgpu_bind_image2D(gpgpu, index, bo, fmt, w, h, pitch, tiling);
-
- /* Copy the image parameters (width, height) in the constant buffer if the
- * user requests them
- */
- cl_kernel_copy_image_parameters(k, mem, index, curbe);
- } else
- gpgpu_bind_buf(gpgpu, index, bo, cc_llc_l3);
- }
-
- /* Allocate the constant surface (if any) */
- if (k->const_bo) {
- assert(k->const_bo_index != MAX_SURFACES - 1);
- gpgpu_bind_buf(gpgpu, k->const_bo_index,
- k->const_bo,
- cc_llc_l3);
- }
-
- /* Allocate local surface needed for SLM and bind it */
- if (local && local_sz != 0) {
- const size_t sz = 16 * local_sz; /* XXX 16 == maximum barrier number */
- assert(k->patch.local_surf.offset % SURFACE_SZ == 0);
- index = k->patch.local_surf.offset / SURFACE_SZ;
- assert(index != MAX_SURFACES - 1);
- *local = drm_intel_bo_alloc(bufmgr, "CL local surface", sz, 64);
- gpgpu_bind_buf(gpgpu, index, *local, cc_llc_l3);
- }
- else if (local)
- *local = NULL;
-
- /* Allocate private surface and bind it */
- if (priv && k->patch.private_surf.size != 0) {
- const size_t sz = max_thread *
- k->patch.private_surf.size *
- k->patch.exec_env.largest_compiled_simd_sz;
- // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
- assert(k->patch.private_surf.offset % SURFACE_SZ == 0);
- index = k->patch.private_surf.offset / SURFACE_SZ;
- assert(index != MAX_SURFACES - 1);
- *priv = drm_intel_bo_alloc(bufmgr, "CL private surface", sz, 64);
- gpgpu_bind_buf(gpgpu, index, *priv, cc_llc_l3);
- }
- else if(priv)
- *priv = NULL;
-
- /* Allocate scratch surface and bind it */
- if (scratch && k->patch.scratch.size != 0) {
- const size_t sz = max_thread * /* XXX is it given per lane ??? */
- k->patch.scratch.size *
- k->patch.exec_env.largest_compiled_simd_sz;
- // assert(k->patch.exec_env.largest_compiled_simd_sz == 16);
- assert(k->patch.scratch.offset % SURFACE_SZ == 0);
- assert(index != MAX_SURFACES - 1);
- index = k->patch.scratch.offset / SURFACE_SZ;
- *scratch = drm_intel_bo_alloc(bufmgr, "CL scratch surface", sz, 64);
- gpgpu_bind_buf(gpgpu, index, *scratch, cc_llc_l3);
- }
- else if (scratch)
- *scratch = NULL;
-
- /* Now bind a bo used for synchronization */
- sync_bo = drm_intel_bo_alloc(bufmgr, "sync surface", 64, 64);
- gpgpu_bind_buf(gpgpu, MAX_SURFACES-1, sync_bo, cc_llc_l3);
- if (queue->last_batch != NULL)
- drm_intel_bo_unreference(queue->last_batch);
- queue->last_batch = sync_bo;
-
-error:
- assert(err == CL_SUCCESS); /* Cannot fail here */
- return err;
-}
-
-LOCAL cl_int
-cl_kernel_check_args(cl_kernel k)
-{
- uint32_t i;
- for (i = 0; i < k->arg_n; ++i)
- if (k->is_provided[i] == CL_FALSE)
- return CL_INVALID_KERNEL_ARGS;
- return CL_SUCCESS;
-}
-
-LOCAL cl_int
-cl_command_queue_set_report_buffer(cl_command_queue queue, cl_mem mem)
-{
- cl_int err = CL_SUCCESS;
- if (queue->perf != NULL) {
- cl_mem_delete(queue->perf);
- queue->perf = NULL;
- }
- if (mem != NULL) {
- if (drm_intel_bo_get_size(mem->bo) < 1024) { /* 1K for the performance counters is enough */
- err = CL_INVALID_BUFFER_SIZE;
- goto error;
- }
- cl_mem_add_ref(mem);
- queue->perf = mem;
- }
-
-error:
- return err;
-}
-
#if USE_FULSIM
extern void drm_intel_bufmgr_gem_stop_aubfile(drm_intel_bufmgr*);
extern void drm_intel_bufmgr_gem_set_aubfile(drm_intel_bufmgr*, FILE*);
@@ -303,8 +141,9 @@ static const size_t chunk_sz = 8192u;
static cl_int
cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
{
- cl_mem mem = NULL;
cl_int err = CL_SUCCESS;
+#if 0
+ cl_mem mem = NULL;
int i;
size_t j;
@@ -323,6 +162,7 @@ cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder);
}
error:
+#endif
return err;
}
@@ -345,6 +185,7 @@ struct bmphdr {
/* raw b, g, r data here, dword aligned per scan line */
};
+#if 0
static int*
cl_read_bmp(const char *filename, int *width, int *height)
{
@@ -426,16 +267,17 @@ cl_read_dump(const char *name, size_t *size)
*size = sz;
return dump;
}
+#endif
static cl_int
cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
{
+ cl_int err = CL_SUCCESS;
+#if 0
cl_mem mem = NULL;
char *from = NULL, *to = NULL;
size_t size, j, chunk_n, chunk_remainder;
- cl_int err = CL_SUCCESS;
int i, curr = 0;
-
/* Bind user defined surface */
for (i = 0; i < k->arg_info_n; ++i) {
if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER)
@@ -475,11 +317,12 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
cl_mem_unmap(mem);
}
error:
+#endif
return err;
+
}
#endif /* USE_FULSIM */
-extern cl_int cl_command_queue_ND_range_gen6(cl_command_queue, cl_kernel, const size_t*, const size_t*, const size_t*);
extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
LOCAL cl_int
@@ -501,9 +344,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
drm_intel_bufmgr_gem_set_aubfile(bufmgr, file);
#endif /* USE_FULSIM */
- if (ver == 6)
- TRY (cl_command_queue_ND_range_gen6, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
- else if (ver == 7 || ver == 75)
+ if (ver == 7 || ver == 75)
TRY (cl_command_queue_ND_range_gen7, queue, k, global_wk_off, global_wk_sz, local_wk_sz);
else
FATAL ("Unknown Gen Device");
diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c
deleted file mode 100644
index a08ff410..00000000
--- a/src/cl_command_queue_gen6.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-#include "cl_command_queue.h"
-#include "cl_context.h"
-#include "cl_program.h"
-#include "cl_kernel.h"
-#include "cl_device_id.h"
-#include "cl_mem.h"
-#include "cl_utils.h"
-#include "cl_alloc.h"
-
-#ifdef _PLASMA
-#include "plasma/plasma_export.h"
-#else
-#include "intel_bufmgr.h"
-#include "intel/intel_gpgpu.h"
-#endif
-
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-
-/* Header used by kernels */
-typedef struct cl_inline_header {
- uint32_t grp_n[3];
- uint32_t local_sz[3];
- uint32_t exec_mask;
- uint32_t local_mem_sz;
-} cl_inline_header_t;
-
-/* ID inside the work group */
-typedef struct cl_local_id {
- uint16_t data[16];
-} cl_local_id_t;
-
-static INLINE size_t
-cl_kernel_compute_batch_sz(cl_kernel k, size_t wk_grp_n, size_t thread_n)
-{
- size_t sz = 256; /* upper bound of the complete prelude */
- size_t media_obj_sz = 6 * 4; /* size of one MEDIA OBJECT */
- media_obj_sz += sizeof(cl_inline_header_t); /* header for all threads */
- media_obj_sz += 3 * sizeof(cl_local_id_t);/* for each dimension */
- if (k->patch.exec_env.has_barriers)
- media_obj_sz += 4 * 4; /* one barrier update per object */
- sz += media_obj_sz * wk_grp_n * thread_n;
- return sz;
-}
-
-static INLINE void
-cl_command_queue_enqueue_wk_grp(cl_command_queue queue,
- cl_local_id_t **ids,
- const cl_inline_header_t *header,
- uint32_t thread_n,
- uint32_t barrierID)
-{
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- uint32_t i;
- for (i = 0; i < thread_n; ++i) {
- const size_t sz = sizeof(cl_inline_header_t) + 3*sizeof(cl_local_id_t);
- char *data = gpgpu_run_with_inline(gpgpu, barrierID, sz);
- size_t offset = 0;
- assert(data);
- *((cl_inline_header_t *) (data + offset)) = *header;
- offset += sizeof(cl_inline_header_t);
- *((cl_local_id_t *) (data + offset)) = ids[0][i];
- offset += sizeof(cl_local_id_t);
- *((cl_local_id_t *) (data + offset)) = ids[1][i];
- offset += sizeof(cl_local_id_t);
- *((cl_local_id_t *) (data + offset)) = ids[2][i];
- }
-}
-
-LOCAL cl_int
-cl_command_queue_ND_range_gen6(cl_command_queue queue,
- cl_kernel ker,
- const size_t *global_wk_off,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz)
-{
- cl_context ctx = queue->ctx;
- intel_gpgpu_t *gpgpu = queue->gpgpu;
- drm_intel_bo *slm_bo = NULL, *private_bo = NULL, *scratch_bo = NULL;
- char *curbe = NULL; /* constant buffer */
- const size_t cst_sz = ker->patch.curbe.sz;
- size_t wk_grp_sz, wk_grp_n, batch_sz;
- uint32_t grp_end[3], offset[3], thread_n; /* per work group */
- uint32_t i, j, k, curr;
- uint32_t barrierID = 0;
- cl_inline_header_t header;
- cl_local_id_t *ids[3] = {NULL,NULL,NULL};
- cl_int err = CL_SUCCESS;
-
- /* Allocate 16 kernels (one per barrier) */
- genx_gpgpu_kernel_t kernels[16];
- for (i = 0; i < 16; ++i) {
- kernels[i].name = "OCL kernel";
- kernels[i].grf_blocks = 128;
- kernels[i].cst_sz = cst_sz;
- kernels[i].bin = NULL,
- kernels[i].size = 0,
- kernels[i].bo = ker->bo;
- kernels[i].barrierID = i;
- kernels[i].use_barrier = 0; /* unused in gen6 */
- kernels[i].thread_n = 0; /* unused in gen6 */
- }
-
- /* All arguments must have been set */
- TRY (cl_kernel_check_args, ker);
-
- /* Check that the local work sizes are OK */
- TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &wk_grp_sz);
-
- /* Directly from the user defined values */
- header.local_sz[0] = local_wk_sz[0];
- header.local_sz[1] = local_wk_sz[1];
- header.local_sz[2] = local_wk_sz[2];
- offset[0] = header.grp_n[0] = 0;
- offset[1] = header.grp_n[1] = 0;
- offset[2] = header.grp_n[2] = 0;
- header.exec_mask = ~0;
-
- /* offsets are evenly divided by the local sizes */
- offset[0] = global_wk_off[0] / local_wk_sz[0];
- offset[1] = global_wk_off[1] / local_wk_sz[1];
- offset[2] = global_wk_off[2] / local_wk_sz[2];
-
- /* Compute the local size per wg and the offsets for each local buffer */
- header.local_mem_sz = cl_kernel_local_memory_sz(ker);
-
- /* Create the constant buffer */
- if (cst_sz > 0) {
- assert(ker->cst_buffer);
- curbe = cl_kernel_create_cst_buffer(ker, global_wk_off, global_wk_sz, local_wk_sz, 0, 0);
- }
-
- /* Only if we want to monitor performance for this kernel */
- if (queue->perf)
- gpgpu_set_perf_counters(gpgpu, queue->perf->bo);
-
- /* Setup the kernel */
- gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32);
- if (queue->last_batch != NULL)
- drm_intel_bo_unreference(queue->last_batch);
- queue->last_batch = NULL;
- cl_command_queue_bind_surface(queue,
- ker,
- curbe,
- &slm_bo,
- &private_bo,
- &scratch_bo,
- header.local_mem_sz);
-
- /* Upload the __constant samplers if any */
- const void *samplers = ker->dynamic_heap + ker->patch.sampler_state.offset;
- const uint32_t sampler_n = ker->patch.sampler_state.count;
- gpgpu_upload_samplers(gpgpu, samplers, sampler_n);
-
- gpgpu_states_setup(gpgpu, kernels, 16);
-
- /* Fill the constant buffer */
- if (cst_sz > 0) {
- gpgpu_upload_constants(gpgpu, curbe, cst_sz);
- cl_free(curbe);
- }
-
- wk_grp_n = 1;
- for (i = 0; i < 3; ++i) {
- TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wk_grp_sz*sizeof(uint16_t)));
- grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i];
- wk_grp_n *= grp_end[i]-offset[i];
- }
- thread_n = wk_grp_sz / 16;
- batch_sz = cl_kernel_compute_batch_sz(ker, wk_grp_n, thread_n);
-
- /* Start a new batch buffer */
- gpgpu_batch_reset(gpgpu, batch_sz);
- gpgpu_batch_start(gpgpu);
-
- /* Push all media objects. We implement three paths to make it (a bit) faster.
- * Local IDs are shared from work group to work group. We allocate once the
- * buffers and reuse them
- */
- curr = 0;
- for (k = 0; k < local_wk_sz[2]; ++k)
- for (j = 0; j < local_wk_sz[1]; ++j)
- for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
- ((uint16_t*) ids[0])[curr] = i;
- ((uint16_t*) ids[1])[curr] = j;
- ((uint16_t*) ids[2])[curr] = k;
- }
- for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0])
- for (header.grp_n[1] = offset[1]; header.grp_n[1] < grp_end[1]; ++header.grp_n[1])
- for (header.grp_n[2] = offset[2]; header.grp_n[2] < grp_end[2]; ++header.grp_n[2]) {
- if (ker->patch.exec_env.has_barriers)
- gpgpu_update_barrier(gpgpu, barrierID, thread_n);
- cl_command_queue_enqueue_wk_grp(queue, ids, &header, thread_n, barrierID);
- barrierID = (barrierID + 1) % 16;
- }
-
- gpgpu_batch_end(gpgpu, 0);
- gpgpu_flush(gpgpu);
-
- if (slm_bo) drm_intel_bo_unreference(slm_bo);
- if (private_bo) drm_intel_bo_unreference(private_bo);
- if (scratch_bo) drm_intel_bo_unreference(scratch_bo);
-
-error:
- cl_free(ids[0]);
- cl_free(ids[1]);
- cl_free(ids[2]);
- return err;
-}
-
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 35f7f5ef..45970265 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -100,13 +100,15 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
const size_t *global_wk_sz,
const size_t *local_wk_sz)
{
+#if 0
cl_context ctx = queue->ctx;
intel_gpgpu_t *gpgpu = queue->gpgpu;
drm_intel_bo *private_bo = NULL, *scratch_bo = NULL;
char *curbe = NULL; /* Does not include per-thread local IDs */
char *final_curbe = NULL; /* Includes them */
genx_gpgpu_kernel_t kernel;
- const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz;
+ //const size_t simd_sz = ker->patch.exec_env.largest_compiled_simd_sz;
+ const size_t simd_sz = 16;
size_t local_sz, batch_sz, cst_sz = ker->patch.curbe.sz;
size_t i, thread_n, id_offset;
cl_int err = CL_SUCCESS;
@@ -176,5 +178,7 @@ error:
cl_free(final_curbe);
cl_free(curbe);
return err;
+#endif
+ return CL_SUCCESS;
}
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index f7492eb3..e4457bee 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -33,30 +33,6 @@
#include <stdio.h>
#include <string.h>
-static struct _cl_device_id intel_snb_gt2_device = {
- .max_compute_unit = 60,
- .max_work_item_sizes = {512, 512, 512},
- .max_work_group_size = 512,
- .max_clock_frequency = 1350,
- /* Does not really belong here, but for now this seems the most
- * natural place to put it */
- .wg_sz = 512,
- .compile_wg_sz = {0},
-
-#include "cl_gen6_device.h"
-};
-
-static struct _cl_device_id intel_snb_gt1_device = {
- .max_compute_unit = 24,
- .max_work_item_sizes = {256, 256, 256},
- .max_work_group_size = 256,
- .max_clock_frequency = 1000,
- .wg_sz = 256,
- .compile_wg_sz = {0},
-
-#include "cl_gen6_device.h"
-};
-
static struct _cl_device_id intel_ivb_gt2_device = {
.max_compute_unit = 128,
.max_work_item_sizes = {512, 512, 512},
@@ -119,21 +95,6 @@ cl_get_gt_device(void)
intel_ivb_gt2_device.platform = intel_platform;
ret = &intel_ivb_gt2_device;
}
- else if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 ||
- device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 ||
- device_id == PCI_CHIP_SANDYBRIDGE_S_GT) {
- intel_snb_gt1_device.vendor_id = device_id;
- intel_snb_gt1_device.platform = intel_platform;
- ret = &intel_snb_gt1_device;
- }
- else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2 ||
- device_id == PCI_CHIP_SANDYBRIDGE_M_GT2 ||
- device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS ||
- device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) {
- intel_snb_gt2_device.vendor_id = device_id;
- intel_snb_gt2_device.platform = intel_platform;
- ret = &intel_snb_gt2_device;
- }
return ret;
}
@@ -196,9 +157,7 @@ cl_get_device_info(cl_device_id device,
void * param_value,
size_t * param_value_size_ret)
{
- if (UNLIKELY(device != &intel_snb_gt1_device &&
- device != &intel_snb_gt2_device &&
- device != &intel_ivb_gt1_device &&
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
device != &intel_ivb_gt2_device &&
device != &intel_hsw_device))
return CL_INVALID_DEVICE;
@@ -272,17 +231,13 @@ cl_get_device_info(cl_device_id device,
LOCAL cl_int
cl_device_get_version(cl_device_id device, cl_int *ver)
{
- if (UNLIKELY(device != &intel_snb_gt1_device &&
- device != &intel_snb_gt2_device &&
- device != &intel_ivb_gt1_device &&
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
device != &intel_ivb_gt2_device &&
device != &intel_hsw_device))
return CL_INVALID_DEVICE;
if (ver == NULL)
return CL_SUCCESS;
- if (device == &intel_snb_gt1_device || device == &intel_snb_gt2_device)
- *ver = 6;
- else if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device)
+ if (device == &intel_ivb_gt1_device || device == &intel_ivb_gt2_device)
*ver = 7;
else
*ver = 75;
@@ -308,9 +263,7 @@ cl_get_kernel_workgroup_info(cl_device_id device,
void* param_value,
size_t* param_value_size_ret)
{
- if (UNLIKELY(device != &intel_snb_gt1_device &&
- device != &intel_snb_gt2_device &&
- device != &intel_ivb_gt1_device &&
+ if (UNLIKELY(device != &intel_ivb_gt1_device &&
device != &intel_ivb_gt2_device))
return CL_INVALID_DEVICE;
if (UNLIKELY(param_value == NULL))
diff --git a/src/cl_gen6_device.h b/src/cl_gen6_device.h
deleted file mode 100644
index b09121fd..00000000
--- a/src/cl_gen6_device.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia@intel.com>
- */
-
-/* Common fields for both SNB devices (either GT1 or GT2)
- */
-.max_parameter_size = 256,
-.global_mem_cache_line_size = 128, /* XXX */
-.global_mem_cache_size = 8 << 10, /* XXX */
-.local_mem_type = CL_GLOBAL,
-.local_mem_size = 16 << 10,
-.gfx_id = IGFX_GEN6_CORE,
-
-#include "cl_gt_device.h"
-
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 6668328d..69302119 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -40,49 +40,6 @@
#include <stdint.h>
#include <assert.h>
-static void
-cl_arg_list_destroy(cl_arg_info_t *arg_info)
-{
- cl_arg_info_t *next_arg_info = NULL;
- while (arg_info) {
- next_arg_info = arg_info->next;
- cl_free(arg_info);
- arg_info = next_arg_info;
- }
-}
-
-static void
-cl_curbe_list_destroy(cl_curbe_patch_info_t *curbe_info)
-{
- cl_curbe_patch_info_t *next_curbe_info = NULL;
- while (curbe_info) {
- next_curbe_info = curbe_info->next;
- cl_free(curbe_info);
- curbe_info = next_curbe_info;
- }
-}
-
-/* Header for all internal objects (cl_mem_object, cl_kernel_object, ...) */
-typedef struct cl_object_header {
- uint64_t magic;
- volatile int ref_n;
-} cl_object_header_t;
-
-static void
-cl_kernel_release_args(cl_kernel k)
-{
- uint32_t i;
- assert(k->args);
- for (i = 0; i < k->arg_n; ++i) {
- cl_object_header_t *header = (cl_object_header_t *) k->args[i];
- if (header == NULL)
- continue;
- FATAL_IF (header->magic != CL_MAGIC_MEM_HEADER,
- "A non memory object was set as an argument");
- cl_mem_delete((cl_mem)header);
- }
-}
-
LOCAL void
cl_kernel_delete(cl_kernel k)
{
@@ -92,24 +49,6 @@ cl_kernel_delete(cl_kernel k)
/* We are not done with the kernel */
if (atomic_dec(&k->ref_n) > 1) return;
- /* User may have set some OCL object as arguments. As we referenced them when
- * we set them, we release all their references here
- */
- if (k->args) cl_kernel_release_args(k);
-
- /* Free the chain lists (may also be arrays) */
- cl_arg_list_destroy(k->arg_info);
- cl_curbe_list_destroy(k->curbe_info);
-
- /* Free the CURBE data */
- cl_free(k->cst_buffer);
-
- /* Free the argument array */
- cl_free(k->args);
-
- /* Free the array to track argument setting */
- cl_free(k->is_provided);
-
/* Release one reference on all bos we own */
if (k->bo) drm_intel_bo_unreference(k->bo);
if (k->const_bo) drm_intel_bo_unreference(k->const_bo);
@@ -117,7 +56,6 @@ cl_kernel_delete(cl_kernel k)
/* This will be true for kernels created by clCreateKernel */
if (k->ref_its_program) cl_program_delete(k->program);
- cl_free(k->name);
k->magic = CL_MAGIC_DEAD_HEADER; /* For safety */
cl_free(k);
}
@@ -144,725 +82,11 @@ cl_kernel_add_ref(cl_kernel k)
atomic_inc(&k->ref_n);
}
-static void
-cl_kernel_chain_arg(cl_kernel k, cl_arg_info_t *arg_info)
-{
- cl_arg_info_t *next = k->arg_info;
- arg_info->next = next;
- k->arg_info = arg_info;
-}
-
-static void
-cl_kernel_chain_curbe(cl_kernel k, cl_curbe_patch_info_t *curbe_info)
-{
- cl_curbe_patch_info_t *next = k->curbe_info;
- curbe_info->next = next;
- k->curbe_info = curbe_info;
-}
-
-static INLINE cl_curbe_patch_info_t*
-cl_kernel_get_curbe_info_list(cl_kernel k, uint64_t key)
-{
- cl_curbe_patch_info_t *curbe_info = k->curbe_info;
- while (curbe_info) {
- if (curbe_info->key == key) break;
- curbe_info = curbe_info->next;
- }
- return curbe_info;
-}
-
-static INLINE cl_curbe_patch_info_t*
-cl_kernel_new_curbe_info(cl_kernel k, cl_patch_data_parameter_buffer_t *data)
-{
- cl_curbe_patch_info_t *curbe = NULL;
-
- TRY_ALLOC_NO_ERR (curbe, CALLOC(cl_curbe_patch_info_t));
- curbe->type = data->type;
- curbe->arg_index = data->index;
- curbe->offsets[0] = data->offset;
- curbe->sz = data->data_sz;
- curbe->src_offset = data->src_offset;
- curbe->is_local = CL_FALSE;
- curbe->last = 0;
- cl_kernel_chain_curbe(k, curbe);
-
-exit:
- return curbe;
-error:
- cl_free(curbe);
- curbe = NULL;
- goto exit;
-}
-
-static int
-cl_arg_cmp(const void *a, const void *b)
-{
- const cl_arg_info_t *arg0 = (const cl_arg_info_t *) a;
- const cl_arg_info_t *arg1 = (const cl_arg_info_t *) b;
- return arg0->arg_index > arg1->arg_index;
-}
-
-static int
-cl_curbe_cmp(const void *a, const void *b)
-{
- const cl_curbe_patch_info_t *curbe0 = (const cl_curbe_patch_info_t *) a;
- const cl_curbe_patch_info_t *curbe1 = (const cl_curbe_patch_info_t *) b;
- return curbe0->key > curbe1->key;
-}
-
-static cl_int
-cl_kernel_sort_arg_list(cl_kernel k)
-{
- cl_arg_info_t *arg_info = NULL;
- cl_arg_info_t *array = NULL;
- cl_int arg_info_n = 0;
- cl_int err = CL_SUCCESS;
-
- /* How many arguments do we have? */
- arg_info = k->arg_info;
- while (arg_info) {
- arg_info_n++;
- arg_info = arg_info->next;
- }
-
- /* Now fill the array with the unsorted arguments */
- TRY_ALLOC (array, CALLOC_ARRAY(cl_arg_info_t, arg_info_n));
- arg_info = k->arg_info;
- arg_info_n = 0;
- while (arg_info) {
- array[arg_info_n++] = *arg_info;
- array->next = NULL;
- arg_info = arg_info->next;
- }
-
- /* Sort the argument list array */
- qsort(array, arg_info_n, sizeof(cl_arg_info_t), cl_arg_cmp);
-
- /* Replace the list by the array */
- cl_arg_list_destroy(k->arg_info);
- k->arg_info = array;
- k->arg_info_n = arg_info_n;
-
-exit:
- return err;
-error:
- cl_free(array);
- goto exit;
-}
-
-static cl_int
-cl_kernel_sort_curbe_info_list(cl_kernel k)
-{
- cl_curbe_patch_info_t *curbe_info = NULL;
- cl_curbe_patch_info_t *array = NULL;
- cl_int curbe_info_n = 0;
- cl_int err = CL_SUCCESS;
-
- /* How many curbe info do we have? */
- curbe_info = k->curbe_info;
- while (curbe_info) {
- curbe_info_n++;
- curbe_info = curbe_info->next;
- }
-
- /* Now fill the array with the unsorted curbe info */
- TRY_ALLOC (array, CALLOC_ARRAY(cl_curbe_patch_info_t, curbe_info_n));
- curbe_info = k->curbe_info;
- curbe_info_n = 0;
- while (curbe_info) {
- array[curbe_info_n++] = *curbe_info;
- array->next = NULL;
- curbe_info = curbe_info->next;
- }
-
- /* Sort the curbe list array */
- qsort(array, curbe_info_n, sizeof(cl_curbe_patch_info_t), cl_curbe_cmp);
-
- /* Replace the list by the array */
- cl_curbe_list_destroy(k->curbe_info);
- k->curbe_info = array;
- k->curbe_info_n = curbe_info_n;
- k->curbe_info->next = NULL;
-
-exit:
- return err;
-error:
- cl_free(array);
- goto exit;
-}
-
-#define ASSOC_ITEM(ENUM,TYPE,FIELD) \
- case JOIN(PATCH_TOKEN_, ENUM): \
- info->FIELD = *(JOIN(JOIN(cl_patch_,TYPE),_t)*) patch; \
- break;
-
-static cl_int
-cl_kernel_allocate_inline_buffer(cl_kernel k,
- cl_patch_alloc_surf_with_init_t *init,
- const char **patch,
- size_t *read)
-{
- drm_intel_bufmgr *bufmgr = NULL;
- const size_t sz = init->sz;
- cl_int err = CL_SUCCESS;
-
- FATAL_IF (init->offset % SURFACE_SZ, "Bad alignment for inline buffer offset");
- FATAL_IF (k->const_bo != NULL, "inline buffer already declared");
- assert(k->program && k->program->ctx);
- bufmgr = cl_context_get_intel_bufmgr(k->program->ctx);
- TRY_ALLOC (k->const_bo, drm_intel_bo_alloc(bufmgr,
- "Inline buffer",
- sz,
- 64));
- drm_intel_bo_subdata(k->const_bo, 0, sz, &init->data);
- k->const_bo_index = init->offset / SURFACE_SZ;
- *read += sz;
- *patch += sz;
-
-error:
- return err;
-}
-
-static cl_int
-cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
-{
- cl_kernel_patch_info_t *info = &k->patch;
- cl_curbe_patch_info_t *curbe_info = NULL;
- cl_arg_info_t *arg_info = NULL;
- uint64_t curbe_key;
- size_t read = 0;
- cl_int err = CL_SUCCESS;
-
- while (read < sz) {
- const cl_patch_item_header_t *item = (const cl_patch_item_header_t *) patch;
- switch (item->token) {
- case PATCH_TOKEN_MEDIA_VFE_STATE: break;
- ASSOC_ITEM (MEDIA_INTERFACE_DESCRIPTOR_LOAD, interface_desc_load, idrt);
- ASSOC_ITEM (STATE_SIP, sip, sip);
- ASSOC_ITEM (MEDIA_CURBE_LOAD, curbe_load, curbe);
- ASSOC_ITEM (SAMPLER_STATE_ARRAY, sampler_state_array, sampler_state);
- ASSOC_ITEM (INTERFACE_DESCRIPTOR_DATA, interface_desc_data, surf_desc);
- ASSOC_ITEM (BINDING_TABLE_STATE, binding_table_state, binding_table);
- ASSOC_ITEM (ALLOCATE_SCRATCH_SURFACE, alloc_scratch_surf, scratch);
- ASSOC_ITEM (ALLOCATE_PRIVATE_MEMORY, alloc_private_memory_surf, private_surf);
- ASSOC_ITEM (ALLOCATE_LOCAL_SURFACE, alloc_local_surf, local_surf);
- ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env);
- ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload);
-
- case PATCH_TOKEN_DATA_PARAMETER_STREAM:
- info->curbe.sz = *(uint32_t *) (patch + sizeof(cl_patch_item_header_t));
- info->curbe.offset = 0;
- break;
- case PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT:
- case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT:
- case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT:
- {
-
- TRY_ALLOC (arg_info, CALLOC(cl_arg_info_t));
- if (item->token == PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT) {
- cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
- arg_info->arg_index = from->index;
- arg_info->offset = from->offset;
- arg_info->type = OCLRT_ARG_TYPE_BUFFER;
- }
- else if (item->token == PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT) {
- cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
- arg_info->arg_index = from->index;
- arg_info->offset = from->offset;
- arg_info->type = OCLRT_ARG_TYPE_CONST;
- }
-#if USE_OLD_COMPILER
- else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) {
- cl_global_memory_object_arg_t *from = (cl_global_memory_object_arg_t *) patch;
- arg_info->arg_index = from->index;
- arg_info->offset = from->offset;
- arg_info->type = OCLRT_ARG_TYPE_IMAGE;
- }
-#else
- else if (item->token == PATCH_TOKEN_IMAGE_MEMORY_KERNEL_ARGUMENT) {
- cl_image_memory_object_arg_t *from = (cl_image_memory_object_arg_t *) patch;
- arg_info->arg_index = from->index;
- arg_info->offset = from->offset;
- arg_info->type = OCLRT_ARG_TYPE_IMAGE;
- }
-#endif
- else
- assert(0);
-
- arg_info->sz = sizeof(cl_mem);
- arg_info->is_patched = CL_FALSE;
-
- /* Chain the argument to the next arguments */
- cl_kernel_chain_arg(k, arg_info);
- k->arg_n = MAX(k->arg_n, arg_info->arg_index);
- k->arg_info_n++;
- }
- break;
-
- case PATCH_TOKEN_ALLOCATE_SURFACE_WITH_INITIALIZATION:
- {
- cl_patch_alloc_surf_with_init_t *from = (cl_patch_alloc_surf_with_init_t *) patch;
- TRY (cl_kernel_allocate_inline_buffer, k, from, &patch, &read);
- }
- break;
-
- case PATCH_TOKEN_DATA_PARAMETER_BUFFER:
- {
- cl_patch_data_parameter_buffer_t *data = (cl_patch_data_parameter_buffer_t *) patch;
- switch (data->type)
- {
- case DATA_PARAMETER_KERNEL_ARGUMENT:
- case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES:
- case DATA_PARAMETER_LOCAL_WORK_SIZE:
- case DATA_PARAMETER_GLOBAL_WORK_SIZE:
- case DATA_PARAMETER_GLOBAL_WORK_OFFSET:
- case DATA_PARAMETER_NUM_WORK_GROUPS:
- case DATA_PARAMETER_WORK_DIMENSIONS:
- case DATA_PARAMETER_IMAGE_WIDTH:
- case DATA_PARAMETER_IMAGE_HEIGHT:
- case DATA_PARAMETER_IMAGE_DEPTH:
- case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE:
- case DATA_PARAMETER_IMAGE_CHANNEL_ORDER:
- case DATA_PARAMETER_NUM_HARDWARE_THREADS:
- {
-#if USE_OLD_COMPILER == 0
- if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES)
- curbe_key = cl_curbe_key(data->type, data->index, 0);
- else
-#endif
- curbe_key = cl_curbe_key(data->type, data->index, data->src_offset);
- curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key);
- if (curbe_info != NULL)
- curbe_info->offsets[++curbe_info->last] = data->offset;
- else
- TRY_ALLOC (curbe_info, cl_kernel_new_curbe_info(k, data));
- curbe_info->key = curbe_key;
- curbe_info->is_patched = CL_FALSE;
- curbe_info = NULL;
- k->arg_n = MAX(k->arg_n, data->index);
-
- /* We will need to allocate a local surface */
- if (data->type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES)
- k->has_local_buffer = CL_TRUE;
- break;
- }
- default: NOT_IMPLEMENTED;
- }
- }
- break;
- default:
- FATAL("Undefined item in patch list");
- break;
- }
- patch += item->size;
- read += item->size;
- }
-
- if (k->patch.local_surf.sz != 0)
- k->has_local_buffer = CL_TRUE;
-
- /* k->arg_n was the offset of the last argument. Turn it into an argument
- * number
- */
- k->arg_n++;
-
- /* Transform the argument and the curbe info lists into sorted arrays */
- if (k->arg_info)
- TRY (cl_kernel_sort_arg_list, k);
- if (k->curbe_info)
- TRY (cl_kernel_sort_curbe_info_list, k);
-
-error:
- return err;
-}
-
-#undef ASSOC_ITEM
-
-LOCAL int
-cl_kernel_setup(cl_kernel k, const char *ker)
-{
- drm_intel_bufmgr *bufmgr = NULL;
- int err = 0;
-
- /* Kernel instruction */
- FATAL_IF (k->kernel_heap_sz == 0, "No instruction found for this kernel");
- k->kernel_heap = ker;
- ker += k->kernel_heap_sz;
-
- /* No general heap */
- FATAL_IF (k->general_heap_sz, "General heap unsupported");
-
- /* Dynamic heap */
- if (k->dynamic_heap_sz) {
- k->dynamic_heap = ker;
- ker += k->dynamic_heap_sz;
- }
-
- /* Surface state heap */
- if (k->surface_heap_sz) {
- k->surface_heap = ker;
- ker += k->surface_heap_sz;
- }
-
- /* Patch list */
- if (k->patch_list_sz) {
- k->patch_list = ker;
- ker += k->patch_list_sz;
- }
-
- /* Read all the patch elements */
- TRY (cl_kernel_setup_patch_list, k, k->patch_list, k->patch_list_sz);
-
- /* Create the kernel in GPU memory */
- assert(k->program && k->program->ctx);
- bufmgr = cl_context_get_intel_bufmgr(k->program->ctx);
- assert(bufmgr);
- TRY_ALLOC (k->bo, drm_intel_bo_alloc(bufmgr,
- "OCL kernel",
- k->kernel_heap_sz,
- 64));
- drm_intel_bo_subdata(k->bo, 0, k->kernel_heap_sz, k->kernel_heap);
-
- /* We have some restrictions on the compiled binary for SNB */
- FATAL_IF (k->program->ctx->ver == 6 &&
- k->patch.exec_env.largest_compiled_simd_sz != 16, "Unsupported SIMD size");
- FATAL_IF (k->program->ctx->ver == 6 &&
- k->patch.exec_env.compiled_simd16 == 0, "Unsupported SIMD size");
- FATAL_IF (k->program->ctx->ver > 6 &&
- k->patch.exec_env.largest_compiled_simd_sz == 32, "Unsupported SIMD size");
-
-error:
- return err;
-}
-
-LOCAL cl_kernel
-cl_kernel_dup(cl_kernel from)
-{
- cl_kernel to = NULL;
- size_t name_sz = 0;
- size_t cst_buffer_sz = 0;
-
- assert(from);
- TRY_ALLOC_NO_ERR (to, CALLOC(struct _cl_kernel));
- *to = *from; /* most fields do not belong to the kernel but the program */
- to->ref_n = 1;
- name_sz = strlen(from->name) + 1; /* zero terminated */
- TRY_ALLOC_NO_ERR (to->name, CALLOC_ARRAY(char, name_sz));
- memcpy(to->name, from->name, name_sz);
-
- /* Duplicate the argument info list */
- if (from->arg_info != NULL) {
- assert(from->arg_info_n != 0);
- assert(from->arg_info->next == NULL);
- TRY_ALLOC_NO_ERR (to->arg_info, CALLOC_ARRAY(cl_arg_info_t, from->arg_info_n));
- memcpy(to->arg_info, from->arg_info, sizeof(cl_arg_info_t) * from->arg_info_n);
- }
-
- /* Duplicate the curbe info */
- if (from->curbe_info != NULL) {
- assert(from->curbe_info_n != 0);
- assert(from->curbe_info->next == NULL);
- TRY_ALLOC_NO_ERR (to->curbe_info,
- CALLOC_ARRAY(cl_curbe_patch_info_t, from->curbe_info_n));
- memcpy(to->curbe_info,
- from->curbe_info,
- sizeof(cl_curbe_patch_info_t) * from->curbe_info_n);
- }
-
- /* This kernel (used outside the internal code) will need to see its CURBE
- * updated when setting arguments
- */
- cst_buffer_sz = ALIGN(to->patch.curbe.sz, 32);
- if (cst_buffer_sz)
- TRY_ALLOC_NO_ERR (to->cst_buffer, cl_malloc(cst_buffer_sz));
-
- /* We store for each argument the buffer currently set */
- TRY_ALLOC_NO_ERR (to->args, CALLOC_ARRAY(void*, to->arg_n));
-
- /* We track here that all arguments are provided by the user */
- TRY_ALLOC_NO_ERR (to->is_provided, CALLOC_ARRAY(uint8_t, to->arg_n));
-
- /* Retain the bos */
- if (from->bo) drm_intel_bo_reference(from->bo);
- if (from->const_bo) drm_intel_bo_reference(from->const_bo);
-
- /* We retain the program destruction since this kernel (user allocated)
- * depends on the program for some of its pointers
- */
- assert(from->program);
- cl_program_add_ref(from->program);
- to->ref_its_program = CL_TRUE;
-
-exit:
- return to;
-error:
- cl_free(to->arg_info);
- cl_free(to->curbe_info);
- to->arg_info = NULL;
- to->curbe_info = NULL;
- cl_kernel_delete(to);
- to = NULL;
- goto exit;
-}
-
-/* arg_info / curbe_info are sorted. Just use a dichotomic search */
-#define DECL_DICHO_SEARCH(FN, TYPE, KEY_TYPE, FIELD, SUB_FIELD) \
-LOCAL TYPE* \
-JOIN(cl_kernel_get_,FN)(cl_kernel k, KEY_TYPE index) \
-{ \
- uint32_t begin = 0, end = k->JOIN(FN,_n); \
- \
- while (end > begin) { \
- uint32_t mid = (begin + end) / 2; \
- if (k->FIELD[mid].SUB_FIELD == index) \
- return k->FIELD + mid; \
- else if (k->FIELD[mid].SUB_FIELD > index) \
- end = mid; \
- else \
- begin = mid + 1; \
- } \
- \
- return NULL; \
-}
-
-DECL_DICHO_SEARCH(arg_info, cl_arg_info_t, uint32_t, arg_info, arg_index)
-DECL_DICHO_SEARCH(curbe_info, cl_curbe_patch_info_t, uint64_t, curbe_info, key)
-
-#undef DECL_DICHO_SEARCH
-
-/* Set the given value (typically a function parameter)
- * in the constant buffer
- */
-static cl_int
-cl_kernel_set_curbe_entry(cl_kernel k,
- uint32_t index,
- size_t sz,
- const void *value)
-{
- cl_curbe_patch_info_t *info = NULL;
- uint64_t key;
- cl_int err = CL_SUCCESS;
- uint32_t i;
-
- /* Case 1: regular kernel argument (int, float ...) */
- key = cl_curbe_key(DATA_PARAMETER_KERNEL_ARGUMENT, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) {
-
- /* User must give a value for these arguments */
- if (value == NULL) {
- err = CL_INVALID_ARG_VALUE;
- goto error;
- }
-
- /* Sizes must match */
- if (UNLIKELY(sz > info->sz)) {
- err = CL_INVALID_ARG_SIZE;
- goto error;
- }
-
- /* Patch all locations */
- assert(k->cst_buffer);
- for (i = 0; i <= info->last; ++i) {
- assert(sz + info->offsets[i] <= k->patch.curbe.sz);
- memcpy(k->cst_buffer + info->offsets[i], value, sz);
- }
-
- /* We are done */
- goto exit;
- }
-
- /* Case 2: Local buffer size */
- key = cl_curbe_key(DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES, index, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) {
- info->sz = sz;
- goto exit;
- }
-
- /* Problem. We were not able to find anything */
- err = CL_INVALID_ARG_INDEX;
-
-exit:
-error:
- return err;
-}
-
LOCAL cl_int
cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
{
- const cl_arg_info_t *arg_info = NULL;
- cl_mem *mem = NULL;
cl_int err = CL_SUCCESS;
- /* Not a valid argument if exce*/
- assert(k);
- if (UNLIKELY(index >= k->arg_n)) {
- err = CL_INVALID_ARG_VALUE;
- goto error;
- }
-
- /* Is it a buffer / image / sampler to set */
- if ((arg_info = cl_kernel_get_arg_info(k, index)) != NULL) {
- switch (arg_info->type) {
- case OCLRT_ARG_TYPE_CONST:
- case OCLRT_ARG_TYPE_IMAGE:
- case OCLRT_ARG_TYPE_BUFFER:
- {
- /* Check the buffer consistency */
- FATAL_IF(value == NULL, "Unsupported NULL value for buffer (TBD)");
- if (UNLIKELY(sz != sizeof(void*))) {
- err = CL_INVALID_ARG_SIZE;
- goto error;
- }
- mem = (cl_mem*) value;
- FATAL_IF (mem == NULL, "Buffer cannot be NULL");
- CHECK_MEM((*mem));
-
- /* The kernel holds a reference on it now */
- cl_mem_add_ref(*mem);
- cl_mem_delete(k->args[index]);
- k->args[index] = *mem;
- }
- k->is_provided[index] = CL_TRUE;
- goto exit;
- default: NOT_IMPLEMENTED;
- }
- }
-
- TRY (cl_kernel_set_curbe_entry, k, index, sz, value);
- k->is_provided[index] = CL_TRUE;
-
-exit:
-error:
- return err;
-}
-
-static INLINE int32_t
-cl_kernel_get_first_local(cl_kernel k)
-{
- int32_t i;
- for (i = 0; i < (int32_t) k->curbe_info_n; ++i)
- if (k->curbe_info[i].type == DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES)
- return i;
- return k->curbe_info_n;
-}
-
-LOCAL uint32_t
-cl_kernel_local_memory_sz(cl_kernel k)
-{
- int32_t i;
- uint32_t local_mem_sz = 0;
-
- if (k->has_local_buffer) {
-
- /* Look for all local surfaces offset to set */
- i = cl_kernel_get_first_local(k);
-
- /* Now, set the offsets for all local surfaces */
- for (; i < (int32_t) k->curbe_info_n; ++i) {
- cl_curbe_patch_info_t *info = k->curbe_info + i;
- const size_t offset = local_mem_sz;
- if (info->type != DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES)
- break;
- assert(info->last == 0);
- assert(sizeof(int32_t) + info->offsets[0] <= k->patch.curbe.sz);
- memcpy(k->cst_buffer + info->offsets[0], &offset, sizeof(int32_t));
- local_mem_sz += info->sz;
- }
- local_mem_sz += k->patch.local_surf.sz;
- }
- return local_mem_sz;
-}
-
-LOCAL char*
-cl_kernel_create_cst_buffer(cl_kernel k,
- const size_t *global_wk_off,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz,
- cl_uint wk_dim,
- cl_uint thread_n)
-{
- cl_curbe_patch_info_t *info = NULL;
- const size_t sz = k->patch.curbe.sz;
- uint64_t key = 0;
- char *data = NULL;
-
- TRY_ALLOC_NO_ERR (data, (char *) cl_calloc(sz, 1));
- memcpy(data, k->cst_buffer, sz);
-
- /* Global work group offset */
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_off, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 4);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_off+1, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_OFFSET, 0, 8);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_off+2, sizeof(uint32_t));
-
- /* Global work group size */
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t));
-
- /* Local work group size */
- key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t));
- key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t));
-
- /* HW thread number (Gen7+) */
- key = cl_curbe_key(DATA_PARAMETER_NUM_HARDWARE_THREADS, 0, 0);
- if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], &thread_n, sizeof(uint32_t));
-
-exit:
- return data;
-error:
- cl_free(data);
- data = NULL;
- goto exit;
-}
-
-LOCAL cl_int
-cl_kernel_work_group_sz(cl_kernel ker,
- const size_t *local_wk_sz,
- uint32_t wk_dim,
- size_t *wk_grp_sz)
-{
- cl_int err = CL_SUCCESS;
- size_t sz = 0;
- cl_uint i;
-
- for (i = 0; i < wk_dim; ++i)
- if ((&ker->patch.exec_env.required_wgr_sz_x)[i] &&
- (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) {
- err = CL_INVALID_WORK_ITEM_SIZE;
- goto error;
- }
- sz = local_wk_sz[0];
- for (i = 1; i < wk_dim; ++i)
- sz *= local_wk_sz[i];
- FATAL_IF (sz % 16, "Work group size must be a multiple of 16");
- if (sz > ker->program->ctx->device->max_work_group_size) {
- err = CL_INVALID_WORK_ITEM_SIZE;
- goto error;
- }
-
-error:
- if (wk_grp_sz)
- *wk_grp_sz = sz;
return err;
}
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index c27cff6b..f474879a 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -23,277 +23,20 @@
#include "cl_defs.h"
#include "cl_internals.h"
#include "CL/cl.h"
+#include "gen/program.h"
#include <stdint.h>
#include <stdlib.h>
-/***************************************************************************/
-/* XXX Structures extracted from the WINDOWS CODE BASE */
-/***************************************************************************/
-
-// Some fields went from 1 to 4 bytes with the new compiler
-#if USE_OLD_COMPILER
-typedef uint8_t cl_compiler_boolean_t;
-#else
-typedef uint32_t cl_compiler_boolean_t;
-#endif /* USE_OLD_COMPILER */
-
-typedef struct cl_program_header {
- uint32_t magic;
- uint32_t version;
- uint32_t device;
- uint32_t ker_n;
-} cl_program_header_t;
-
-typedef struct cl_arg_info {
- uint32_t arg_index;
- uint32_t type;
- cl_compiler_boolean_t is_null;
- uint32_t offset;
- uint32_t sz;
- void *obj;
- cl_compiler_boolean_t is_patched;
- struct cl_arg_info *next;
-} cl_arg_info_t;
-
-typedef struct cl_curbe_patch_info {
- uint64_t key;
- uint32_t last;
- uint32_t offsets[OCLRT_CURBE_MAX_OFFSETS];
- uint32_t type;
- uint32_t arg_index;
- uint32_t sz;
- uint32_t src_offset;
- cl_compiler_boolean_t is_patched;
- cl_compiler_boolean_t is_local;
- struct cl_curbe_patch_info *next;
-} cl_curbe_patch_info_t;
-
-typedef struct cl_kernel_header {
- uint32_t check_sum;
- uint32_t kernel_name_sz;
- uint32_t patch_list_sz;
-} cl_kernel_header_t;
-
-typedef struct cl_kernel_header75 {
- cl_kernel_header_t header;
- uint32_t kernel_heap_sz;
- uint32_t general_state_heap_sz;
- uint32_t dynamic_state_heap_sz;
- uint32_t surface_state_heap_sz;
-} cl_kernel_header75_t;
-
-typedef struct cl_kernel_header7 {
- cl_kernel_header_t header;
- uint32_t kernel_heap_sz;
- uint32_t general_state_heap_sz;
- uint32_t dynamic_state_heap_sz;
- uint32_t surface_state_heap_sz;
-} cl_kernel_header7_t;
-
-typedef struct cl_kernel_header6 {
- cl_kernel_header_t header;
- uint32_t kernel_heap_sz;
- uint32_t general_state_heap_sz;
- uint32_t dynamic_state_heap_sz;
- uint32_t surface_state_heap_sz;
- uint32_t indirect_object__heap_sz;
-} cl_kernel_header6_t;
-
-typedef struct cl_patch_item_header {
- uint32_t token;
- uint32_t size;
-} cl_patch_item_header_t;
-
-typedef struct cl_global_memory_object_arg {
- cl_patch_item_header_t header;
- uint32_t index;
- uint32_t offset;
-} cl_global_memory_object_arg_t;
-
-#if USE_OLD_COMPILER == 0
-typedef struct cl_image_memory_object_arg {
- cl_patch_item_header_t header;
- uint32_t index;
- uint32_t image_type;
- uint32_t offset;
-} cl_image_memory_object_arg_t;
-#endif
-
-typedef struct cl_patch_constant_memory_object_arg {
- uint32_t index;
- uint32_t offset;
-} cl_patch_constant_memory_object_arg_t;
-
-typedef struct cl_patch_sampler_kernel_arg {
- cl_patch_item_header_t header;
- uint32_t index;
- uint32_t offset;
-} cl_patch_sampler_kernel_arg_t;
-
-typedef struct cl_patch_data_parameter_buffer {
- cl_patch_item_header_t header;
- uint32_t type;
- uint32_t index;
- uint32_t offset;
- uint32_t data_sz;
- uint32_t src_offset;
-} cl_patch_data_parameter_buffer_t;
-
-typedef struct cl_patch_data_parameter_stream {
- cl_patch_item_header_t header;
- uint32_t data_parameter_stream_sz;
-} cl_patch_data_parameter_stream_t;
-
-typedef struct cl_patch_sip {
- cl_patch_item_header_t header;
- uint32_t sip_offset;
-} cl_patch_sip_t;
-
-typedef struct cl_patch_sampler_state_array {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t count;
- uint32_t border_color_offset;
-} cl_patch_sampler_state_array_t;
-
-typedef struct cl_patch_binding_table_state {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t count;
- uint32_t surface_state_offset;
-} cl_patch_binding_table_state_t;
-
-typedef struct cl_patch_alloc_scratch_surf {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t size;
-} cl_patch_alloc_scratch_surf_t;
-
-typedef struct cl_patch_alloc_private_memory_surf {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t size;
-} cl_patch_alloc_private_memory_surf_t;
-
-typedef struct cl_patch_alloc_system_thread_surf {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t sz;
-} cl_patch_alloc_system_thread_surf_t;
-
-typedef struct cl_patch_alloc_surf_with_init {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t sz;
- char* data;
-} cl_patch_alloc_surf_with_init_t;
-
-typedef struct cl_patch_alloc_local_surf {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t sz;
-} cl_patch_alloc_local_surf_t;
-
-typedef struct cl_patch_thread_payload {
- cl_patch_item_header_t header;
- uint8_t header_present;
- uint8_t local_idx_present;
- uint8_t local_idy_present;
- uint8_t local_idz_present;
-} cl_patch_thread_payload_t;
-
-typedef struct cl_patch_exec_env {
- cl_patch_item_header_t header;
- uint32_t required_wgr_sz_x;
- uint32_t required_wgr_sz_y;
- uint32_t required_wgr_sz_z;
- uint32_t largest_compiled_simd_sz;
- uint8_t has_barriers;
- uint8_t compiled_simd8;
- uint8_t compiled_simd16;
- uint8_t compiled_simd32;
-} cl_patch_exec_env_t;
-
-typedef struct cl_patch_vfe_state {
- cl_patch_item_header_t header;
- uint32_t scratch_offset;
-} cl_patch_vfe_state_t;
-
-typedef struct cl_patch_curbe_load {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t sz;
-} cl_patch_curbe_load_t;
-
-typedef struct cl_patch_interface_desc_load {
- cl_patch_item_header_t header;
- uint32_t offset;
-} cl_patch_interface_desc_load_t;
-
-typedef struct cl_patch_interface_desc_data {
- cl_patch_item_header_t header;
- uint32_t offset;
- uint32_t sampler_state_offset;
- uint32_t kernel_offset;
- uint32_t binding_table_offset;
-} cl_patch_interface_desc_data_t;
-
-typedef struct cl_kernel_patch_info {
- cl_patch_sip_t sip;
- cl_patch_sampler_state_array_t sampler_state;
- cl_patch_binding_table_state_t binding_table;
- cl_patch_alloc_scratch_surf_t scratch;
- cl_patch_alloc_private_memory_surf_t private_surf;
- cl_patch_alloc_system_thread_surf_t sys_thread_surf;
- cl_patch_alloc_surf_with_init_t surf_with_init;
- cl_patch_alloc_local_surf_t local_surf;
- cl_patch_thread_payload_t thread_payload;
- cl_patch_exec_env_t exec_env;
- cl_patch_vfe_state_t vfe;
- cl_patch_curbe_load_t curbe;
- cl_patch_interface_desc_load_t idrt;
- cl_patch_interface_desc_data_t surf_desc;
-} cl_kernel_patch_info_t;
-
struct _cl_kernel {
uint64_t magic; /* To identify it as a kernel */
volatile int ref_n; /* We reference count this object */
struct _drm_intel_bo *bo; /* The code itself */
struct _drm_intel_bo *const_bo;/* Buffer for all __constants values in the OCL program */
cl_program program; /* Owns this structure (and pointers) */
- cl_arg_info_t *arg_info; /* List of arguments */
- cl_curbe_patch_info_t *curbe_info; /* List of patch locations for the curbe */
- char *name; /* User defined name */
- char *cst_buffer; /* (user provided) NDrange kernel parameters */
- void **args; /* (user provided) arguments which are cl_mem / cl_image / cl_sampler */
- uint8_t *is_provided; /* Tell us if all arguments have been provided by the user */
- const char *patch_list; /* Defines where the data are in the heaps */
- const char *kernel_heap; /* Contains instructions */
- const char *general_heap; /* Contains scratch space */
- const char *surface_heap; /* Contains surface state and binding table */
- const char *dynamic_heap; /* Contains IDRT and sampler states */
- size_t patch_list_sz; /* Total size of the patch list */
- size_t kernel_heap_sz; /* Size of the kernel heap */
- size_t general_heap_sz; /* Should be 0 */
- size_t surface_heap_sz; /* Size of the surface state heap */
- size_t dynamic_heap_sz; /* Size of the dynamic heap */
- cl_kernel_patch_info_t patch; /* Got from the patch list */
- uint32_t arg_info_n; /* Number of argument info */
- uint32_t curbe_info_n; /* Number of curbe info */
- uint32_t arg_n; /* Number of arguments in the function */
- uint32_t const_bo_index; /* Index in the binding table for const_bo */
- uint8_t has_local_buffer; /* Is there any __local * as function argument? */
uint8_t ref_its_program; /* True only for the user kernel (those created by clCreateKernel) */
};
-/* Size of the surface state as encoded in the binary blob */
-#if USE_OLD_COMPILER
-#define SURFACE_SZ 32
-#else
-#define SURFACE_SZ 64
-#endif
-
/* Allocate an empty kernel */
extern cl_kernel cl_kernel_new(void);
@@ -308,42 +51,12 @@ extern cl_kernel cl_kernel_dup(cl_kernel);
/* Add one more reference on the kernel object */
extern void cl_kernel_add_ref(cl_kernel);
-/* Setup a kernel from a binary blob */
-extern int cl_kernel_setup(cl_kernel, const char*);
-
/* Set the argument before kernel execution */
extern int cl_kernel_set_arg(cl_kernel,
uint32_t arg_index,
size_t arg_size,
const void *arg_value);
-/* Check that all arguments are set before running the kernel */
-extern cl_int cl_kernel_check_args(cl_kernel);
-
-/* Get the size of shared local memory bound to the kernel */
-extern uint32_t cl_kernel_local_memory_sz(cl_kernel);
-
-/* Return a curbe entry if it exists. NULL otherwise */
-extern cl_curbe_patch_info_t *cl_kernel_get_curbe_info(cl_kernel, uint64_t);
-
-/* To look up the sorted curbe array */
-static inline uint64_t
-cl_curbe_key(uint32_t type, uint32_t index, uint32_t src_offset)
-{
- return ((uint64_t) type << 48) |
- ((uint64_t) index << 32) |
- (uint64_t) src_offset;
-}
-
-/* Allocate, fill and return the CURBE */
-extern char*
-cl_kernel_create_cst_buffer(cl_kernel k,
- const size_t *global_wk_off,
- const size_t *global_wk_sz,
- const size_t *local_wk_sz,
- cl_uint wk_dim,
- cl_uint thread_n);
-
/* Compute and check the work group size from the user provided local size */
extern cl_int
cl_kernel_work_group_sz(cl_kernel ker,
diff --git a/src/cl_program.c b/src/cl_program.c
index 59161854..cb93d2ca 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -32,97 +32,6 @@
#include <string.h>
#include <assert.h>
-#if USE_OLD_COMPILER
-static const int icbe_ver = 1001;
-#else
-static const int icbe_ver = 1002;
-#endif
-
-#define DECL_LOAD_HEADER(GEN) \
-static const char* \
-JOIN(cl_kernel_load_header,GEN)(cl_kernel ker, \
- const char *header, \
- size_t *name_sz, \
- size_t *ker_sz) \
-{ \
- const JOIN(JOIN(cl_kernel_header,GEN),_t) *h = \
- (const JOIN(JOIN(cl_kernel_header,GEN),_t) *) header; \
- *ker_sz = *name_sz = h->header.kernel_name_sz; \
- *ker_sz += ker->patch_list_sz = h->header.patch_list_sz; \
- *ker_sz += ker->kernel_heap_sz = h->kernel_heap_sz; \
- *ker_sz += ker->general_heap_sz = h->general_state_heap_sz; \
- *ker_sz += ker->surface_heap_sz = h->surface_state_heap_sz; \
- *ker_sz += ker->dynamic_heap_sz = h->dynamic_state_heap_sz; \
- return header + sizeof(JOIN(JOIN(cl_kernel_header,GEN),_t)); \
-}
-
-DECL_LOAD_HEADER(6)
-DECL_LOAD_HEADER(7)
-DECL_LOAD_HEADER(75)
-
-#undef DECL_LOAD_HEADER
-
-static int
-cl_program_decode(cl_program p)
-{
- cl_program_header_t *header = (cl_program_header_t *) p->bin;
- const char *ker = NULL, *bin = NULL;
- size_t ker_sz = 0, name_sz = 0;
- int i, err = 0;
-
- /* Check binary consistency */
- assert(p->ctx && p->ctx->device);
- FATAL_IF (header->magic != 0x494e5443, "Bad file format for the program\n");
- FATAL_IF (header->device != p->ctx->device->gfx_id, "File not compiled for this device\n");
- FATAL_IF (header->version != icbe_ver, "Uncompatible compiler\n");
- FATAL_IF ((p->ker_n = header->ker_n) == 0, "No kernel found in the program\n");
-
- /* Allocate the kernel array */
- TRY_ALLOC (p->ker, CALLOC_ARRAY(cl_kernel, p->ker_n));
-
- /* Load all kernels */
- ker = bin = p->bin + sizeof(cl_program_header_t);
- for (i = 0; i < header->ker_n; ++i) {
-
- /* Format changes from generation to generation */
- TRY_ALLOC (p->ker[i], cl_kernel_new());
- switch (header->device) {
- case IGFX_GEN7_5_CORE:
- ker = cl_kernel_load_header75(p->ker[i], ker, &name_sz, &ker_sz);
- break;
- case IGFX_GEN7_CORE:
- ker = cl_kernel_load_header7(p->ker[i], ker, &name_sz, &ker_sz);
- break;
- case IGFX_GEN6_CORE:
- ker = cl_kernel_load_header6(p->ker[i], ker, &name_sz, &ker_sz);
- break;
- default:
- FATAL ("Unsupported platform");
- break;
- }
-
- /* Set the kernel name */
- TRY_ALLOC (p->ker[i]->name, CALLOC_ARRAY(char, name_sz));
- memcpy(p->ker[i]->name, ker, name_sz);
- name_sz = ALIGN(name_sz, 4);
-
- /* Points to the kernel code */
- ker += name_sz;
-
- /* Initialize the kernel */
- p->ker[i]->program = p;
- TRY (cl_kernel_setup, p->ker[i], ker);
-
- /* Pointer to the next kernel to setup */
- ker += (ker_sz - name_sz);
- }
-
-exit:
- return err;
-error:
- goto exit;
-}
-
LOCAL void
cl_program_delete(cl_program p)
{
@@ -172,23 +81,10 @@ cl_program_new(cl_context ctx, const char *data, size_t sz)
p->magic = CL_MAGIC_PROGRAM_HEADER;
p->ctx = ctx;
- /* Decode the binary blob */
- TRY_NO_ERR (cl_program_decode, p);
-
- /* Append the command queue in the list */
- pthread_mutex_lock(&ctx->program_lock);
- p->next = ctx->programs;
- if (ctx->programs != NULL)
- ctx->programs->prev = p;
- ctx->programs = p;
- pthread_mutex_unlock(&ctx->program_lock);
- cl_context_add_ref(ctx);
-
exit:
return p;
error:
cl_program_delete(p);
- p = NULL;
goto exit;
}
@@ -208,78 +104,12 @@ cl_program_create_from_binary(cl_context ctx,
cl_int * binary_status,
cl_int * errcode_ret)
{
- cl_program program = NULL;
- cl_int err = CL_SUCCESS;
-
- assert(ctx);
- INVALID_DEVICE_IF (num_devices != 1);
- INVALID_DEVICE_IF (devices == NULL);
- INVALID_DEVICE_IF (devices[0] != ctx->device);
- INVALID_VALUE_IF (binaries == NULL);
- INVALID_VALUE_IF (lengths == NULL);
-
- if (binaries[0] == NULL) {
- err = CL_INVALID_VALUE;
- if (binary_status)
- binary_status[0] = CL_INVALID_VALUE;
- goto error;
- }
-
- if (lengths[0] == 0) {
- err = CL_INVALID_VALUE;
- if (binary_status)
- binary_status[0] = CL_INVALID_VALUE;
- goto error;
- }
-
- TRY_ALLOC (program, cl_program_new(ctx, (const char *) binaries[0], lengths[0]));
-
-exit:
- if (errcode_ret)
- *errcode_ret = err;
- return program;
-error:
- cl_program_delete(program);
- program = NULL;
- goto exit;
+ return NULL;
}
LOCAL cl_kernel
cl_program_create_kernel(cl_program p, const char *name, cl_int *errcode_ret)
{
- cl_kernel from = NULL, to = NULL;
- cl_int err = CL_SUCCESS;
- uint32_t i = 0;
-
- if (UNLIKELY(name == NULL)) {
- err = CL_INVALID_KERNEL_NAME;
- goto error;
- }
-
- /* Find the program first */
- for (i = 0; i < p->ker_n; ++i) {
- assert(p->ker[i] && p->ker[i]->name);
- if (strcmp(p->ker[i]->name, name) == 0) {
- from = p->ker[i];
- break;
- }
- }
-
- /* We were not able to find this named kernel */
- if (UNLIKELY(from == NULL)) {
- err = CL_INVALID_KERNEL_NAME;
- goto error;
- }
-
- TRY_ALLOC(to, cl_kernel_dup(from));
-
-exit:
- if (errcode_ret)
- *errcode_ret = err;
- return to;
-error:
- cl_kernel_delete(to);
- to = NULL;
- goto exit;
+ return NULL;
}
diff --git a/src/cl_program.h b/src/cl_program.h
index bf4b7ba2..53e182de 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -62,5 +62,13 @@ cl_program_create_from_binary(cl_context context,
cl_int * binary_status,
cl_int * errcode_ret);
+/* Directly create a program from an LLVM source file */
+extern cl_program
+cl_program_create_from_llvm(cl_context context,
+ cl_uint num_devices,
+ const cl_device_id * devices,
+ const char * fileName,
+ cl_int * errcode_ret);
+
#endif /* __CL_PROGRAM_H__ */