/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Author: Benjamin Segovia
*/
#include "cl_command_queue.h"
#include "cl_context.h"
#include "cl_program.h"
#include "cl_kernel.h"
#include "cl_device_id.h"
#include "cl_mem.h"
#include "cl_event.h"
#include "cl_utils.h"
#include "cl_alloc.h"
#include "cl_device_enqueue.h"
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#define MAX_GROUP_SIZE_IN_HALFSLICE 512
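/* Size in bytes of the batch buffer requested for one enqueue (see
* cl_gpgpu_batch_reset below); the commands emitted for a single NDRange are
* assumed to fit in this fixed 512-byte budget.
*/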
static INLINE size_t cl_kernel_compute_batch_sz(cl_kernel k) { return 256+256; }
/* "Varing" payload is the part of the curbe that changes accross threads in the
* same work group. Right now, it consists in local IDs and block IPs
*/
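/* Illustration (assuming SIMD8 and an 8x1x1 work group): one HW thread is
* emitted, its slice of the curbe gets local IDs {0..7} in X, every block IP
* starts at 0, and any lane beyond the work-group size keeps the 0xffff
* "inactive" marker set below.
*/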
static cl_int
cl_set_varying_payload(const cl_kernel ker,
char *data,
const size_t *local_wk_sz,
size_t simd_sz,
size_t cst_sz,
size_t thread_n)
{
uint32_t *ids[3] = {NULL,NULL,NULL};
uint16_t *block_ips = NULL;
uint32_t *thread_ids = NULL;
size_t i, j, k, curr = 0;
int32_t id_offset[3], ip_offset, tid_offset;
cl_int err = CL_SUCCESS;
int32_t dw_ip_offset = -1;
id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
tid_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_THREAD_ID, 0);
if (ip_offset < 0)
dw_ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_DW_BLOCK_IP, 0);
assert(ip_offset < 0 || dw_ip_offset < 0);
assert(ip_offset >= 0 || dw_ip_offset >= 0);
if (id_offset[0] >= 0)
TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
if (id_offset[1] >= 0)
TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
if (id_offset[2] >= 0)
TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));
if (tid_offset >= 0) {
TRY_ALLOC(thread_ids, (uint32_t*) alloca(sizeof(uint32_t)*thread_n));
memset(thread_ids, 0, sizeof(uint32_t)*thread_n);
}
/* 0xffff marks a lane as inactive */
memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);
/* Compute the IDs and the block IPs */
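/* Work items are linearized with X varying fastest; each consecutive chunk of
* simd_sz items maps to one HW thread, hence thread_ids[curr/simd_sz].
*/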
for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
if (id_offset[0] >= 0)
ids[0][curr] = i;
if (id_offset[1] >= 0)
ids[1][curr] = j;
if (id_offset[2] >= 0)
ids[2][curr] = k;
block_ips[curr] = 0;
if (thread_ids)
thread_ids[curr/simd_sz] = curr/simd_sz;
}
/* Copy them to the curbe buffer */
curr = 0;
for (i = 0; i < thread_n; ++i, data += cst_sz) {
uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
uint16_t *ips = (uint16_t *) (data + ip_offset);
uint32_t *dw_ips = (uint32_t *) (data + dw_ip_offset);
if (thread_ids)
*(uint32_t *)(data + tid_offset) = thread_ids[i];
for (j = 0; j < simd_sz; ++j, ++curr) {
if (id_offset[0] >= 0)
ids0[j] = ids[0][curr];
if (id_offset[1] >= 0)
ids1[j] = ids[1][curr];
if (id_offset[2] >= 0)
ids2[j] = ids[2][curr];
if (ip_offset >= 0)
ips[j] = block_ips[curr];
if (dw_ip_offset >= 0)
dw_ips[j] = block_ips[curr];
}
}
error:
return err;
}
static int
cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker, cl_gpgpu gpgpu)
{
if (interp_kernel_get_ocl_version(ker->opaque) >= 200) {
// pass the start address of the constant address space
int32_t constant_addrspace = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_CONSTANT_ADDRSPACE, 0);
if (constant_addrspace >= 0) {
size_t global_const_size = interp_program_get_global_constant_size(ker->program->opaque);
if (global_const_size > 0) {
*(char **)(ker->curbe + constant_addrspace) = ker->program->global_data_ptr;
cl_gpgpu_bind_buf(gpgpu, ker->program->global_data, constant_addrspace, 0, ALIGN(global_const_size, getpagesize()), BTI_CONSTANT);
}
}
return 0;
}
// TODO: this path is only valid for OpenCL 1.2, where all constant arguments
// are gathered into one dedicated surface. In OpenCL 2.0, program-scope
// globals go into one surface while constant arguments are passed as kernel
// arguments, each in its own separate buffer.
int32_t arg;
size_t offset = 0;
uint32_t raw_size = 0, aligned_size =0;
gbe_program prog = ker->program->opaque;
const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
size_t global_const_size = interp_program_get_global_constant_size(prog);
raw_size = global_const_size;
// Surface states need 4-byte alignment, and each constant argument's buffer
// size is already aligned to 4 bytes at allocation time, so aligning the
// global constant size to 4 ensures the final aligned_size is a multiple of 4.
aligned_size = ALIGN(raw_size, 4);
/* Reserve 8 bytes so that no constant ends up at address 0 */
if(global_const_size == 0) aligned_size = 8;
for (arg = 0; arg < arg_n; ++arg) {
const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
assert(alignment != 0);
cl_mem mem = ker->args[arg].mem;
raw_size += mem->size;
aligned_size = ALIGN(aligned_size, alignment);
aligned_size += mem->size;
}
}
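/* Resulting layout of the constant surface:
*   [ program-scope constants (or 8 reserved bytes if there are none) ]
*   [ constant argument i, aligned to its declared alignment ] ...
* The same offsets are recomputed in the upload loop below.
*/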
if(raw_size == 0)
return 0;
cl_buffer bo = cl_gpgpu_alloc_constant_buffer(gpgpu, aligned_size, BTI_CONSTANT);
if (bo == NULL)
return -1;
cl_buffer_map(bo, 1);
char * cst_addr = cl_buffer_get_virtual(bo);
if (cst_addr == NULL)
return -1;
/* upload the global constant data */
if (global_const_size > 0) {
interp_program_get_global_constant_data(prog, (char*)(cst_addr+offset));
offset += global_const_size;
}
/* Reserve 8 bytes so that no constant ends up at address 0 */
if(global_const_size == 0) {
offset = 8;
}
/* upload constant buffer argument */
int32_t curbe_offset = 0;
for (arg = 0; arg < arg_n; ++arg) {
const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
cl_mem mem = ker->args[arg].mem;
uint32_t alignment = interp_kernel_get_arg_align(ker->opaque, arg);
offset = ALIGN(offset, alignment);
curbe_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
if (curbe_offset < 0)
continue;
*(uint32_t *) (ker->curbe + curbe_offset) = offset;
cl_buffer_map(mem->bo, 1);
void * addr = cl_buffer_get_virtual(mem->bo);
memcpy(cst_addr + offset, addr, mem->size);
cl_buffer_unmap(mem->bo);
offset += mem->size;
}
}
cl_buffer_unmap(bo);
return 0;
}
/* Returns the total amount of SLM used */
static int32_t
cl_curbe_fill(cl_kernel ker,
const uint32_t work_dim,
const size_t *global_wk_off,
const size_t *global_wk_sz,
const size_t *local_wk_sz,
const size_t *enqueued_local_wk_sz,
size_t thread_n)
{
int32_t offset;
#define UPLOAD(ENUM, VALUE) \
if ((offset = interp_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
*((uint32_t *) (ker->curbe + offset)) = VALUE;
UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_X, enqueued_local_wk_sz[0]);
UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Y, enqueued_local_wk_sz[1]);
UPLOAD(GBE_CURBE_ENQUEUED_LOCAL_SIZE_Z, enqueued_local_wk_sz[2]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
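/* Group counts are ceil(global_size / enqueued_local_size) per dimension,
* e.g. a 100-wide X range with an enqueued local size of 16 gives 7 groups.
*/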
UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0] / enqueued_local_wk_sz[0] + (global_wk_sz[0]%enqueued_local_wk_sz[0]?1:0));
UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1] / enqueued_local_wk_sz[1] + (global_wk_sz[1]%enqueued_local_wk_sz[1]?1:0));
UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2] / enqueued_local_wk_sz[2] + (global_wk_sz[2]%enqueued_local_wk_sz[2]?1:0));
UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD
/* Handle the various offsets to SLM */
const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
int32_t arg, slm_offset = interp_kernel_get_slm_size(ker->opaque);
ker->local_mem_sz = 0;
for (arg = 0; arg < arg_n; ++arg) {
const enum gbe_arg_type type = interp_kernel_get_arg_type(ker->opaque, arg);
if (type != GBE_ARG_LOCAL_PTR)
continue;
uint32_t align = interp_kernel_get_arg_align(ker->opaque, arg);
assert(align != 0);
slm_offset = ALIGN(slm_offset, align);
offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
if (offset < 0)
continue;
uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
*slmptr = slm_offset;
slm_offset += ker->args[arg].local_sz;
ker->local_mem_sz += ker->args[arg].local_sz;
}
return slm_offset;
}
static void
cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
{
cl_context ctx = ker->program->ctx;
cl_device_id device = ctx->devices[0];
const int32_t per_lane_stack_sz = ker->stack_size;
const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
const int32_t sub_value = GBE_STACK_BUFFER;
const int32_t offset_stack_buffer = interp_kernel_get_curbe_offset(ker->opaque, value, sub_value);
int32_t stack_sz = per_lane_stack_sz;
/* No stack required for this kernel */
if (per_lane_stack_sz == 0)
return;
/* The stack size is given for *each* SIMD lane. So, we accordingly compute
* the size we need for the complete machine
*/
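/* For example, a 1KB per-lane stack at SIMD16 on a device exposing 24 compute
* units with 7 threads each comes to 1KB * 16 * 24 * 7 = 2688KB before any
* driver enlargement; the device figures here are purely illustrative.
*/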
assert(offset_stack_buffer >= 0);
stack_sz *= interp_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit * ctx->devices[0]->max_thread_per_unit;
/* On some hardware, part of the EUs are disabled but their EU ids stay
* reserved, so an active EU id can be larger than the EU count of a subslice.
* Enlarge the stack size in that case to avoid out-of-range accesses.
*/
cl_driver_enlarge_stack_size(ctx->drv, &stack_sz);
const int32_t offset_stack_size = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_SIZE, 0);
if (offset_stack_size >= 0) {
*(uint64_t *)(ker->curbe + offset_stack_size) = stack_sz;
}
cl_gpgpu_set_stack(gpgpu, offset_stack_buffer, stack_sz, BTI_PRIVATE);
}
static int
cl_bind_profiling(cl_gpgpu gpgpu, uint32_t simd_sz, cl_kernel ker, size_t global_sz, size_t local_sz, uint32_t bti) {
int32_t offset;
int i = 0;
int thread_num;
if (simd_sz == 16) {
for(i = 0; i < 3; i++) {
offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
assert(offset >= 0);
memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8*2);
thread_num = (local_sz + 15)/16;
}
} else {
assert(simd_sz == 8);
for(i = 0; i < 5; i++) {
offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_TIMESTAMP0 + i, 0);
assert(offset >= 0);
memset(ker->curbe + offset, 0x0, sizeof(uint32_t)*8);
thread_num = (local_sz + 7)/8;
}
}
offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_PROFILING_BUF_POINTER, 0);
thread_num = thread_num*(global_sz/local_sz);
if (cl_gpgpu_set_profiling_buffer(gpgpu, thread_num*128 + 4, offset, bti))
return -1;
return 0;
}
static int
cl_alloc_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num, size_t global_sz) {
/* A heuristic initial size. */
size_t buf_size = global_sz * sizeof(int) * 16 * printf_num;
if (buf_size > 16*1024*1024) //at most.
buf_size = 16*1024*1024;
if (buf_size < 1*1024*1024) // at least.
buf_size = 1*1024*1024;
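/* The buffer is sized at roughly 16 ints of output per work item and per
* printf call, then clamped to the [1MB, 16MB] range above.
*/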
if (cl_gpgpu_set_printf_buffer(gpgpu, buf_size, interp_get_printf_buf_bti(printf_info)) != 0)
return -1;
return 0;
}
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
cl_kernel ker,
cl_event event,
const uint32_t work_dim,
const size_t *global_wk_off,
const size_t *global_dim_off,
const size_t *global_wk_sz,
const size_t *global_wk_sz_use,
const size_t *local_wk_sz,
const size_t *local_wk_sz_use)
{
cl_gpgpu gpgpu = cl_gpgpu_new(queue->ctx->drv);
cl_context ctx = queue->ctx;
char *final_curbe = NULL; /* One copy of the constant curbe per HW thread, plus the varying payload */
cl_gpgpu_kernel kernel;
const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
size_t i, batch_sz = 0u, local_sz = 0u;
size_t cst_sz = interp_kernel_get_curbe_size(ker->opaque);
int32_t scratch_sz = interp_kernel_get_scratch_size(ker->opaque);
size_t thread_n = 0u;
int printf_num = 0;
cl_int err = CL_SUCCESS;
size_t global_size = global_wk_sz[0] * global_wk_sz[1] * global_wk_sz[2];
void* printf_info = NULL;
uint32_t max_bti = 0;
if (ker->exec_info_n > 0) {
cst_sz += ker->exec_info_n * sizeof(void *);
cst_sz = (cst_sz + 31) / 32 * 32; // align to the 32-byte register size; hard-coded here
ker->curbe = cl_realloc(ker->curbe, cst_sz);
}
ker->curbe_sz = cst_sz;
/* Setup kernel */
kernel.name = interp_kernel_get_name(ker->opaque);
kernel.grf_blocks = 128;
kernel.bo = ker->bo;
kernel.barrierID = 0;
kernel.slm_sz = 0;
kernel.use_slm = interp_kernel_use_slm(ker->opaque);
/* Compute the number of HW threads we need */
if(UNLIKELY((err = cl_kernel_work_group_sz(ker, local_wk_sz_use, 3, &local_sz)) != CL_SUCCESS)) {
DEBUGP(DL_ERROR, "Work group size exceeds the kernel's maximum work group size.");
return err;
}
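/* One HW thread covers simd_sz work items, so e.g. a 100-item work group at
* SIMD16 needs ceil(100/16) = 7 HW threads. */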
kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
kernel.curbe_sz = cst_sz;
if (scratch_sz > ker->program->ctx->devices[0]->scratch_mem_size) {
DEBUGP(DL_ERROR, "Out of scratch memory %d.", scratch_sz);
return CL_OUT_OF_RESOURCES;
}
/* Curbe step 1: fill the constant urb buffer data shared by all threads */
if (ker->curbe) {
kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz_use, local_wk_sz, thread_n);
if (kernel.slm_sz > ker->program->ctx->devices[0]->local_mem_size) {
DEBUGP(DL_ERROR, "Out of shared local memory %d.", kernel.slm_sz);
return CL_OUT_OF_RESOURCES;
}
}
printf_info = interp_dup_printfset(ker->opaque);
cl_gpgpu_set_printf_info(gpgpu, printf_info);
/* Setup the kernel */
if (queue->props & CL_QUEUE_PROFILING_ENABLE)
err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 1);
else
err = cl_gpgpu_state_init(gpgpu, ctx->devices[0]->max_compute_unit * ctx->devices[0]->max_thread_per_unit, cst_sz / 32, 0);
if (err != 0)
goto error;
printf_num = interp_get_printf_num(printf_info);
if (printf_num) {
if (cl_alloc_printf(gpgpu, ker, printf_info, printf_num, global_size) != 0)
goto error;
}
if (interp_get_profiling_bti(ker->opaque) != 0) {
if (cl_bind_profiling(gpgpu, simd_sz, ker, global_size, local_sz, interp_get_profiling_bti(ker->opaque)))
goto error;
cl_gpgpu_set_profiling_info(gpgpu, interp_dup_profiling(ker->opaque));
} else {
cl_gpgpu_set_profiling_info(gpgpu, NULL);
}
/* Bind user buffers */
cl_command_queue_bind_surface(queue, ker, gpgpu, &max_bti);
/* Bind user images */
if(UNLIKELY((err = cl_command_queue_bind_image(queue, ker, gpgpu, &max_bti)) != CL_SUCCESS))
return err;
/* Bind all exec infos */
cl_command_queue_bind_exec_info(queue, ker, gpgpu, &max_bti);
/* Bind device enqueue buffer */
cl_device_enqueue_bind_buffer(gpgpu, ker, &max_bti, &kernel);
/* Bind all samplers */
if (ker->vme)
cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
else
cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
goto error;
/* Bind a stack if needed */
cl_bind_stack(gpgpu, ker);
if (cl_upload_constant_buffer(queue, ker, gpgpu) != 0)
goto error;
cl_gpgpu_states_setup(gpgpu, &kernel);
/* Curbe step 2: fill in the local IDs and upload the curbe to video memory */
if (ker->curbe) {
assert(cst_sz > 0);
TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
for (i = 0; i < thread_n; ++i) {
memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
}
TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz_use, simd_sz, cst_sz, thread_n);
if (cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz) != 0)
goto error;
}
/* Start a new batch buffer */
batch_sz = cl_kernel_compute_batch_sz(ker);
if (cl_gpgpu_batch_reset(gpgpu, batch_sz) != 0)
goto error;
//cl_set_thread_batch_buf(queue, cl_gpgpu_ref_batch_buf(gpgpu));
cl_gpgpu_batch_start(gpgpu);
/* Issue the GPGPU_WALKER command */
cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_dim_off, global_wk_sz_use, local_wk_sz_use);
/* Close the batch buffer and submit it */
cl_gpgpu_batch_end(gpgpu, 0);
event->exec_data.queue = queue;
event->exec_data.gpgpu = gpgpu;
event->exec_data.type = EnqueueNDRangeKernel;
return CL_SUCCESS;
error:
/* Only internal command/buffer errors reach this point, so return CL_OUT_OF_RESOURCES */
return CL_OUT_OF_RESOURCES;
}