/*
* Copyright © 2012 Intel Corporation
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library. If not, see .
*
* Author: Rong Yang
*/
#include "cl_device_enqueue.h"
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_context.h"
#include "cl_program.h"
#include "cl_alloc.h"
#include "cl_kernel.h"
#include "cl_command_queue.h"
#include "cl_event.h"
LOCAL cl_int
cl_device_enqueue_fix_offset(cl_kernel ker) {
uint32_t i;
void *ptr;
cl_mem mem;
enum gbe_arg_type arg_type; /* kind of argument */
for (i = 0; i < ker->arg_n; ++i) {
arg_type = interp_kernel_get_arg_type(ker->opaque, i);
//HOW about image
if (!(arg_type == GBE_ARG_GLOBAL_PTR || arg_type == GBE_ARG_CONSTANT_PTR) || !ker->args[i].mem)
continue;
if(!ker->args[i].is_svm) {
mem = ker->args[i].mem;
ptr = cl_mem_map(mem, 0);
cl_buffer_set_softpin_offset(mem->bo, (size_t)ptr);
cl_buffer_set_bo_use_full_range(mem->bo, 1);
cl_buffer_disable_reuse(mem->bo);
mem->host_ptr = ptr;
cl_mem_unmap(mem);
ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ptr;
} else {
ker->device_enqueue_infos[ker->device_enqueue_info_n++] = ker->args[i].mem->host_ptr;
}
}
return 0;
}
LOCAL cl_int
cl_device_enqueue_bind_buffer(cl_gpgpu gpgpu, cl_kernel ker, uint32_t *max_bti, cl_gpgpu_kernel *kernel)
{
int32_t value = GBE_CURBE_ENQUEUE_BUF_POINTER;
int32_t offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
size_t buf_size = 32 * 1024 * 1024; //fix 32M
cl_mem mem;
if(offset > 0) {
if(ker->useDeviceEnqueue == false) {
if(ker->device_enqueue_ptr == NULL)
ker->device_enqueue_ptr = cl_mem_svm_allocate(ker->program->ctx, 0, buf_size, 0);
if(ker->device_enqueue_infos == NULL)
ker->device_enqueue_infos = cl_calloc(ker->arg_n, sizeof(void *));
ker->device_enqueue_info_n = 0;
ker->useDeviceEnqueue = CL_TRUE;
cl_device_enqueue_fix_offset(ker);
cl_kernel_add_ref(ker);
}
mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
assert(mem);
cl_gpgpu_bind_buf(gpgpu, mem->bo, offset, 0, buf_size, *max_bti);
cl_gpgpu_set_kernel(gpgpu, ker);
}
return 0;
}
typedef struct ndrange_info_t {
int type;
int global_work_size[3];
int local_work_size[3];
int global_work_offset[3];
} ndrange_info_t;
typedef struct Block_literal {
void *isa; // initialized to &_NSConcreteStackBlock or &_NSConcreteGlobalBlock
int flags;
int reserved;
int index;
struct Block_descriptor_1 {
unsigned long int slm_size; // NULL
unsigned long int size; // sizeof(struct Block_literal_1)
// optional helper functions
void *copy_helper; // IFF (1<<25)
void *dispose_helper; // IFF (1<<25)
// required ABI.2010.3.16
const char *signature; // IFF (1<<30)
} *descriptor;
// imported variables
} Block_literal;
LOCAL cl_int
cl_device_enqueue_parse_result(cl_command_queue queue, cl_gpgpu gpgpu)
{
cl_mem mem;
int size, type, dim, i;
const char * kernel_name;
cl_kernel child_ker;
cl_event evt = NULL;
cl_kernel ker = cl_gpgpu_get_kernel(gpgpu);
if(ker == NULL || ker->useDeviceEnqueue == CL_FALSE)
return 0;
void *buf = cl_gpgpu_ref_batch_buf(gpgpu);
//wait the gpgpu's batch buf finish, the gpgpu in queue may be not
//same as the param gpgpu, for example when flush event.
cl_gpgpu_sync(buf);
cl_gpgpu_unref_batch_buf(buf);
mem = cl_context_get_svm_from_ptr(ker->program->ctx, ker->device_enqueue_ptr);
if(mem == NULL) return -1;
char *ptr = (char *)cl_mem_map(mem, 0);
size = *(int *)ptr;
ptr += 4;
while(size > 0) {
size_t fixed_global_off[] = {0,0,0};
size_t fixed_global_sz[] = {1,1,1};
size_t fixed_local_sz[] = {1,1,1};
ndrange_info_t* ndrange_info = (ndrange_info_t *)ptr;
size -= sizeof(ndrange_info_t);
ptr += sizeof(ndrange_info_t);
Block_literal *block = (Block_literal *)ptr;
size -= block->descriptor->size;
ptr += block->descriptor->size;
type = ndrange_info->type;
dim = (type & 0xf0) >> 4;
type = type & 0xf;
assert(dim <= 2);
for(i = 0; i <= dim; i++) {
fixed_global_sz[i] = ndrange_info->global_work_size[i];
if(type > 1)
fixed_local_sz[i] = ndrange_info->local_work_size[i];
if(type > 2)
fixed_global_off[i] = ndrange_info->global_work_offset[i];
}
int *slm_sizes = (int *)ptr;
int slm_size = block->descriptor->slm_size;
size -= slm_size;
ptr += slm_size;
kernel_name = interp_program_get_device_enqueue_kernel_name(ker->program->opaque, block->index);
child_ker = cl_program_create_kernel(ker->program, kernel_name, NULL);
assert(child_ker);
cl_kernel_set_arg_svm_pointer(child_ker, 0, block);
int index = 1;
for(i=0; idevice_enqueue_info_n * sizeof(void *),
ker->device_enqueue_infos);
if (evt != NULL) {
clReleaseEvent(evt);
evt = NULL;
}
clEnqueueNDRangeKernel(queue, child_ker, dim + 1, fixed_global_off,
fixed_global_sz, fixed_local_sz, 0, NULL, &evt);
cl_command_queue_flush_gpgpu(gpgpu);
cl_kernel_delete(child_ker);
}
if (evt != NULL) {
//Can't call clWaitForEvents here, it may cause dead lock.
//If evt->exec_data.gpgpu is NULL, evt has finished.
if (evt->exec_data.gpgpu) {
buf = cl_gpgpu_ref_batch_buf(evt->exec_data.gpgpu);
//wait the gpgpu's batch buf finish, the gpgpu in queue may be not
//same as the param gpgpu, for example when flush event.
cl_gpgpu_sync(buf);
cl_gpgpu_unref_batch_buf(buf);
}
clReleaseEvent(evt);
evt = NULL;
}
cl_mem_unmap_auto(mem);
cl_kernel_delete(ker);
return 0;
}