/*
 * Copyright © 2012 Intel Corporation
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library. If not, see <http://www.gnu.org/licenses/>.
 *
 * Author: Benjamin Segovia
 */

#include "cl_command_queue.h"
#include "cl_context.h"
#include "cl_program.h"
#include "cl_kernel.h"
#include "cl_device_id.h"
#include "cl_mem.h"
#include "cl_utils.h"
#include "cl_alloc.h"

#include <assert.h>
#include <stdio.h>
#include <string.h>

static INLINE size_t
cl_kernel_compute_batch_sz(cl_kernel k)
{
  return 256 + 32;
}

/* The "varying" payload is the part of the curbe that changes across threads in
 * the same work group. Right now, it consists of the local IDs and the block IPs.
 */
static cl_int
cl_set_varying_payload(const cl_kernel ker,
                       char *data,
                       const size_t *local_wk_sz,
                       size_t simd_sz,
                       size_t cst_sz,
                       size_t thread_n)
{
  uint32_t *ids[3] = {NULL, NULL, NULL};
  uint16_t *block_ips = NULL;
  size_t i, j, k, curr = 0;
  int32_t id_offset[3], ip_offset;
  cl_int err = CL_SUCCESS;

  id_offset[0] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0);
  id_offset[1] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0);
  id_offset[2] = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0);
  ip_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0);
  assert(id_offset[0] >= 0 &&
         id_offset[1] >= 0 &&
         id_offset[2] >= 0 &&
         ip_offset >= 0);

  TRY_ALLOC(ids[0], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz));
  TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz));

  /* 0xffff means that the lane is inactive */
  memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz);

  /* Compute the IDs and the block IPs */
  for (k = 0; k < local_wk_sz[2]; ++k)
    for (j = 0; j < local_wk_sz[1]; ++j)
      for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
        ids[0][curr] = i;
        ids[1][curr] = j;
        ids[2][curr] = k;
        block_ips[curr] = 0;
      }

  /* Copy them to the curbe buffer */
  curr = 0;
  for (i = 0; i < thread_n; ++i, data += cst_sz) {
    uint32_t *ids0 = (uint32_t *) (data + id_offset[0]);
    uint32_t *ids1 = (uint32_t *) (data + id_offset[1]);
    uint32_t *ids2 = (uint32_t *) (data + id_offset[2]);
    uint16_t *ips  = (uint16_t *) (data + ip_offset);
    for (j = 0; j < simd_sz; ++j, ++curr) {
      ids0[j] = ids[0][curr];
      ids1[j] = ids[1][curr];
      ids2[j] = ids[2][curr];
      ips[j] = block_ips[curr];
    }
  }

error:
  return err;
}
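
/* Gather the data of every __constant pointer argument (plus the program-level
 * constant arrays, if any) into a single constant buffer object, then patch each
 * such argument's curbe slot with the byte offset of its data inside that buffer. */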
static void
cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
{
  /* Calculate the constant buffer size */
  int32_t arg;
  size_t offset;
  gbe_program prog = ker->program->opaque;
  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
  size_t global_const_size = gbe_program_get_global_constant_size(prog);
  uint32_t constant_buf_size = 0;
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
      cl_mem mem = ker->args[arg].mem;
      constant_buf_size += ALIGN(mem->size, 4);
    }
  }
  if (global_const_size == 0 && constant_buf_size == 0)
    return;

  cl_buffer bo = cl_gpgpu_alloc_constant_buffer(queue->gpgpu,
                                                constant_buf_size + global_const_size + 4);
  cl_buffer_map(bo, 1);
  char *cst_addr = cl_buffer_get_virtual(bo);
  offset = 0;
  if (global_const_size > 0) {
    /* Write the global constant arrays */
    gbe_program_get_global_constant_data(prog, (char*)(cst_addr + offset));
  }
  offset += ALIGN(global_const_size, 4);

  if (global_const_size == 0) {
    /* Reserve 4 bytes to get rid of the 0 address */
    offset += 4;
  }

  /* Upload the constant buffer arguments */
  int32_t curbe_offset = 0;
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type == GBE_ARG_CONSTANT_PTR && ker->args[arg].mem) {
      cl_mem mem = ker->args[arg].mem;
      curbe_offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
      assert(curbe_offset >= 0);
      *(uint32_t *) (ker->curbe + curbe_offset) = offset;

      cl_buffer_map(mem->bo, 1);
      void *addr = cl_buffer_get_virtual(mem->bo);
      memcpy(cst_addr + offset, addr, mem->size);
      cl_buffer_unmap(mem->bo);
      offset += ALIGN(mem->size, 4);
    }
  }
  cl_buffer_unmap(bo);
}

/* Returns the total amount of SLM used */
static int32_t
cl_curbe_fill(cl_kernel ker,
              const uint32_t work_dim,
              const size_t *global_wk_off,
              const size_t *global_wk_sz,
              const size_t *local_wk_sz,
              size_t thread_n)
{
  int32_t offset;
#define UPLOAD(ENUM, VALUE) \
  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, ENUM, 0)) >= 0) \
    *((uint32_t *) (ker->curbe + offset)) = VALUE;
  UPLOAD(GBE_CURBE_LOCAL_SIZE_X, local_wk_sz[0]);
  UPLOAD(GBE_CURBE_LOCAL_SIZE_Y, local_wk_sz[1]);
  UPLOAD(GBE_CURBE_LOCAL_SIZE_Z, local_wk_sz[2]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_X, global_wk_sz[0]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Y, global_wk_sz[1]);
  UPLOAD(GBE_CURBE_GLOBAL_SIZE_Z, global_wk_sz[2]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_X, global_wk_off[0]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Y, global_wk_off[1]);
  UPLOAD(GBE_CURBE_GLOBAL_OFFSET_Z, global_wk_off[2]);
  UPLOAD(GBE_CURBE_GROUP_NUM_X, global_wk_sz[0]/local_wk_sz[0]);
  UPLOAD(GBE_CURBE_GROUP_NUM_Y, global_wk_sz[1]/local_wk_sz[1]);
  UPLOAD(GBE_CURBE_GROUP_NUM_Z, global_wk_sz[2]/local_wk_sz[2]);
  UPLOAD(GBE_CURBE_THREAD_NUM, thread_n);
  UPLOAD(GBE_CURBE_WORK_DIM, work_dim);
#undef UPLOAD

  /* Upload sampler information. */
  offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_SAMPLER_INFO, 0);
  if (offset >= 0) {
    uint32_t i;
    for (i = 0; i < ker->sampler_sz; i++, offset += 2) {
      *((uint16_t *) (ker->curbe + offset)) = ker->samplers[i] & 0xFF;
    }
  }

  /* Write identity for the stack pointer. This is required by the stack pointer
   * computation in the kernel */
  if ((offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_STACK_POINTER, 0)) >= 0) {
    const uint32_t simd_sz = gbe_kernel_get_simd_width(ker->opaque);
    uint32_t *stackptr = (uint32_t *) (ker->curbe + offset);
    int32_t i;
    for (i = 0; i < (int32_t) simd_sz; ++i)
      stackptr[i] = i;
  }

  /* Handle the various offsets to SLM */
  const int32_t arg_n = gbe_kernel_get_arg_num(ker->opaque);
  /* Align so that the kernel arguments get a good alignment */
  int32_t arg, slm_offset = ALIGN(gbe_kernel_get_slm_size(ker->opaque), 32);
  for (arg = 0; arg < arg_n; ++arg) {
    const enum gbe_arg_type type = gbe_kernel_get_arg_type(ker->opaque, arg);
    if (type != GBE_ARG_LOCAL_PTR)
      continue;
    offset = gbe_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_KERNEL_ARGUMENT, arg);
    assert(offset >= 0);
    uint32_t *slmptr = (uint32_t *) (ker->curbe + offset);
    *slmptr = slm_offset;
    slm_offset += ker->args[arg].local_sz;
  }
  return slm_offset;
}
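
/* Bind a stack buffer when the kernel needs one. gbe reports the stack size per
 * SIMD lane, so the total allocation scales with both the SIMD width and the
 * number of compute units. For illustration only (hypothetical numbers): a 1 KB
 * per-lane stack at SIMD16 on a device reporting 16 compute units would request
 * 1 KB * 16 * 16 = 256 KB. */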
static void
cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
{
  cl_context ctx = ker->program->ctx;
  cl_device_id device = ctx->device;
  const int32_t per_lane_stack_sz = gbe_kernel_get_stack_size(ker->opaque);
  const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
  const int32_t sub_value = GBE_STACK_BUFFER;
  const int32_t offset = gbe_kernel_get_curbe_offset(ker->opaque, value, sub_value);
  int32_t stack_sz = per_lane_stack_sz;

  /* No stack required for this kernel */
  if (per_lane_stack_sz == 0)
    return;

  /* The stack size is given for *each* SIMD lane. So, we accordingly compute
   * the size we need for the complete machine */
  assert(offset >= 0);
  stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
  stack_sz *= device->max_compute_unit;
  cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
}

static void
cl_setup_scratch(cl_gpgpu gpgpu, cl_kernel ker)
{
  int32_t scratch_sz = gbe_kernel_get_scratch_size(ker->opaque);
  cl_gpgpu_set_scratch(gpgpu, scratch_sz);
}
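
/* Gen7 ND-range enqueue path. Roughly:
 *   1. compute the number of HW threads from the work group size and SIMD width,
 *   2. fill the constant part of the curbe (cl_curbe_fill) and check SLM usage,
 *   3. bind surfaces, images, samplers, scratch, stack and the constant buffer,
 *   4. replicate the curbe per HW thread, add the varying payload
 *      (cl_set_varying_payload) and upload it,
 *   5. emit GPGPU_WALKER in a fresh batch buffer and submit it. */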
LOCAL cl_int
cl_command_queue_ND_range_gen7(cl_command_queue queue,
                               cl_kernel ker,
                               const uint32_t work_dim,
                               const size_t *global_wk_off,
                               const size_t *global_wk_sz,
                               const size_t *local_wk_sz)
{
  cl_context ctx = queue->ctx;
  cl_gpgpu gpgpu = queue->gpgpu;
  char *final_curbe = NULL; /* One curbe copy per HW thread, with the varying payload */
  cl_gpgpu_kernel kernel;
  const uint32_t simd_sz = cl_kernel_get_simd_width(ker);
  size_t i, batch_sz = 0u, local_sz = 0u;
  size_t cst_sz = ker->curbe_sz = gbe_kernel_get_curbe_size(ker->opaque);
  size_t thread_n = 0u;
  cl_int err = CL_SUCCESS;

  /* Setup kernel */
  kernel.name = "KERNEL";
  kernel.grf_blocks = 128;
  kernel.bo = ker->bo;
  kernel.barrierID = 0;
  kernel.slm_sz = 0;
  kernel.use_slm = gbe_kernel_use_slm(ker->opaque);

  /* Compute the number of HW threads we need */
  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
  kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
  kernel.curbe_sz = cst_sz;

  /* Curbe step 1: fill the constant urb buffer data shared by all threads */
  if (ker->curbe) {
    kernel.slm_sz = cl_curbe_fill(ker, work_dim, global_wk_off, global_wk_sz, local_wk_sz, thread_n);
    if (kernel.slm_sz > ker->program->ctx->device->local_mem_size)
      return CL_OUT_OF_RESOURCES;
  }

  /* Setup the kernel */
  if (queue->props & CL_QUEUE_PROFILING_ENABLE)
    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 1);
  else
    cl_gpgpu_state_init(gpgpu, ctx->device->max_compute_unit, cst_sz / 32, 0);

  /* Bind user buffers */
  cl_command_queue_bind_surface(queue, ker);
  /* Bind user images */
  cl_command_queue_bind_image(queue, ker);
  /* Bind all samplers */
  cl_gpgpu_bind_sampler(queue->gpgpu, ker->samplers, ker->sampler_sz);

  cl_setup_scratch(gpgpu, ker);
  /* Bind a stack if needed */
  cl_bind_stack(gpgpu, ker);
  cl_upload_constant_buffer(queue, ker);

  cl_gpgpu_states_setup(gpgpu, &kernel);

  /* Curbe step 2. Give the localID and upload it to video memory */
  if (ker->curbe) {
    assert(cst_sz > 0);
    TRY_ALLOC (final_curbe, (char*) alloca(thread_n * cst_sz));
    for (i = 0; i < thread_n; ++i) {
      memcpy(final_curbe + cst_sz * i, ker->curbe, cst_sz);
    }
    TRY (cl_set_varying_payload, ker, final_curbe, local_wk_sz, simd_sz, cst_sz, thread_n);
    cl_gpgpu_upload_curbes(gpgpu, final_curbe, thread_n*cst_sz);
  }

  /* Start a new batch buffer */
  batch_sz = cl_kernel_compute_batch_sz(ker);
  cl_gpgpu_batch_reset(gpgpu, batch_sz);
  cl_gpgpu_batch_start(gpgpu);

  /* Issue the GPGPU_WALKER command */
  cl_gpgpu_walker(gpgpu, simd_sz, thread_n, global_wk_off, global_wk_sz, local_wk_sz);

  /* Close the batch buffer and submit it */
  cl_gpgpu_batch_end(gpgpu, 0);

error:
  return err;
}