From bdf98d61c66c98c65f8e17001fd08d11eef7159b Mon Sep 17 00:00:00 2001 From: Junyan He Date: Mon, 4 Apr 2016 00:22:53 +0800 Subject: ndrange --- backend/src/driver/cl_gen_kernel.cpp | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'backend') diff --git a/backend/src/driver/cl_gen_kernel.cpp b/backend/src/driver/cl_gen_kernel.cpp index 6bd6748f..5b70b197 100644 --- a/backend/src/driver/cl_gen_kernel.cpp +++ b/backend/src/driver/cl_gen_kernel.cpp @@ -443,6 +443,34 @@ static cl_int genGPUBindSamplers(GenGPUState& gpuState, cl_kernel kernel, Kernel return CL_SUCCESS; } +static void genGPUSetStack(GenGPUState& gpuState, Kernel* ker, + cl_device_id device ,GenGPUDevice* gpuDev) +{ + const int32_t per_lane_stack_sz = ker->getStackSize(); + const int32_t offset = ker->getCurbeOffset(GBE_CURBE_EXTRA_ARGUMENT, GBE_STACK_BUFFER); + int32_t stack_sz = per_lane_stack_sz; + + /* No stack required for this kernel */ + if (per_lane_stack_sz == 0) + return; + + /* The stack size is given for *each* SIMD lane. So, we accordingly compute + * the size we need for the complete machine + */ + assert(offset >= 0); + stack_sz *= ker->getSIMDWidth(); + stack_sz *= device->max_compute_unit * gpuDev->max_thread_per_unit; + + /* Because HSW calc stack offset per thread is relative with half slice, when + thread schedule in half slice is not balance, would out of bound. Because + the max half slice is 4 in GT4, multiply stack size with 4 for safe. + */ + if(gpuDev->gen_ver == 75) + stack_sz *= 4; + + gpuState.setStack(offset, stack_sz, BTI_PRIVATE); +} + extern "C" cl_int genEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel kernel, const uint32_t work_dim, const size_t *global_wk_off, const size_t *global_wk_sz, @@ -528,5 +556,9 @@ cl_int genEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel kernel, const u if (err != CL_SUCCESS) return err; + /* Bind a stack if needed */ + genGPUSetStack(*ndRange->gpuState, ker, queue->device, gpuDev); + + } -- cgit v1.2.3