diff options
author | Junyan He <junyan.he@intel.com> | 2016-04-03 23:36:10 +0800 |
---|---|---|
committer | Junyan He <junyan.he@intel.com> | 2016-04-03 23:36:10 +0800 |
commit | 7100f990bf94eb5628051a6f2d858c1b5f6cd1bd (patch) | |
tree | 44150a24cfe5c9cc341aa068f937018e633d842c /backend/src/driver | |
parent | 9058ca4c36bb170e7c5b1ff6e0612101a3de9646 (diff) |
ndrange
Diffstat (limited to 'backend/src/driver')
-rw-r--r-- | backend/src/driver/cl_gen_gpu_state.cpp | 18 | ||||
-rw-r--r-- | backend/src/driver/cl_gen_gpu_state.hpp | 1 | ||||
-rw-r--r-- | backend/src/driver/cl_gen_kernel.cpp | 65 |
3 files changed, 63 insertions, 21 deletions
diff --git a/backend/src/driver/cl_gen_gpu_state.cpp b/backend/src/driver/cl_gen_gpu_state.cpp index addaa930..9b8f1124 100644 --- a/backend/src/driver/cl_gen_gpu_state.cpp +++ b/backend/src/driver/cl_gen_gpu_state.cpp @@ -171,6 +171,23 @@ void GenGPUState::setStack(uint32_t offset, uint32_t size, uint8_t bti) this->bindBuf(this->stack_b.bo, offset, 0, size, bti); } +int GenGPUState::setScratch(uint32_t per_thread_size) +{ + uint32_t total = per_thread_size * this->max_threads; + /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */ + if (IS_HASWELL(device_id)) + total *= 2; + + this->per_thread_scratch = per_thread_size; + + this->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096); + if (this->scratch_b.bo == NULL) + return -1; + + return 0; +} + + bool GenGPUState::stateInit(uint32_t max_threads, uint32_t size_cs_entry) { drm_intel_bo *bo = NULL; @@ -180,6 +197,7 @@ bool GenGPUState::stateInit(uint32_t max_threads, uint32_t size_cs_entry) this->img_bitmap = 0; this->img_index_base = 3; this->sampler_bitmap = ~((1 << max_sampler_n) - 1); + this->per_thread_scratch = 0; /* URB */ this->curb.num_cs_entries = 64; diff --git a/backend/src/driver/cl_gen_gpu_state.hpp b/backend/src/driver/cl_gen_gpu_state.hpp index eafac148..54849f94 100644 --- a/backend/src/driver/cl_gen_gpu_state.hpp +++ b/backend/src/driver/cl_gen_gpu_state.hpp @@ -169,6 +169,7 @@ struct GenGPUState { void bindBuf(drm_intel_bo *buf, uint32_t offset, uint32_t internal_offset, size_t size, uint8_t bti); void setStack(uint32_t offset, uint32_t size, uint8_t bti); + int setScratch(uint32_t per_thread_size); bool stateInit(uint32_t max_threads, uint32_t size_cs_entry); bool allocConstantBuffer(uint32_t size, uint8_t bti); void batchStart(uint32_t use_slm); diff --git a/backend/src/driver/cl_gen_kernel.cpp b/backend/src/driver/cl_gen_kernel.cpp index 72a933fa..6bd6748f 100644 --- a/backend/src/driver/cl_gen_kernel.cpp +++ b/backend/src/driver/cl_gen_kernel.cpp @@ -391,7 +391,7 @@ static cl_int genGPUBindSurfaces(GenGPUState& gpuState, cl_command_queue queue, return CL_SUCCESS; } -static cl_int genQueueBindImage(GenGPUState& gpuState, cl_kernel kernel, Kernel* ker) +static cl_int genGPUBindImage(char* curbe, GenGPUState& gpuState, cl_kernel kernel, Kernel* ker) { uint32_t i; size_t image_sz = ker->getImageSize(); @@ -406,32 +406,40 @@ static cl_int genQueueBindImage(GenGPUState& gpuState, cl_kernel kernel, Kernel* image = cl_mem_to_image(kernel->args[id]->mem); - - -#if 0 - set_image_info(k->curbe, &k->images[i], image); - if(k->vme) { - if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) ) - return CL_IMAGE_FORMAT_NOT_SUPPORTED; - cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset, - image->intel_fmt, image->image_type, image->bpp, - image->w, image->h, image->depth, - image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); - } else - cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset, - image->intel_fmt, image->image_type, image->bpp, - image->w, image->h, image->depth, - image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); + if (images[i].wSlot >= 0) + *(uint32_t*)(curbe + images[i].wSlot) = image->w; + if (images[i].hSlot >= 0) + *(uint32_t*)(curbe + images[i].hSlot) = image->h; + if (images[i].depthSlot >= 0) + *(uint32_t*)(curbe + images[i].depthSlot) = image->depth; + if (images[i].channelOrderSlot >= 0) + *(uint32_t*)(curbe + images[i].channelOrderSlot) = image->fmt.image_channel_order; + if (images[i].dataTypeSlot >= 0) + *(uint32_t*)(curbe + images[i].dataTypeSlot) = image->fmt.image_channel_data_type; + /* + gpuState.bindImage(images[i].idx, image->base.bo, + image->offset + k->args[id].mem->offset, image->intel_fmt, + image->image_type, image->bpp, image->w, image->h, image->depth, + image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer // on demand. if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset, - image->intel_fmt, image->image_type, image->bpp, - image->w, image->h, image->depth, - image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); + image->intel_fmt, image->image_type, image->bpp, + image->w, image->h, image->depth, + image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling); -#endif + */ + return CL_SUCCESS; } +} + +static cl_int genGPUBindSamplers(GenGPUState& gpuState, cl_kernel kernel, Kernel* ker) +{ + size_t sz = ker->getSamplerSize(); + uint32_t *samplers = (uint32_t *)alloca(GEN_MAX_SAMPLERS * sizeof(uint32_t)); + ker->getSamplerData(samplers); + gpuState.bindSamplers(samplers, sz); return CL_SUCCESS; } @@ -505,5 +513,20 @@ cl_int genEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel kernel, const u if (err != CL_SUCCESS) return err; + if (ker->getImageSize()) { + err = genGPUBindImage(curbe, *ndRange->gpuState, kernel, ker); + if (err != CL_SUCCESS) + return err; + } + + if (ker->getSamplerSize()) { + err = genGPUBindSamplers(*ndRange->gpuState, kernel, ker); + if (err != CL_SUCCESS) + return err; + } + err = ndRange->gpuState->setScratch(scratch_sz); + if (err != CL_SUCCESS) + return err; + } |