author     Junyan He <junyan.he@intel.com>   2016-04-03 23:36:10 +0800
committer  Junyan He <junyan.he@intel.com>   2016-04-03 23:36:10 +0800
commit     7100f990bf94eb5628051a6f2d858c1b5f6cd1bd (patch)
tree       44150a24cfe5c9cc341aa068f937018e633d842c /backend/src/driver
parent     9058ca4c36bb170e7c5b1ff6e0612101a3de9646 (diff)
ndrange
Diffstat (limited to 'backend/src/driver')
-rw-r--r--  backend/src/driver/cl_gen_gpu_state.cpp | 18
-rw-r--r--  backend/src/driver/cl_gen_gpu_state.hpp |  1
-rw-r--r--  backend/src/driver/cl_gen_kernel.cpp    | 65
3 files changed, 63 insertions(+), 21 deletions(-)
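
Taken together, the change wires three new per-enqueue steps into genEnqueueNDRangeKernel: patching image metadata into the curbe, binding the kernel's samplers, and allocating the per-thread scratch buffer. A minimal sketch of that flow, assuming the objects already set up earlier in the function (kernel, ker, curbe, ndRange and scratch_sz are names taken from the hunks below; this is not the literal driver code):

    cl_int err;

    if (ker->getImageSize()) {                        /* kernel reads at least one image */
      err = genGPUBindImage(curbe, *ndRange->gpuState, kernel, ker);
      if (err != CL_SUCCESS)
        return err;
    }

    if (ker->getSamplerSize()) {                      /* kernel declares samplers */
      err = genGPUBindSamplers(*ndRange->gpuState, kernel, ker);
      if (err != CL_SUCCESS)
        return err;
    }

    if (ndRange->gpuState->setScratch(scratch_sz) != 0)   /* per-thread scratch allocation */
      return CL_OUT_OF_RESOURCES;
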
diff --git a/backend/src/driver/cl_gen_gpu_state.cpp b/backend/src/driver/cl_gen_gpu_state.cpp
index addaa930..9b8f1124 100644
--- a/backend/src/driver/cl_gen_gpu_state.cpp
+++ b/backend/src/driver/cl_gen_gpu_state.cpp
@@ -171,6 +171,23 @@ void GenGPUState::setStack(uint32_t offset, uint32_t size, uint8_t bti)
this->bindBuf(this->stack_b.bo, offset, 0, size, bti);
}
+int GenGPUState::setScratch(uint32_t per_thread_size)
+{
+ uint32_t total = per_thread_size * this->max_threads;
+ /* Per Bspec, scratch should be 2X the desired size, otherwise luxmark may hang */
+ if (IS_HASWELL(device_id))
+ total *= 2;
+
+ this->per_thread_scratch = per_thread_size;
+
+ this->scratch_b.bo = drm_intel_bo_alloc(bufmgr, "SCRATCH_BO", total, 4096);
+ if (this->scratch_b.bo == NULL)
+ return -1;
+
+ return 0;
+}
+
+
bool GenGPUState::stateInit(uint32_t max_threads, uint32_t size_cs_entry)
{
drm_intel_bo *bo = NULL;
@@ -180,6 +197,7 @@ bool GenGPUState::stateInit(uint32_t max_threads, uint32_t size_cs_entry)
this->img_bitmap = 0;
this->img_index_base = 3;
this->sampler_bitmap = ~((1 << max_sampler_n) - 1);
+ this->per_thread_scratch = 0;
/* URB */
this->curb.num_cs_entries = 64;
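
The new setScratch() sizes one buffer object to cover every hardware thread, doubling the request on Haswell per the Bspec workaround noted in the comment. A self-contained sketch of just that sizing rule (the is_haswell flag stands in for the driver's IS_HASWELL(device_id) check; the numbers in the example are illustrative):

    #include <cstdint>

    /* Sketch of the scratch sizing used by setScratch(); not the driver code. */
    static uint32_t scratchTotalSize(uint32_t per_thread_size,
                                     uint32_t max_threads,
                                     bool is_haswell)
    {
      uint32_t total = per_thread_size * max_threads;
      if (is_haswell)
        total *= 2;   /* allocate twice the requested size, see the Bspec note */
      return total;
    }

    /* Example: 2048 bytes of scratch per thread with max_threads = 70 on Haswell
       gives 2048 * 70 * 2 = 286720 bytes, which setScratch() then hands to
       drm_intel_bo_alloc() with 4096-byte alignment. */
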
diff --git a/backend/src/driver/cl_gen_gpu_state.hpp b/backend/src/driver/cl_gen_gpu_state.hpp
index eafac148..54849f94 100644
--- a/backend/src/driver/cl_gen_gpu_state.hpp
+++ b/backend/src/driver/cl_gen_gpu_state.hpp
@@ -169,6 +169,7 @@ struct GenGPUState {
void bindBuf(drm_intel_bo *buf, uint32_t offset, uint32_t internal_offset,
size_t size, uint8_t bti);
void setStack(uint32_t offset, uint32_t size, uint8_t bti);
+ int setScratch(uint32_t per_thread_size);
bool stateInit(uint32_t max_threads, uint32_t size_cs_entry);
bool allocConstantBuffer(uint32_t size, uint8_t bti);
void batchStart(uint32_t use_slm);
diff --git a/backend/src/driver/cl_gen_kernel.cpp b/backend/src/driver/cl_gen_kernel.cpp
index 72a933fa..6bd6748f 100644
--- a/backend/src/driver/cl_gen_kernel.cpp
+++ b/backend/src/driver/cl_gen_kernel.cpp
@@ -391,7 +391,7 @@ static cl_int genGPUBindSurfaces(GenGPUState& gpuState, cl_command_queue queue,
return CL_SUCCESS;
}
-static cl_int genQueueBindImage(GenGPUState& gpuState, cl_kernel kernel, Kernel* ker)
+static cl_int genGPUBindImage(char* curbe, GenGPUState& gpuState, cl_kernel kernel, Kernel* ker)
{
uint32_t i;
size_t image_sz = ker->getImageSize();
@@ -406,32 +406,40 @@ static cl_int genQueueBindImage(GenGPUState& gpuState, cl_kernel kernel, Kernel*
image = cl_mem_to_image(kernel->args[id]->mem);
-
-
-#if 0
- set_image_info(k->curbe, &k->images[i], image);
- if(k->vme) {
- if( (image->fmt.image_channel_order != CL_R) || (image->fmt.image_channel_data_type != CL_UNORM_INT8) )
- return CL_IMAGE_FORMAT_NOT_SUPPORTED;
- cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
- image->intel_fmt, image->image_type, image->bpp,
- image->w, image->h, image->depth,
- image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
- } else
- cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset + k->args[id].mem->offset,
- image->intel_fmt, image->image_type, image->bpp,
- image->w, image->h, image->depth,
- image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ if (images[i].wSlot >= 0)
+ *(uint32_t*)(curbe + images[i].wSlot) = image->w;
+ if (images[i].hSlot >= 0)
+ *(uint32_t*)(curbe + images[i].hSlot) = image->h;
+ if (images[i].depthSlot >= 0)
+ *(uint32_t*)(curbe + images[i].depthSlot) = image->depth;
+ if (images[i].channelOrderSlot >= 0)
+ *(uint32_t*)(curbe + images[i].channelOrderSlot) = image->fmt.image_channel_order;
+ if (images[i].dataTypeSlot >= 0)
+ *(uint32_t*)(curbe + images[i].dataTypeSlot) = image->fmt.image_channel_data_type;
+ /*
+ gpuState.bindImage(images[i].idx, image->base.bo,
+ image->offset + k->args[id].mem->offset, image->intel_fmt,
+ image->image_type, image->bpp, image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
// TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
// on demand.
if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_WORKAROUND_IMAGE_OFFSET, image->base.bo, image->offset + k->args[id].mem->offset,
- image->intel_fmt, image->image_type, image->bpp,
- image->w, image->h, image->depth,
- image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+ image->intel_fmt, image->image_type, image->bpp,
+ image->w, image->h, image->depth,
+ image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
-#endif
+ */
}
+ return CL_SUCCESS;
+}
+
+static cl_int genGPUBindSamplers(GenGPUState& gpuState, cl_kernel kernel, Kernel* ker)
+{
+ size_t sz = ker->getSamplerSize();
+ uint32_t *samplers = (uint32_t *)alloca(GEN_MAX_SAMPLERS * sizeof(uint32_t));
+ ker->getSamplerData(samplers);
+ gpuState.bindSamplers(samplers, sz);
return CL_SUCCESS;
}
@@ -505,5 +513,20 @@ cl_int genEnqueueNDRangeKernel(cl_command_queue queue, cl_kernel kernel, const u
if (err != CL_SUCCESS)
return err;
+ if (ker->getImageSize()) {
+ err = genGPUBindImage(curbe, *ndRange->gpuState, kernel, ker);
+ if (err != CL_SUCCESS)
+ return err;
+ }
+
+ if (ker->getSamplerSize()) {
+ err = genGPUBindSamplers(*ndRange->gpuState, kernel, ker);
+ if (err != CL_SUCCESS)
+ return err;
+ }
+ if (ndRange->gpuState->setScratch(scratch_sz) != 0)
+ return CL_OUT_OF_RESOURCES;
+
}
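
The curbe writes in genGPUBindImage() replace the old #if 0 surface-binding block: image dimensions and format enums are now patched straight into the curbe payload at byte offsets recorded by the compiler, with a negative slot meaning the kernel never reads that field. A sketch of that pattern, using a hypothetical ImageSlot struct in place of whatever ker->getImageData() really fills in (memcpy is used instead of the cast-and-store to sidestep alignment concerns):

    #include <cstdint>
    #include <cstring>

    /* Hypothetical per-image slot table; each member is a byte offset into the
       curbe, or -1 when the kernel does not use that piece of metadata. */
    struct ImageSlot {
      int32_t wSlot, hSlot, depthSlot, channelOrderSlot, dataTypeSlot;
    };

    static void patchSlot(char *curbe, int32_t slot, uint32_t value)
    {
      if (slot >= 0)
        memcpy(curbe + slot, &value, sizeof(value));
    }

    /* For each image argument, copy its metadata into the curbe payload. */
    static void patchImageInfo(char *curbe, const ImageSlot &img,
                               uint32_t w, uint32_t h, uint32_t depth,
                               uint32_t channel_order, uint32_t data_type)
    {
      patchSlot(curbe, img.wSlot, w);
      patchSlot(curbe, img.hSlot, h);
      patchSlot(curbe, img.depthSlot, depth);
      patchSlot(curbe, img.channelOrderSlot, channel_order);
      patchSlot(curbe, img.dataTypeSlot, data_type);
    }
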