diff options
author | Yang Rong <rong.r.yang@intel.com> | 2014-06-12 19:42:12 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2014-06-12 13:56:31 +0800 |
commit | 94d8536f043ea93a1c0750877a4cb346ed2b1607 (patch) | |
tree | be96fabf5a4367da8e3dd4d4fb997cfc3ae5a185 | |
parent | 9e88dec0a78a26432bcf6545f107131219c18699 (diff) |
HSW: Fix potential issue on GT3 when calculating the stack address.
GT3 has 4 half slices, so the stack address calculation should shift left by 2 bits, and the stack buffer size should also be enlarged;
otherwise, if thread generation is unbalanced, stack accesses may go out of bounds.
Per the Bspec, the scratch size needs to be set to 2X the desired size.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
-rw-r--r-- | backend/src/backend/gen75_context.cpp | 4 | ||||
-rw-r--r-- | src/cl_command_queue_gen7.c | 7 | ||||
-rw-r--r-- | src/intel/intel_gpgpu.c | 3 |
3 files changed, 12 insertions, 2 deletions
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp index aedd4d3c..da0db854 100644 --- a/backend/src/backend/gen75_context.cpp +++ b/backend/src/backend/gen75_context.cpp @@ -92,12 +92,12 @@ namespace gbe p->curr.predicate = GEN_PREDICATE_NONE; //p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x1ff)); p->AND(GenRegister::ud1grf(126,0), GenRegister::ud1grf(0,5), GenRegister::immud(0x7f)); - p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x80)); + p->AND(GenRegister::ud1grf(126,4), GenRegister::ud1grf(0,5), GenRegister::immud(0x180)); p->SHR(GenRegister::ud1grf(126,4), GenRegister::ud1grf(126, 4), GenRegister::immud(7)); p->curr.execWidth = this->simdWidth; p->SHL(stackptr, stackptr, GenRegister::immud(perLaneShift)); p->curr.execWidth = 1; - p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(1)); + p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(2)); p->ADD(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::ud1grf(126, 4)); p->SHL(GenRegister::ud1grf(126,0), GenRegister::ud1grf(126,0), GenRegister::immud(perThreadShift)); p->curr.execWidth = this->simdWidth; diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 2223f4f4..978650a6 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -244,6 +244,13 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker) assert(offset >= 0); stack_sz *= interp_kernel_get_simd_width(ker->opaque); stack_sz *= device->max_compute_unit; + /* Because HSW calc stack offset per thread is relative with half slice, when + thread schedule in half slice is not balance, would out of bound. Because + the max half slice is 4 in GT4, multiply stack size with 4 for safe. 
+ */ + if(cl_driver_get_ver(ctx->drv) == 75) + stack_sz *= 4; + cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cl_gpgpu_get_cache_ctrl()); } diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 50935837..cae843bf 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -833,6 +833,9 @@ intel_gpgpu_set_scratch(intel_gpgpu_t * gpgpu, uint32_t per_thread_size) drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr; drm_intel_bo* old = gpgpu->scratch_b.bo; uint32_t total = per_thread_size * gpgpu->max_threads; + /* Per Bspec, scratch should 2X the desired size, otherwise luxmark may hang */ + if (IS_HASWELL(gpgpu->drv->device_id)) + total *= 2; gpgpu->per_thread_scratch = per_thread_size; |