diff options
author | Zhigang Gong <zhigang.gong@intel.com> | 2014-09-04 08:01:25 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2014-09-04 11:53:04 +0800 |
commit | 9ce75d4de41e8359f411ce5ab3dbd2cd363364b2 (patch) | |
tree | 96e63bf32d8826d6de3402373fccdb7674399ea8 | |
parent | 56d2cf2e02b6933478f080548f6960751ff18741 (diff) |
GBE: fixup/refine a bug for image1D array's extra binding index handling.
Due to hardware limitation on Gen7/Gen75 when sampling a
surface with clamp address mode and nearest filter mode
on a integer image1Darray type surface, we have to bind
one buffer to to bti. The previous implementation hard
coded it to 128 + original index and when check whether
it is such type bti in driver layer, assume the bti reserved
is 3 which is wrong now.
This patch fixed those hard coded functions and use the
macros defined in the program.h.
Signed-off-by: Zhigang Gong <zhigang.gong@intel.com>
Reviewed-by: "Song, Ruiling" <ruiling.song@intel.com>
-rw-r--r-- | backend/src/backend/gen_insn_selection.cpp | 2 | ||||
-rw-r--r-- | backend/src/backend/program.h | 2 | ||||
-rw-r--r-- | backend/src/llvm/llvm_gen_backend.cpp | 13 | ||||
-rw-r--r-- | src/cl_command_queue.c | 5 | ||||
-rw-r--r-- | src/intel/intel_gpgpu.c | 28 |
5 files changed, 35 insertions, 15 deletions
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index abbff77..d631579 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -3862,7 +3862,7 @@ namespace gbe msgLen = srcNum; } // We switch to a fixup bti for linear filter on a image1d array sampling. - uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? 128 : 0); + uint32_t bti = insn.getImageIndex() + (insn.getSamplerOffset() == 2 ? BTI_MAX_IMAGE_NUM : 0); if (bti > 253) { std::cerr << "Too large bti " << bti; return false; diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index 254df92..c63ae6a 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -66,6 +66,8 @@ enum gbe_get_arg_info_value { #define BTI_CONSTANT 0 #define BTI_PRIVATE 1 #define BTI_RESERVED_NUM 2 +#define BTI_MAX_IMAGE_NUM 128 +#define BTI_MAX_ID (BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM - 1) /*! Constant buffer values (ie values to setup in the constant buffer) */ enum gbe_curbe_type { diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 1604ede..738f7d3 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -500,6 +500,11 @@ namespace gbe ir::ImmediateIndex processConstantImmIndex(Constant *CPV, int32_t index = 0u); const ir::Immediate &processConstantImm(Constant *CPV, int32_t index = 0u); + uint32_t incBtiBase() { + GBE_ASSERT(btiBase <= BTI_MAX_ID); + return btiBase++; + } + bool runOnFunction(Function &F) { // Do not codegen any 'available_externally' functions at all, they have // definitions outside the translation unit. @@ -1363,7 +1368,7 @@ namespace gbe globalPointer.insert(std::make_pair(I, btiBase)); ctx.appendSurface(btiBase, reg); ctx.input(argName, ir::FunctionArgument::GLOBAL_POINTER, reg, llvmInfo, ptrSize, align, btiBase); - btiBase++; + incBtiBase(); break; case ir::MEM_LOCAL: ctx.input(argName, ir::FunctionArgument::LOCAL_POINTER, reg, llvmInfo, ptrSize, align, 0xfe); @@ -1374,7 +1379,7 @@ namespace gbe break; case ir::IMAGE: ctx.input(argName, ir::FunctionArgument::IMAGE, reg, llvmInfo, ptrSize, align, 0x0); - ctx.getFunction().getImageSet()->append(reg, &ctx, btiBase++); + ctx.getFunction().getImageSet()->append(reg, &ctx, incBtiBase()); break; default: GBE_ASSERT(addrSpace != ir::MEM_PRIVATE); } @@ -1720,12 +1725,12 @@ namespace gbe if(v.getName().equals(StringRef("__gen_ocl_printf_buf"))) { ctx.appendSurface(btiBase, ir::ocl::printfbptr); ctx.getFunction().getPrintfSet()->setBufBTI(btiBase); - globalPointer.insert(std::make_pair(&v, btiBase++)); + globalPointer.insert(std::make_pair(&v, incBtiBase())); regTranslator.newScalarProxy(ir::ocl::printfbptr, const_cast<GlobalVariable*>(&v)); } else if(v.getName().equals(StringRef("__gen_ocl_printf_index_buf"))) { ctx.appendSurface(btiBase, ir::ocl::printfiptr); ctx.getFunction().getPrintfSet()->setIndexBufBTI(btiBase); - globalPointer.insert(std::make_pair(&v, btiBase++)); + globalPointer.insert(std::make_pair(&v, incBtiBase())); regTranslator.newScalarProxy(ir::ocl::printfiptr, const_cast<GlobalVariable*>(&v)); } else if(v.getName().str().substr(0, 4) == ".str") { /* When there are multi printf statements in multi kernel fucntions within the same diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 52e91ae..4cbb4eb 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -17,6 +17,7 @@ * Author: Benjamin Segovia <benjamin.segovia@intel.com> */ +#include "program.h" // for BTI_MAX_IMAGE_NUM #include "cl_command_queue.h" #include "cl_context.h" #include "cl_program.h" @@ -142,8 +143,10 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k) image->intel_fmt, image->image_type, image->w, image->h, image->depth, image->row_pitch, (cl_gpgpu_tiling)image->tiling); + // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer + // on demand. if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY) - cl_gpgpu_bind_image(gpgpu, k->images[i].idx + 128, image->base.bo, image->offset, + cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset, image->intel_fmt, image->image_type, image->w, image->h, image->depth, image->row_pitch, (cl_gpgpu_tiling)image->tiling); diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 867ab4c..c4b9156 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -35,6 +35,7 @@ #include "intel/intel_structs.h" #include "intel/intel_batchbuffer.h" #include "intel/intel_driver.h" +#include "program.h" // for BTI_RESERVED_NUM #include "cl_alloc.h" #include "cl_utils.h" @@ -819,6 +820,22 @@ intel_get_surface_type(cl_mem_object_type type) return 0; } +/* Get fixed surface type. If it is a 1D array image with a large index, + we need to fixup it to 2D type due to a Gen7/Gen75's sampler issue + on a integer type surface with clamp address mode and nearest filter mode. +*/ +static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type) +{ + uint32_t surface_type; + if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) && + index >= 128 + BTI_RESERVED_NUM && + type == CL_MEM_OBJECT_IMAGE1D_ARRAY) + surface_type = I965_SURFACE_2D; + else + surface_type = intel_get_surface_type(type); + return surface_type; +} + static void intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu, uint32_t index, @@ -836,12 +853,8 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu, gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; memset(ss, 0, sizeof(*ss)); - ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 - if (index > 128 + 2 && type == CL_MEM_OBJECT_IMAGE1D_ARRAY) - ss->ss0.surface_type = I965_SURFACE_2D; - else - ss->ss0.surface_type = intel_get_surface_type(type); + ss->ss0.surface_type = get_surface_type(gpgpu, index, type); if (intel_is_surface_array(type)) { ss->ss0.surface_array = 1; ss->ss0.surface_array_spacing = 1; @@ -886,10 +899,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu, gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; memset(ss, 0, sizeof(*ss)); ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2 - if (index > 128 + 2 && type == CL_MEM_OBJECT_IMAGE1D_ARRAY) - ss->ss0.surface_type = I965_SURFACE_2D; - else - ss->ss0.surface_type = intel_get_surface_type(type); + ss->ss0.surface_type = get_surface_type(gpgpu, index, type); if (intel_is_surface_array(type)) { ss->ss0.surface_array = 1; ss->ss0.surface_array_spacing = 1; |