-rw-r--r--  kernels/compiler_array.cl       1
-rw-r--r--  kernels/compiler_array0.cl     16
-rw-r--r--  src/cl_command_queue_gen7.c     5
-rw-r--r--  src/cl_driver.c                 1
-rw-r--r--  src/cl_driver.h                 6
-rw-r--r--  src/intel/intel_gpgpu.c       339
-rw-r--r--  src/sim/sim_driver.c           19
-rw-r--r--  utests/CMakeLists.txt           3
-rw-r--r--  utests/compiler_array0.cpp     73
9 files changed, 286 insertions(+), 177 deletions(-)
diff --git a/kernels/compiler_array.cl b/kernels/compiler_array.cl
index 7bd6060e..5dce4d9e 100644
--- a/kernels/compiler_array.cl
+++ b/kernels/compiler_array.cl
@@ -12,4 +12,3 @@ compiler_array(__global int *src, __global int *dst)
dst[get_global_id(0)] = array[get_local_id(0)];
}
-
diff --git a/kernels/compiler_array0.cl b/kernels/compiler_array0.cl
new file mode 100644
index 00000000..3ab0fb8b
--- /dev/null
+++ b/kernels/compiler_array0.cl
@@ -0,0 +1,16 @@
+__kernel void
+compiler_array0(__global int *src, __global int *dst)
+{
+ int i;
+ int final[16];
+ for (i = 0; i < 16; ++i) {
+ int array[16], j;
+ for (j = 0; j < 16; ++j)
+ array[j] = get_global_id(0);
+ for (j = 0; j < src[0]; ++j)
+ array[j] = 1+src[j];
+ final[i] = array[i];
+ }
+ dst[get_global_id(0)] = final[get_global_id(0)];
+}
+
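The second inner loop's trip count depends on src[0], which is only known at run time, so the compiler cannot fully promote array[] to registers; this is what forces a per-lane stack and motivates the driver changes below. Since final[i] ends up holding array[i], the kernel collapses to a closed form; a minimal reference model in plain C (a sketch, not part of the patch, assuming a global size of 16 and src values below 16 as in the unit test added at the end):

    /* What dst[gid] should hold after the kernel runs: slots below
     * src[0] are overwritten with 1+src[j], the rest keep the
     * work-item id. */
    static int compiler_array0_expected(int gid, const int *src)
    {
        return (gid < src[0]) ? 1 + src[gid] : gid;
    }
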
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 0d4ffb58..5da418cf 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -141,8 +141,6 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
{
cl_context ctx = ker->program->ctx;
cl_device_id device = ctx->device;
- cl_buffer_mgr bufmgr = cl_context_get_bufmgr(ctx);
- cl_buffer buffer = NULL;
const int32_t per_lane_stack_sz = gbe_kernel_get_stack_size(ker->opaque);
const int32_t value = GBE_CURBE_EXTRA_ARGUMENT;
const int32_t sub_value = GBE_STACK_BUFFER;
@@ -160,8 +158,7 @@ cl_bind_stack(cl_gpgpu gpgpu, cl_kernel ker)
stack_sz *= gbe_kernel_get_simd_width(ker->opaque);
stack_sz *= device->max_compute_unit;
stack_sz *= device->max_thread_per_unit;
- buffer = cl_buffer_alloc(bufmgr, NULL, stack_sz, 64);
- cl_gpgpu_bind_buf(gpgpu, buffer, offset, cc_llc_l3);
+ cl_gpgpu_set_stack(gpgpu, offset, stack_sz, cc_llc_l3);
}
LOCAL cl_int
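The command queue no longer allocates the stack buffer itself; it only computes the worst-case size and hands it to the driver through the new cl_gpgpu_set_stack hook. The sizing is pessimistic on purpose: one private slice per SIMD lane, for every hardware thread the device can keep in flight. A standalone sketch of the arithmetic (the figures in the trailing comment are illustrative, not from a real device):

    #include <stdint.h>

    /* Worst-case stack buffer size: per-lane bytes scaled by the SIMD
     * width of the compiled kernel and by the maximum thread count
     * (compute units times threads per unit). */
    static inline int32_t
    stack_buffer_size(int32_t per_lane_sz, int32_t simd_width,
                      int32_t compute_units, int32_t threads_per_unit)
    {
        return per_lane_sz * simd_width * compute_units * threads_per_unit;
    }
    /* e.g. 1 KB per lane, SIMD16, 16 units, 8 threads per unit -> 2 MB. */
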
diff --git a/src/cl_driver.c b/src/cl_driver.c
index 9c8bfa26..66d805d8 100644
--- a/src/cl_driver.c
+++ b/src/cl_driver.c
@@ -45,6 +45,7 @@ LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
LOCAL cl_gpgpu_new_cb *cl_gpgpu_new = NULL;
LOCAL cl_gpgpu_delete_cb *cl_gpgpu_delete = NULL;
LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
+LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
LOCAL cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D = NULL;
LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
LOCAL cl_gpgpu_set_perf_counters_cb *cl_gpgpu_set_perf_counters = NULL;
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 75df8dd9..1caf0555 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,7 +23,7 @@
#include <stdint.h>
#include <stdlib.h>
-/* XXX needed for previous driver */
+/* Various limitations we should remove actually */
#define GEN_MAX_SURFACES 128
#define GEN_MAX_SAMPLERS 16
@@ -118,6 +118,10 @@ typedef void (cl_gpgpu_bind_image2D_cb)(cl_gpgpu state,
cl_gpgpu_tiling tiling);
extern cl_gpgpu_bind_image2D_cb *cl_gpgpu_bind_image2D;
+/* Setup a stack */
+typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
+extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
+
/* Configure internal state */
typedef void (cl_gpgpu_state_init_cb)(cl_gpgpu, uint32_t max_threads, uint32_t size_cs_entry);
extern cl_gpgpu_state_init_cb *cl_gpgpu_state_init;
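Every backend that fills in the driver callback table now has to provide this hook as well; both real implementations follow later in the patch and share the same shape. A toy version of the contract, with all names illustrative rather than taken from either backend:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch only: stash a `size`-byte stack and remember the curbe slot
     * `offset` where its address must be patched; `cchint` is the cache
     * control hint real drivers forward to their bind function. */
    typedef struct { void *stack; uint32_t stack_offset; } toy_gpgpu_t;

    static void
    toy_gpgpu_set_stack(toy_gpgpu_t *gpgpu, uint32_t offset, uint32_t size,
                        uint32_t cchint)
    {
        (void) cchint;                /* the toy has no cache to hint */
        gpgpu->stack = malloc(size);  /* real drivers allocate a bo   */
        gpgpu->stack_offset = offset;
    }
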
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index dd284e08..4e42afbd 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -63,6 +63,7 @@ struct intel_gpgpu
uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
uint32_t binded_n; /* number of buffers binded */
+ struct { drm_intel_bo *bo; } stack_b;
struct { drm_intel_bo *bo; } idrt_b;
struct { drm_intel_bo *bo; } surface_heap_b;
struct { drm_intel_bo *bo; } vfe_state_b;
@@ -81,24 +82,26 @@ struct intel_gpgpu
typedef struct intel_gpgpu intel_gpgpu_t;
static void
-intel_gpgpu_delete(intel_gpgpu_t *state)
+intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
{
- if (state == NULL)
+ if (gpgpu == NULL)
return;
- if (state->surface_heap_b.bo)
- drm_intel_bo_unreference(state->surface_heap_b.bo);
- if (state->idrt_b.bo)
- drm_intel_bo_unreference(state->idrt_b.bo);
- if (state->vfe_state_b.bo)
- drm_intel_bo_unreference(state->vfe_state_b.bo);
- if (state->curbe_b.bo)
- drm_intel_bo_unreference(state->curbe_b.bo);
- if (state->sampler_state_b.bo)
- drm_intel_bo_unreference(state->sampler_state_b.bo);
- if (state->perf_b.bo)
- drm_intel_bo_unreference(state->perf_b.bo);
- intel_batchbuffer_delete(state->batch);
- cl_free(state);
+ if (gpgpu->surface_heap_b.bo)
+ drm_intel_bo_unreference(gpgpu->surface_heap_b.bo);
+ if (gpgpu->idrt_b.bo)
+ drm_intel_bo_unreference(gpgpu->idrt_b.bo);
+ if (gpgpu->vfe_state_b.bo)
+ drm_intel_bo_unreference(gpgpu->vfe_state_b.bo);
+ if (gpgpu->curbe_b.bo)
+ drm_intel_bo_unreference(gpgpu->curbe_b.bo);
+ if (gpgpu->sampler_state_b.bo)
+ drm_intel_bo_unreference(gpgpu->sampler_state_b.bo);
+ if (gpgpu->perf_b.bo)
+ drm_intel_bo_unreference(gpgpu->perf_b.bo);
+ if (gpgpu->stack_b.bo)
+ drm_intel_bo_unreference(gpgpu->stack_b.bo);
+ intel_batchbuffer_delete(gpgpu->batch);
+ cl_free(gpgpu);
}
static intel_gpgpu_t*
@@ -121,98 +124,96 @@ error:
}
static void
-intel_gpgpu_select_pipeline(intel_gpgpu_t *state)
+intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
{
- BEGIN_BATCH(state->batch, 1);
- OUT_BATCH(state->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
- ADVANCE_BATCH(state->batch);
+ BEGIN_BATCH(gpgpu->batch, 1);
+ OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
+ ADVANCE_BATCH(gpgpu->batch);
}
static void
-intel_gpgpu_set_base_address(intel_gpgpu_t *state)
+intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
{
const uint32_t def_cc = cc_llc_l3; /* default Cache Control value */
- BEGIN_BATCH(state->batch, 10);
- OUT_BATCH(state->batch, CMD_STATE_BASE_ADDRESS | 8);
+ BEGIN_BATCH(gpgpu->batch, 10);
+ OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 8);
/* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
- OUT_BATCH(state->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY); /* General State Base Addr */
/* 0, State Mem Obj CC */
/* We use a state base address for the surface heap since IVB clamp the
* binding table pointer at 11 bits. So, we cannot use pointers directly while
* using the surface heap
*/
- OUT_RELOC(state->batch, state->surface_heap_b.bo,
+ OUT_RELOC(gpgpu->batch, gpgpu->surface_heap_b.bo,
I915_GEM_DOMAIN_INSTRUCTION,
I915_GEM_DOMAIN_INSTRUCTION,
0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY);
- OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
- OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
- OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
/* If we output an AUB file, we limit the total size to 64MB */
#if USE_FULSIM
- OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
- OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
- OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
- OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+ OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
#else
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(gpgpu->batch, 0 | BASE_ADDRESS_MODIFY);
#endif /* USE_FULSIM */
- ADVANCE_BATCH(state->batch);
+ ADVANCE_BATCH(gpgpu->batch);
}
static void
-intel_gpgpu_load_vfe_state(intel_gpgpu_t *state)
+intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
{
- BEGIN_BATCH(state->batch, 8);
- OUT_BATCH(state->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
+ BEGIN_BATCH(gpgpu->batch, 8);
+ OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (8-2));
gen6_vfe_state_inline_t* vfe = (gen6_vfe_state_inline_t*)
- intel_batchbuffer_alloc_space(state->batch,0);
+ intel_batchbuffer_alloc_space(gpgpu->batch,0);
memset(vfe, 0, sizeof(struct gen6_vfe_state_inline));
vfe->vfe1.gpgpu_mode = 1;
vfe->vfe1.bypass_gateway_ctl = 1;
vfe->vfe1.reset_gateway_timer = 1;
- vfe->vfe1.max_threads = state->max_threads - 1;
+ vfe->vfe1.max_threads = gpgpu->max_threads - 1;
vfe->vfe1.urb_entries = 64;
vfe->vfe3.curbe_size = 480;
vfe->vfe4.scoreboard_mask = 0;
- //vfe->vfe3.urb_size = 13;
- //vfe->vfe4.scoreboard_mask = (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) ? 0 : 0x80000000;
- intel_batchbuffer_alloc_space(state->batch, sizeof(gen6_vfe_state_inline_t));
- ADVANCE_BATCH(state->batch);
+ intel_batchbuffer_alloc_space(gpgpu->batch, sizeof(gen6_vfe_state_inline_t));
+ ADVANCE_BATCH(gpgpu->batch);
}
static void
-intel_gpgpu_load_constant_buffer(intel_gpgpu_t *state)
+intel_gpgpu_load_constant_buffer(intel_gpgpu_t *gpgpu)
{
- BEGIN_BATCH(state->batch, 4);
- OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
- OUT_BATCH(state->batch, 0); /* mbz */
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
// XXX
#if 1
- OUT_BATCH(state->batch,
- state->urb.size_cs_entry*
- state->urb.num_cs_entries*32);
+ OUT_BATCH(gpgpu->batch,
+ gpgpu->urb.size_cs_entry*
+ gpgpu->urb.num_cs_entries*32);
#else
- OUT_BATCH(state->batch, 5120);
+ OUT_BATCH(gpgpu->batch, 5120);
#endif
- OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
- ADVANCE_BATCH(state->batch);
+ OUT_RELOC(gpgpu->batch, gpgpu->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ ADVANCE_BATCH(gpgpu->batch);
}
static void
-intel_gpgpu_load_idrt(intel_gpgpu_t *state)
-{
- BEGIN_BATCH(state->batch, 4);
- OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
- OUT_BATCH(state->batch, 0); /* mbz */
- OUT_BATCH(state->batch, 1 << 5);
- OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
- ADVANCE_BATCH(state->batch);
+intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+{
+ BEGIN_BATCH(gpgpu->batch, 4);
+ OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
+ OUT_BATCH(gpgpu->batch, 0); /* mbz */
+ OUT_BATCH(gpgpu->batch, 1 << 5);
+ OUT_RELOC(gpgpu->batch, gpgpu->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
+ ADVANCE_BATCH(gpgpu->batch);
}
static const uint32_t gpgpu_l3_config_reg1[] =
@@ -303,11 +304,11 @@ enum GFX3DSTATE_PIPELINED_SUBOPCODE
};
static void
-intel_gpgpu_pipe_control(intel_gpgpu_t *state)
+intel_gpgpu_pipe_control(intel_gpgpu_t *gpgpu)
{
- BEGIN_BATCH(state->batch, SIZEOF32(gen6_pipe_control_t));
+ BEGIN_BATCH(gpgpu->batch, SIZEOF32(gen6_pipe_control_t));
gen6_pipe_control_t* pc = (gen6_pipe_control_t*)
- intel_batchbuffer_alloc_space(state->batch, 0);
+ intel_batchbuffer_alloc_space(gpgpu->batch, 0);
memset(pc, 0, sizeof(*pc));
pc->dw0.length = SIZEOF32(gen6_pipe_control_t) - 2;
pc->dw0.instruction_subopcode = GFX3DSUBOP_3DCONTROL;
@@ -317,167 +318,169 @@ intel_gpgpu_pipe_control(intel_gpgpu_t *state)
pc->dw1.render_target_cache_flush_enable = 1;
pc->dw1.cs_stall = 1;
pc->dw1.dc_flush_enable = 1;
- ADVANCE_BATCH(state->batch);
+ ADVANCE_BATCH(gpgpu->batch);
}
static void
-intel_gpgpu_set_L3(intel_gpgpu_t *state, uint32_t use_barrier)
+intel_gpgpu_set_L3(intel_gpgpu_t *gpgpu, uint32_t use_barrier)
{
- BEGIN_BATCH(state->batch, 6);
- OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
- OUT_BATCH(state->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
+ BEGIN_BATCH(gpgpu->batch, 6);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, L3_CNTL_REG2_ADDRESS_OFFSET);
if (use_barrier)
- OUT_BATCH(state->batch, gpgpu_l3_config_reg1[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[8]);
else
- OUT_BATCH(state->batch, gpgpu_l3_config_reg1[4]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg1[4]);
- OUT_BATCH(state->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
- OUT_BATCH(state->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
+ OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+ OUT_BATCH(gpgpu->batch, L3_CNTL_REG3_ADDRESS_OFFSET);
if (use_barrier)
- OUT_BATCH(state->batch, gpgpu_l3_config_reg2[8]);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[8]);
else
- OUT_BATCH(state->batch, gpgpu_l3_config_reg2[4]);
- ADVANCE_BATCH(state->batch);
+ OUT_BATCH(gpgpu->batch, gpgpu_l3_config_reg2[4]);
+ ADVANCE_BATCH(gpgpu->batch);
- intel_gpgpu_pipe_control(state);
+ intel_gpgpu_pipe_control(gpgpu);
}
static void
-intel_gpgpu_batch_start(intel_gpgpu_t *state)
-{
- intel_batchbuffer_start_atomic(state->batch, 256);
- intel_gpgpu_pipe_control(state);
- if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
- intel_gpgpu_set_L3(state, state->ker->use_barrier);
- intel_gpgpu_select_pipeline(state);
- intel_gpgpu_set_base_address(state);
- intel_gpgpu_load_vfe_state(state);
- intel_gpgpu_load_constant_buffer(state);
- intel_gpgpu_load_idrt(state);
-
- if (state->perf_b.bo) {
- BEGIN_BATCH(state->batch, 3);
- OUT_BATCH(state->batch,
+intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
+{
+ intel_batchbuffer_start_atomic(gpgpu->batch, 256);
+ intel_gpgpu_pipe_control(gpgpu);
+ intel_gpgpu_set_L3(gpgpu, gpgpu->ker->use_barrier);
+ intel_gpgpu_select_pipeline(gpgpu);
+ intel_gpgpu_set_base_address(gpgpu);
+ intel_gpgpu_load_vfe_state(gpgpu);
+ intel_gpgpu_load_constant_buffer(gpgpu);
+ intel_gpgpu_load_idrt(gpgpu);
+
+ if (gpgpu->perf_b.bo) {
+ BEGIN_BATCH(gpgpu->batch, 3);
+ OUT_BATCH(gpgpu->batch,
(0x28 << 23) | /* MI_REPORT_PERF_COUNT */
(3 - 2)); /* length-2 */
- OUT_RELOC(state->batch, state->perf_b.bo,
+ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
0 | /* Offset for the start "counters" */
1); /* Use GTT and not PGTT */
- OUT_BATCH(state->batch, 0);
- ADVANCE_BATCH(state->batch);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
}
}
static void
-intel_gpgpu_batch_end(intel_gpgpu_t *state, int32_t flush_mode)
+intel_gpgpu_batch_end(intel_gpgpu_t *gpgpu, int32_t flush_mode)
{
/* Insert the performance counter command */
- if (state->perf_b.bo) {
- BEGIN_BATCH(state->batch, 3);
- OUT_BATCH(state->batch,
+ if (gpgpu->perf_b.bo) {
+ BEGIN_BATCH(gpgpu->batch, 3);
+ OUT_BATCH(gpgpu->batch,
(0x28 << 23) | /* MI_REPORT_PERF_COUNT */
(3 - 2)); /* length-2 */
- OUT_RELOC(state->batch, state->perf_b.bo,
+ OUT_RELOC(gpgpu->batch, gpgpu->perf_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
512 | /* Offset for the end "counters" */
1); /* Use GTT and not PGTT */
- OUT_BATCH(state->batch, 0);
- ADVANCE_BATCH(state->batch);
+ OUT_BATCH(gpgpu->batch, 0);
+ ADVANCE_BATCH(gpgpu->batch);
}
- if(flush_mode) intel_gpgpu_pipe_control(state);
- intel_batchbuffer_end_atomic(state->batch);
+ if(flush_mode) intel_gpgpu_pipe_control(gpgpu);
+ intel_batchbuffer_end_atomic(gpgpu->batch);
}
static void
-intel_gpgpu_batch_reset(intel_gpgpu_t *state, size_t sz)
+intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
{
- intel_batchbuffer_reset(state->batch, sz);
+ intel_batchbuffer_reset(gpgpu->batch, sz);
}
static void
-intel_gpgpu_flush(intel_gpgpu_t *state)
+intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
{
- intel_batchbuffer_flush(state->batch);
+ intel_batchbuffer_flush(gpgpu->batch);
}
static void
-intel_gpgpu_state_init(intel_gpgpu_t *state,
+intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
uint32_t max_threads,
uint32_t size_cs_entry)
{
- dri_bo *bo;
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ drm_intel_bo *bo;
/* Binded buffers */
- state->binded_n = 0;
+ gpgpu->binded_n = 0;
/* URB */
- state->urb.num_cs_entries = 64;
- state->urb.size_cs_entry = size_cs_entry;
- state->max_threads = max_threads;
-
- /* constant buffer */
- if(state->curbe_b.bo)
- dri_bo_unreference(state->curbe_b.bo);
- uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * 64;
+ gpgpu->urb.num_cs_entries = 64;
+ gpgpu->urb.size_cs_entry = size_cs_entry;
+ gpgpu->max_threads = max_threads;
+
+ /* Constant buffer */
+ if(gpgpu->curbe_b.bo)
+ dri_bo_unreference(gpgpu->curbe_b.bo);
+ uint32_t size_cb = gpgpu->urb.num_cs_entries * gpgpu->urb.size_cs_entry * 64;
size_cb = ALIGN(size_cb, 4096);
- bo = dri_bo_alloc(state->drv->bufmgr,
- "CONSTANT_BUFFER",
- size_cb,
- 64);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size_cb, 64);
assert(bo);
- state->curbe_b.bo = bo;
+ gpgpu->curbe_b.bo = bo;
/* surface state */
- if(state->surface_heap_b.bo)
- dri_bo_unreference(state->surface_heap_b.bo);
- bo = dri_bo_alloc(state->drv->bufmgr,
+ if(gpgpu->surface_heap_b.bo)
+ dri_bo_unreference(gpgpu->surface_heap_b.bo);
+ bo = dri_bo_alloc(bufmgr,
"SURFACE_HEAP",
sizeof(surface_heap_t),
32);
assert(bo);
dri_bo_map(bo, 1);
memset(bo->virtual, 0, sizeof(surface_heap_t));
- state->surface_heap_b.bo = bo;
+ gpgpu->surface_heap_b.bo = bo;
/* Interface descriptor remap table */
- if(state->idrt_b.bo)
- dri_bo_unreference(state->idrt_b.bo);
- bo = dri_bo_alloc(state->drv->bufmgr,
+ if(gpgpu->idrt_b.bo)
+ dri_bo_unreference(gpgpu->idrt_b.bo);
+ bo = dri_bo_alloc(bufmgr,
"IDRT",
MAX_IF_DESC * sizeof(struct gen6_interface_descriptor),
32);
assert(bo);
- state->idrt_b.bo = bo;
+ gpgpu->idrt_b.bo = bo;
/* vfe state */
- if(state->vfe_state_b.bo)
- dri_bo_unreference(state->vfe_state_b.bo);
- state->vfe_state_b.bo = NULL;
+ if(gpgpu->vfe_state_b.bo)
+ dri_bo_unreference(gpgpu->vfe_state_b.bo);
+ gpgpu->vfe_state_b.bo = NULL;
/* sampler state */
- if (state->sampler_state_b.bo)
- dri_bo_unreference(state->sampler_state_b.bo);
- bo = dri_bo_alloc(state->drv->bufmgr,
- "sample states",
+ if (gpgpu->sampler_state_b.bo)
+ dri_bo_unreference(gpgpu->sampler_state_b.bo);
+ bo = dri_bo_alloc(gpgpu->drv->bufmgr,
+ "SAMPLER_STATE",
GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
32);
assert(bo);
dri_bo_map(bo, 1);
memset(bo->virtual, 0, sizeof(gen6_sampler_state_t) * GEN_MAX_SAMPLERS);
- state->sampler_state_b.bo = bo;
+ gpgpu->sampler_state_b.bo = bo;
+
+ /* stack */
+ if (gpgpu->stack_b.bo)
+ dri_bo_unreference(gpgpu->stack_b.bo);
+ gpgpu->stack_b.bo = NULL;
}
static void
-intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_bo)
+intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo)
{
- surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
heap->binding_table[index] = offsetof(surface_heap_t, surface) +
index * sizeof(gen7_surface_state_t);
- dri_bo_emit_reloc(state->surface_heap_b.bo,
+ dri_bo_emit_reloc(gpgpu->surface_heap_b.bo,
I915_GEM_DOMAIN_RENDER,
I915_GEM_DOMAIN_RENDER,
0,
@@ -491,9 +494,9 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_
* surface but Fulsim complains
*/
static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *state)
+intel_gpgpu_map_address_space(intel_gpgpu_t *gpgpu)
{
- surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
memset(ss0, 0, sizeof(gen7_surface_state_t));
@@ -510,7 +513,7 @@ intel_gpgpu_map_address_space(intel_gpgpu_t *state)
}
static void
-intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
+intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *gpgpu,
int32_t index,
dri_bo* obj_bo,
uint32_t format,
@@ -519,7 +522,7 @@ intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
int32_t pitch,
int32_t tiling)
{
- surface_heap_t *heap = state->surface_heap_b.bo->virtual;
+ surface_heap_t *heap = gpgpu->surface_heap_b.bo->virtual;
gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
memset(ss, 0, sizeof(*ss));
ss->ss0.surface_type = I965_SURFACE_2D;
@@ -536,7 +539,7 @@ intel_gpgpu_bind_image2D_gen7(intel_gpgpu_t *state,
ss->ss0.tiled_surface = 1;
ss->ss0.tile_walk = I965_TILEWALK_YMAJOR;
}
- intel_gpgpu_set_buf_reloc_gen7(state, index, obj_bo);
+ intel_gpgpu_set_buf_reloc_gen7(gpgpu, index, obj_bo);
}
static void
@@ -546,18 +549,18 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset, u
gpgpu->binded_buf[gpgpu->binded_n] = buf;
gpgpu->binded_offset[gpgpu->binded_n] = offset;
gpgpu->binded_n++;
-#if 0
- const uint32_t size = obj_bo->size;
- assert(index < GEN_MAX_SURFACES);
- if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
- intel_gpgpu_bind_buf_gen7(state, index, obj_bo, size, cchint);
- else
- NOT_IMPLEMENTED;
-#endif
}
static void
-intel_gpgpu_bind_image2D(intel_gpgpu_t *state,
+intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+{
+ drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
+ gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
+ intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, cchint);
+}
+
+static void
+intel_gpgpu_bind_image2D(intel_gpgpu_t *gpgpu,
int32_t index,
cl_buffer *obj_bo,
uint32_t format,
@@ -567,19 +570,16 @@ intel_gpgpu_bind_image2D(intel_gpgpu_t *state,
cl_gpgpu_tiling tiling)
{
assert(index < GEN_MAX_SURFACES);
- if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75)
- intel_gpgpu_bind_image2D_gen7(state, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling);
- else
- NOT_IMPLEMENTED;
+ intel_gpgpu_bind_image2D_gen7(gpgpu, index, (drm_intel_bo*) obj_bo, format, w, h, pitch, tiling);
}
static void
-intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
+intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
{
gen6_interface_descriptor_t *desc;
drm_intel_bo *bo = NULL, *ker_bo = NULL;
- bo = state->idrt_b.bo;
+ bo = gpgpu->idrt_b.bo;
dri_bo_map(bo, 1);
assert(bo->virtual);
desc = (gen6_interface_descriptor_t*) bo->virtual;
@@ -588,14 +588,14 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
ker_bo = (drm_intel_bo *) kernel->bo;
desc->desc0.kernel_start_pointer = ker_bo->offset >> 6; /* reloc */
desc->desc1.single_program_flow = 1;
- desc->desc2.sampler_state_pointer = state->sampler_state_b.bo->offset >> 5;
+ desc->desc2.sampler_state_pointer = gpgpu->sampler_state_b.bo->offset >> 5;
desc->desc3.binding_table_entry_count = 0; /* no prefetch */
desc->desc3.binding_table_pointer = 0;
desc->desc4.curbe_read_len = kernel->cst_sz / 32;
desc->desc4.curbe_read_offset = 0;
/* Barriers / SLM are automatically handled on Gen7+ */
- if (state->drv->gen_ver == 7 || state->drv->gen_ver == 75) {
+ if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) {
size_t slm_sz = kernel->slm_sz;
desc->desc5.group_threads_num = kernel->use_barrier ? kernel->thread_n : 0;
desc->desc5.barrier_enable = kernel->use_barrier;
@@ -627,7 +627,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
I915_GEM_DOMAIN_INSTRUCTION, 0,
0,
offsetof(gen6_interface_descriptor_t, desc2),
- state->sampler_state_b.bo);
+ gpgpu->sampler_state_b.bo);
dri_bo_unmap(bo);
}
@@ -730,6 +730,7 @@ intel_set_gpgpu_callbacks(void)
cl_gpgpu_delete = (cl_gpgpu_delete_cb *) intel_gpgpu_delete;
cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) intel_gpgpu_bind_image2D;
cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) intel_gpgpu_bind_buf;
+ cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) intel_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) intel_gpgpu_upload_constants;
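On the Intel side the stack is allocated lazily: intel_gpgpu_state_init only drops any previous buffer and leaves stack_b.bo NULL, so kernels without private arrays pay nothing, and intel_gpgpu_delete unreferences it along with the other bos. Note that intel_gpgpu_set_stack itself does not guard against being called twice between state inits; a defensive variant built from the same libdrm calls the patch already uses (illustrative, not part of the patch) would be:

    static void
    intel_gpgpu_set_stack_defensive(intel_gpgpu_t *gpgpu, uint32_t offset,
                                    uint32_t size, uint32_t cchint)
    {
        drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
        if (gpgpu->stack_b.bo)      /* drop a stale stack before realloc */
            drm_intel_bo_unreference(gpgpu->stack_b.bo);
        gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
        intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, cchint);
    }
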
diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c
index 0d18ccd1..49a9e893 100644
--- a/src/sim/sim_driver.c
+++ b/src/sim/sim_driver.c
@@ -196,6 +196,7 @@ struct _sim_gpgpu
sim_driver driver; /* the driver the gpgpu states belongs to */
sim_kernel_cb *kernel; /* call it for each HW thread */
sim_buffer binded_buf[max_buf_n]; /* all buffers binded for the call */
+ sim_buffer stack; /* used only when stack is required */
char *fake_memory; /* fake memory to emulate flat address space in any mode (32 / 64 bits) */
char *curbe; /* constant buffer */
uint32_t binded_offset[max_buf_n]; /* their offsets in the constant buffer */
@@ -210,6 +211,7 @@ typedef struct _sim_gpgpu *sim_gpgpu;
static void sim_gpgpu_delete(sim_gpgpu gpgpu) {
if (gpgpu->fake_memory) cl_free(gpgpu->fake_memory);
if (gpgpu->curbe) cl_free(gpgpu->curbe);
+ if (gpgpu->stack) sim_buffer_delete(gpgpu->stack);
cl_free(gpgpu);
}
@@ -217,6 +219,7 @@ static sim_gpgpu sim_gpgpu_new(sim_driver driver)
{
sim_gpgpu gpgpu = NULL;
TRY_ALLOC_NO_ERR(gpgpu, cl_calloc(1, sizeof(struct _sim_gpgpu)));
+ gpgpu->driver = driver;
exit:
return gpgpu;
error:
@@ -264,7 +267,12 @@ static void
sim_gpgpu_state_init(sim_gpgpu gpgpu, uint32_t max_threads, uint32_t size_cs_entry)
{
assert(gpgpu);
- memset(gpgpu, 0, sizeof(*gpgpu));
+ if (gpgpu->stack)
+ sim_buffer_delete(gpgpu->stack);
+ gpgpu->fake_memory = NULL;
+ gpgpu->curbe = NULL;
+ gpgpu->binded_n = 0;
+ gpgpu->thread_n = 0;
gpgpu->curbe_sz = size_cs_entry * 32;
gpgpu->max_threads = max_threads;
}
@@ -306,6 +314,14 @@ sim_gpgpu_bind_buf(sim_gpgpu gpgpu, sim_buffer buf, uint32_t offset, uint32_t cc
}
static void
+sim_gpgpu_set_stack(sim_gpgpu gpgpu, uint32_t offset, uint32_t size, uint32_t cchint)
+{
+ sim_bufmgr bufmgr = gpgpu->driver->bufmgr;
+ gpgpu->stack = sim_buffer_alloc(bufmgr, "STACK", size, 64);
+ sim_gpgpu_bind_buf(gpgpu, gpgpu->stack, offset, cchint);
+}
+
+static void
sim_gpgpu_walker(sim_gpgpu gpgpu,
uint32_t simd_sz,
uint32_t thread_n,
@@ -364,6 +380,7 @@ sim_setup_callbacks(void)
cl_gpgpu_delete = (cl_gpgpu_delete_cb *) sim_gpgpu_delete;
cl_gpgpu_bind_image2D = (cl_gpgpu_bind_image2D_cb *) sim_gpgpu_bind_image2D;
cl_gpgpu_bind_buf = (cl_gpgpu_bind_buf_cb *) sim_gpgpu_bind_buf;
+ cl_gpgpu_set_stack = (cl_gpgpu_set_stack_cb *) sim_gpgpu_set_stack;
cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) sim_gpgpu_state_init;
cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) sim_gpgpu_set_perf_counters;
cl_gpgpu_upload_constants = (cl_gpgpu_upload_constants_cb *) sim_gpgpu_upload_constants;
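The sim_gpgpu_state_init change is more than cosmetic: the old memset zeroed the whole struct, which would also wipe the driver back-pointer that sim_gpgpu_set_stack now needs to reach the buffer manager. Hence sim_gpgpu_new records the driver and state_init resets only the transient fields. A toy reduction of the hazard (the types are illustrative, not the real sim driver):

    #include <assert.h>
    #include <string.h>

    struct toy_driver { int bufmgr; };
    struct toy_gpgpu  { struct toy_driver *driver; int binded_n; int thread_n; };

    int main(void)
    {
        struct toy_driver drv = { 42 };
        struct toy_gpgpu g = { &drv, 3, 5 };
        g.binded_n = g.thread_n = 0;  /* field-wise reset keeps g.driver */
        assert(g.driver == &drv);
        memset(&g, 0, sizeof g);      /* the old behaviour...            */
        assert(g.driver == NULL);     /* ...loses the back-pointer       */
        return 0;
    }
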
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 507ef0da..0fd5e823 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -28,7 +28,8 @@ ADD_LIBRARY(utests SHARED
compiler_lower_return0.cpp
compiler_lower_return1.cpp
compiler_lower_return2.cpp
- compiler_array.cpp)
+ compiler_array.cpp
+ compiler_array0.cpp)
TARGET_LINK_LIBRARIES(utests cl m)
ADD_EXECUTABLE(run utest_run.cpp)
diff --git a/utests/compiler_array0.cpp b/utests/compiler_array0.cpp
new file mode 100644
index 00000000..9e3535dc
--- /dev/null
+++ b/utests/compiler_array0.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+ int i;
+ int final[16];
+ for (i = 0; i < 16; ++i) {
+ int array[16], j;
+ for (j = 0; j < 16; ++j)
+ array[j] = global_id;
+ for (j = 0; j < src[0]; ++j)
+ array[j] = 1+src[j];
+ final[i] = array[i];
+ }
+ dst[global_id] = final[global_id];
+}
+
+void compiler_array0(void)
+{
+ const size_t n = 16;
+ int cpu_dst[16], cpu_src[16];
+
+ // Setup kernel and buffers
+ OCL_CREATE_KERNEL("compiler_array0");
+ OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+ OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+ OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+ globals[0] = 16;
+ locals[0] = 16;
+
+ // Run random tests
+ for (uint32_t pass = 0; pass < 8; ++pass) {
+ OCL_MAP_BUFFER(0);
+ for (int32_t i = 0; i < (int32_t) n; ++i)
+ cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16;
+ OCL_UNMAP_BUFFER(0);
+
+ // Run the kernel on GPU
+ OCL_NDRANGE(1);
+
+ // Run on CPU
+ for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+ // Compare
+ OCL_MAP_BUFFER(1);
+ for (int32_t i = 0; i < 11; ++i)
+ OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+ OCL_UNMAP_BUFFER(1);
+ }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_array0);
+
+