summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--kernels/dct_kernels.cl5
-rw-r--r--kernels/matmul_kernels.cl4
-rw-r--r--src/cl_api.c1
-rw-r--r--src/cl_command_queue.c15
-rw-r--r--src/cl_command_queue_gen6.c4
-rw-r--r--src/cl_command_queue_gen7.c12
-rw-r--r--src/intel/intel_gpgpu.c12
7 files changed, 29 insertions, 24 deletions
diff --git a/kernels/dct_kernels.cl b/kernels/dct_kernels.cl
index ed1e0f12..76ebe208 100644
--- a/kernels/dct_kernels.cl
+++ b/kernels/dct_kernels.cl
@@ -59,11 +59,10 @@ void DCT(__global float * output,
{
uint index1 = (inverse)? i*blockWidth + k : k * blockWidth + i;
uint index2 = getIdx(groupIdx, groupIdy, j, k, blockWidth, width);
-
acc += dct8x8[index1] * input[index2];
}
- inter[j*blockWidth + i] = acc;
+ inter[j*blockWidth + i] = acc;
/*
* Make sure all the values of inter that belong to a block
* are calculated before proceeding further
@@ -81,6 +80,6 @@ void DCT(__global float * output,
acc += inter[index1] * dct8x8[index2];
}
- output[idx] = acc;
+ output[idx] = acc;
}
diff --git a/kernels/matmul_kernels.cl b/kernels/matmul_kernels.cl
index b414e299..d2710304 100644
--- a/kernels/matmul_kernels.cl
+++ b/kernels/matmul_kernels.cl
@@ -9,7 +9,7 @@
__kernel void mmmKernel(__global float4 *matrixA,
__global float4 *matrixB,
__global float4* matrixC,
- uint widthA, uint widthB)
+ uint widthA, uint widthB)
{
int2 pos = (int2)(get_global_id(0), get_global_id(1));
@@ -224,4 +224,4 @@ __kernel void mmmKernel_local2(__global float4 *matrixA,
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)] = sum1;
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)] = sum2;
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)] = sum3;
-} \ No newline at end of file
+}
diff --git a/src/cl_api.c b/src/cl_api.c
index dee67855..0b37ba82 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -1124,6 +1124,5 @@ clFulsimSetOutputBuffer(cl_command_queue queue, cl_mem mem)
err = cl_command_queue_set_fulsim_buffer(queue, mem);
error:
return err;
-
}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index b4e27357..76d8c72c 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -82,6 +82,10 @@ cl_command_queue_delete(cl_command_queue queue)
if (queue->next == NULL && queue->prev == NULL)
queue->ctx->queues = NULL;
pthread_mutex_unlock(&queue->ctx->queue_lock);
+ if (queue->fulsim_out != NULL) {
+ cl_mem_delete(queue->fulsim_out);
+ queue->fulsim_out = NULL;
+ }
cl_mem_delete(queue->perf);
cl_context_delete(queue->ctx);
intel_gpgpu_delete(queue->gpgpu);
@@ -292,21 +296,18 @@ LOCAL cl_int
cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem)
{
#if USE_FULSIM
- cl_context ctx = queue->ctx;
- drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
- drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
-#endif /* USE_FULSIM */
-
- queue->fulsim_out = mem;
if (queue->fulsim_out != NULL) {
cl_mem_delete(queue->fulsim_out);
queue->fulsim_out = NULL;
}
if (mem != NULL) {
+ cl_context ctx = queue->ctx;
+ drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
+ drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
cl_mem_add_ref(mem);
queue->fulsim_out = mem;
}
-
+#endif /* USE_FULSIM */
return CL_SUCCESS;
}
diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c
index 763b7579..4ee8b024 100644
--- a/src/cl_command_queue_gen6.c
+++ b/src/cl_command_queue_gen6.c
@@ -186,9 +186,9 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue,
* buffers and reuse them
*/
curr = 0;
- for (i = 0; i < local_wk_sz[0]; ++i)
+ for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
- for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+ for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
((uint16_t*) ids[0])[curr] = i;
((uint16_t*) ids[1])[curr] = j;
((uint16_t*) ids[2])[curr] = k;
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 9a65d987..f378290c 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -55,12 +55,12 @@ cl_set_local_ids(char *data,
TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16));
/* Compute the IDs */
- for (i = 0; i < local_wk_sz[0]; ++i)
+ for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
- for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
- ((uint16_t*) ids[0])[curr] = i;
- ((uint16_t*) ids[1])[curr] = j;
- ((uint16_t*) ids[2])[curr] = k;
+ for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
+ ids[0][curr] = i;
+ ids[1][curr] = j;
+ ids[2][curr] = k;
}
/* Copy them to the constant buffer */
@@ -70,7 +70,7 @@ cl_set_local_ids(char *data,
uint16_t *ids0 = (uint16_t *) (data + 0);
uint16_t *ids1 = (uint16_t *) (data + 32);
uint16_t *ids2 = (uint16_t *) (data + 64);
- for (j = 0; j < 16; ++j, ++curr) {/* SIMD16 */
+ for (j = 0; j < 16; ++j, ++curr) {
ids0[j] = ids[0][curr];
ids1[j] = ids[1][curr];
ids2[j] = ids[2][curr];
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 002992e5..6929d10f 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -1099,16 +1099,22 @@ gpgpu_walker(intel_gpgpu_t *state,
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
+ const uint32_t global_wk_dim[3] = {
+ global_wk_sz[0] / local_wk_sz[0],
+ global_wk_sz[1] / local_wk_sz[1],
+ global_wk_sz[2] / local_wk_sz[2]
+ };
+
BEGIN_BATCH(state->batch, 11);
OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
OUT_BATCH(state->batch, 0); /* kernel index == 0 */
OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
OUT_BATCH(state->batch, global_wk_off[0]);
- OUT_BATCH(state->batch, global_wk_sz[0]-1);
+ OUT_BATCH(state->batch, global_wk_dim[0]);
OUT_BATCH(state->batch, global_wk_off[1]);
- OUT_BATCH(state->batch, global_wk_sz[1]-1);
+ OUT_BATCH(state->batch, global_wk_dim[1]);
OUT_BATCH(state->batch, global_wk_off[2]);
- OUT_BATCH(state->batch, global_wk_sz[2]-1);
+ OUT_BATCH(state->batch, global_wk_dim[2]);
OUT_BATCH(state->batch, ~0x0);
OUT_BATCH(state->batch, ~0x0);
ADVANCE_BATCH(state->batch);