diff options
-rw-r--r-- | src/cl_command_queue.c | 47 | ||||
-rw-r--r-- | src/cl_driver.h | 2 | ||||
-rw-r--r-- | src/intel/intel_driver.c | 2 | ||||
-rw-r--r-- | src/intel/intel_gpgpu.c | 120 | ||||
-rw-r--r-- | src/sim/sim_driver.c | 4 |
5 files changed, 85 insertions, 90 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index 82d483af..56098ee8 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -153,27 +153,25 @@ static cl_int cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k) { cl_int err = CL_SUCCESS; -#if 0 cl_mem mem = NULL; int i; size_t j; /* Bind user defined surface */ - for (i = 0; i < k->arg_info_n; ++i) { + for (i = 0; i < k->arg_n; ++i) { size_t chunk_n, chunk_remainder; - if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER) + if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR) continue; - mem = (cl_mem) k->args[k->arg_info[i].arg_index]; + mem = (cl_mem) k->args[i].mem; CHECK_MEM(mem); - chunk_n = mem->bo->size / chunk_sz; - chunk_remainder = mem->bo->size % chunk_sz; + chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz; + chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz; for (j = 0; j < chunk_n; ++j) aub_exec_dump_raw_file(mem->bo, j * chunk_sz, chunk_sz); if (chunk_remainder) aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder); } error: -#endif return err; } @@ -196,7 +194,6 @@ struct bmphdr { /* raw b, g, r data here, dword aligned per scan line */ }; -#if 0 static int* cl_read_bmp(const char *filename, int *width, int *height) { @@ -213,24 +210,6 @@ cl_read_bmp(const char *filename, int *width, int *height) n = fread(&hdr, 1, sizeof(hdr), fp); assert(n == sizeof(hdr)); -#if 0 - /* Dump stuff out */ - printf(" filesize = %d\n", hdr.filesize); /* total file size incl header */ - printf(" as0 = %d\n", hdr.as0); - printf(" as1 = %d\n", hdr.as1); - printf(" bmpoffset = %d\n", hdr.bmpoffset); /* ofset of bmp data */ - printf("headerbytes = %d\n", hdr.headerbytes); /* bytes in header from this point (40 actually) */ - printf(" width = %d\n", hdr.width); - printf(" height = %d\n", hdr.height); - printf(" nplanes = %d\n", hdr.nplanes); /* no of color planes */ - printf(" bpp = %d\n", hdr.bpp); /* bits/pixel */ - printf("compression = %d\n", hdr.compression); /* BI_RGB = 0 = no compression */ - printf(" sizeraw = %d\n", hdr.sizeraw); /* size of raw bmp file, excluding header, incl padding */ - printf(" hres = %d\n", hdr.hres); /* horz resolutions pixels/meter */ - printf(" vres = %d\n", hdr.vres); - printf(" npalcolors = %d\n", hdr.npalcolors); /* No of colors in palette */ - printf(" nimportant = %d\n", hdr.nimportant); /* No of important colors */ -#endif assert(hdr.width > 0 && hdr.height > 0 && hdr.nplanes == 1 @@ -278,26 +257,24 @@ cl_read_dump(const char *name, size_t *size) *size = sz; return dump; } -#endif static cl_int cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k) { cl_int err = CL_SUCCESS; -#if 0 cl_mem mem = NULL; char *from = NULL, *to = NULL; size_t size, j, chunk_n, chunk_remainder; int i, curr = 0; /* Bind user defined surface */ - for (i = 0; i < k->arg_info_n; ++i) { - if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER) + for (i = 0; i < k->arg_n; ++i) { + if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR) continue; - mem = (cl_mem) k->args[k->arg_info[i].arg_index]; + mem = (cl_mem) k->args[i].mem; CHECK_MEM(mem); assert(mem->bo); - chunk_n = mem->bo->size / chunk_sz; - chunk_remainder = mem->bo->size % chunk_sz; + chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz; + chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz; to = cl_mem_map(mem); for (j = 0; j < chunk_n; ++j) { char name[256]; @@ -328,11 +305,9 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k) cl_mem_unmap(mem); } error: -#endif return err; - } -#endif /* USE_FULSIM */ +#endif extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *); diff --git a/src/cl_driver.h b/src/cl_driver.h index 076f3983..75df8dd9 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -195,7 +195,7 @@ typedef void* (cl_buffer_get_virtual_cb)(cl_buffer); extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual; /* Get the size of the buffer */ -typedef void* (cl_buffer_get_size_cb)(cl_buffer); +typedef size_t (cl_buffer_get_size_cb)(cl_buffer); extern cl_buffer_get_size_cb *cl_buffer_get_size; /* Pin a buffer */ diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index 78d6cb7f..d487fc06 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -340,7 +340,7 @@ intel_driver_get_ver(struct intel_driver *drv) return drv->gen_ver; } -static uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; } +static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; } static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; } LOCAL void diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 4de28506..c2a3745a 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -486,23 +486,29 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_ obj_bo); } +/* Use two one GB surface to map the 2GB address space */ static void -intel_gpgpu_map_address_space(intel_gpgpu_t *state, - int32_t index, - uint32_t size, - uint32_t cchint) +intel_gpgpu_map_address_space(intel_gpgpu_t *state) { surface_heap_t *heap = state->surface_heap_b.bo->virtual; - gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index]; - const uint32_t size_ss = size - 1; - memset(ss, 0, sizeof(*ss)); - ss->ss0.surface_type = I965_SURFACE_BUFFER; - ss->ss0.surface_format = I965_SURFACEFORMAT_RAW; - ss->ss1.base_addr = 0; - ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */ - ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */ - ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */ - ss->ss5.cache_control = cc_llc_l3; + gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0]; + gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1]; + const uint32_t sz = (1<<30) - 1; + //const uint32_t sz = 1024*1024-1; + memset(ss0, 0, sizeof(gen7_surface_state_t)); + memset(ss1, 0, sizeof(gen7_surface_state_t)); + ss1->ss0.surface_type = ss0->ss0.surface_type = I965_SURFACE_BUFFER; + ss1->ss0.surface_format = ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW; + ss0->ss1.base_addr = 0; + ss1->ss1.base_addr = 1<<30; + ss1->ss2.width = ss0->ss2.width = sz & 127; /* bits 6:0 of sz */ + ss1->ss2.height = ss0->ss2.height = (sz >> 7) & 16383; /* bits 20:7 of sz */ + ss1->ss3.depth = ss0->ss3.depth = (sz >> 21) & 1023; /* bits 30:21 of sz */ + ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3; + heap->binding_table[0] = offsetof(surface_heap_t, surface) + + 0 * sizeof(gen7_surface_state_t); + heap->binding_table[1] = offsetof(surface_heap_t, surface) + + 1 * sizeof(gen7_surface_state_t); } static void @@ -628,46 +634,60 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel) } static void -intel_gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size) +intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size) { unsigned char *constant_buffer = NULL; + cl_gpgpu_kernel *k = gpgpu->ker; + uint32_t i, j; - dri_bo_map(state->curbe_b.bo, 1); - assert(state->curbe_b.bo->virtual); - constant_buffer = (unsigned char *) state->curbe_b.bo->virtual; + /* Upload the data first */ + dri_bo_map(gpgpu->curbe_b.bo, 1); + assert(gpgpu->curbe_b.bo->virtual); + constant_buffer = (unsigned char *) gpgpu->curbe_b.bo->virtual; memcpy(constant_buffer, data, size); - dri_bo_unmap(state->curbe_b.bo); + dri_bo_unmap(gpgpu->curbe_b.bo); + + /* Now put all the relocations for our flat address space */ + for (i = 0; i < k->thread_n; ++i) + for (j = 0; j < gpgpu->binded_n; ++j) + drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo, + gpgpu->binded_offset[j]+i*k->cst_sz, + gpgpu->binded_buf[j], + 0, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER); } static void -intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n) +intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n) { if (n) { const size_t sz = n * sizeof(gen6_sampler_state_t); - memcpy(state->sampler_state_b.bo->virtual, data, sz); + memcpy(gpgpu->sampler_state_b.bo->virtual, data, sz); } } static void -intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel) +intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { - state->ker = kernel; - intel_gpgpu_build_idrt(state, kernel); - dri_bo_unmap(state->surface_heap_b.bo); - dri_bo_unmap(state->sampler_state_b.bo); + gpgpu->ker = kernel; + intel_gpgpu_build_idrt(gpgpu, kernel); + intel_gpgpu_map_address_space(gpgpu); + dri_bo_unmap(gpgpu->surface_heap_b.bo); + dri_bo_unmap(gpgpu->sampler_state_b.bo); } static void -intel_gpgpu_set_perf_counters(intel_gpgpu_t *state, cl_buffer *perf) +intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf) { - if (state->perf_b.bo) - drm_intel_bo_unreference(state->perf_b.bo); + if (gpgpu->perf_b.bo) + drm_intel_bo_unreference(gpgpu->perf_b.bo); drm_intel_bo_reference((drm_intel_bo*) perf); - state->perf_b.bo = (drm_intel_bo*) perf; + gpgpu->perf_b.bo = (drm_intel_bo*) perf; } static void -intel_gpgpu_walker(intel_gpgpu_t *state, +intel_gpgpu_walker(intel_gpgpu_t *gpgpu, uint32_t simd_sz, uint32_t thread_n, const size_t global_wk_off[3], @@ -680,27 +700,27 @@ intel_gpgpu_walker(intel_gpgpu_t *state, global_wk_sz[2] / local_wk_sz[2] }; assert(simd_sz == 8 || simd_sz == 16); - BEGIN_BATCH(state->batch, 11); - OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9); - OUT_BATCH(state->batch, 0); /* kernel index == 0 */ + BEGIN_BATCH(gpgpu->batch, 11); + OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9); + OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ if (simd_sz == 16) - OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ + OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ else - OUT_BATCH(state->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */ - OUT_BATCH(state->batch, global_wk_off[0]); - OUT_BATCH(state->batch, global_wk_dim[0]); - OUT_BATCH(state->batch, global_wk_off[1]); - OUT_BATCH(state->batch, global_wk_dim[1]); - OUT_BATCH(state->batch, global_wk_off[2]); - OUT_BATCH(state->batch, global_wk_dim[2]); - OUT_BATCH(state->batch, ~0x0); - OUT_BATCH(state->batch, ~0x0); - ADVANCE_BATCH(state->batch); - - BEGIN_BATCH(state->batch, 2); - OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0); - OUT_BATCH(state->batch, 0); /* kernel index == 0 */ - ADVANCE_BATCH(state->batch); + OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */ + OUT_BATCH(gpgpu->batch, global_wk_off[0]); + OUT_BATCH(gpgpu->batch, global_wk_dim[0]); + OUT_BATCH(gpgpu->batch, global_wk_off[1]); + OUT_BATCH(gpgpu->batch, global_wk_dim[1]); + OUT_BATCH(gpgpu->batch, global_wk_off[2]); + OUT_BATCH(gpgpu->batch, global_wk_dim[2]); + OUT_BATCH(gpgpu->batch, ~0x0); + OUT_BATCH(gpgpu->batch, ~0x0); + ADVANCE_BATCH(gpgpu->batch); + + BEGIN_BATCH(gpgpu->batch, 2); + OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0); + OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ + ADVANCE_BATCH(gpgpu->batch); } LOCAL void diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c index aa634f4b..c9839832 100644 --- a/src/sim/sim_driver.c +++ b/src/sim/sim_driver.c @@ -163,11 +163,11 @@ sim_buffer_get_virtual(sim_buffer buf) return buf->data; } -static void* +static size_t sim_buffer_get_size(sim_buffer buf) { if (UNLIKELY(buf == NULL)) return 0; - return buf->data; + return buf->sz; } static int |