5 files changed, 85 insertions, 90 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 82d483af..56098ee8 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -153,27 +153,25 @@ static cl_int
 cl_fulsim_dump_all_surfaces(cl_command_queue queue, cl_kernel k)
 {
   cl_int err = CL_SUCCESS;
-#if 0
   cl_mem mem = NULL;
   int i;
   size_t j;
 
   /* Bind user defined surface */
-  for (i = 0; i < k->arg_info_n; ++i) {
+  for (i = 0; i < k->arg_n; ++i) { /* args 0..arg_n-1; non-global-pointer ones are skipped */
     size_t chunk_n, chunk_remainder;
-    if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER)
+    if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
       continue;
-    mem = (cl_mem) k->args[k->arg_info[i].arg_index];
+    mem = (cl_mem) k->args[i].mem; /* memory object stored for argument i */
     CHECK_MEM(mem);
-    chunk_n = mem->bo->size / chunk_sz;
-    chunk_remainder = mem->bo->size % chunk_sz;
+    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz; /* number of whole chunk_sz pieces */
+    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz; /* tail past the whole pieces */
     for (j = 0; j < chunk_n; ++j)
       aub_exec_dump_raw_file(mem->bo, j * chunk_sz, chunk_sz);
     if (chunk_remainder)
       aub_exec_dump_raw_file(mem->bo, chunk_n * chunk_sz, chunk_remainder);
   }
 error:
-#endif
   return err;
 }
 
@@ -196,7 +194,6 @@ struct bmphdr {
   /* raw b, g, r data here, dword aligned per scan line */
 };
 
-#if 0
 static int*
 cl_read_bmp(const char *filename, int *width, int *height)
 {
@@ -213,24 +210,6 @@ cl_read_bmp(const char *filename, int *width, int *height)
   n = fread(&hdr, 1, sizeof(hdr), fp);
   assert(n == sizeof(hdr));
 
-#if 0
-  /* Dump stuff out */
-  printf("   filesize = %d\n", hdr.filesize);	/* total file size incl header */
-  printf("        as0 = %d\n", hdr.as0);
-  printf("        as1 = %d\n", hdr.as1);
-  printf("  bmpoffset = %d\n", hdr.bmpoffset);	/* ofset of bmp data  */
-  printf("headerbytes = %d\n", hdr.headerbytes);	/* bytes in header from this point (40 actually) */
-  printf("      width = %d\n", hdr.width);
-  printf("     height = %d\n", hdr.height);
-  printf("    nplanes = %d\n", hdr.nplanes);	/* no of color planes */
-  printf("        bpp = %d\n", hdr.bpp);	/* bits/pixel */
-  printf("compression = %d\n", hdr.compression);	/* BI_RGB = 0 = no compression */
-  printf("    sizeraw = %d\n", hdr.sizeraw);	/* size of raw bmp file, excluding header, incl padding */
-  printf("       hres = %d\n", hdr.hres);	/* horz resolutions pixels/meter */
-  printf("       vres = %d\n", hdr.vres);
-  printf(" npalcolors = %d\n", hdr.npalcolors);	/* No of colors in palette */
-  printf(" nimportant = %d\n", hdr.nimportant);	/* No of important colors */
-#endif
   assert(hdr.width > 0 &&
          hdr.height > 0 &&
          hdr.nplanes == 1
@@ -278,26 +257,24 @@ cl_read_dump(const char *name, size_t *size)
     *size = sz;
   return dump;
 }
-#endif
 
 static cl_int
 cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
 {
   cl_int err = CL_SUCCESS;
-#if 0
   cl_mem mem = NULL;
   char *from = NULL, *to = NULL;
   size_t size, j, chunk_n, chunk_remainder;
   int i, curr = 0;
   /* Bind user defined surface */
-  for (i = 0; i < k->arg_info_n; ++i) {
-    if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER)
+  for (i = 0; i < k->arg_n; ++i) {
+    if (gbe_kernel_get_arg_type(k->opaque, i) != GBE_ARG_GLOBAL_PTR)
       continue;
-    mem = (cl_mem) k->args[k->arg_info[i].arg_index];
+    mem = (cl_mem) k->args[i].mem;
     CHECK_MEM(mem);
     assert(mem->bo);
-    chunk_n = mem->bo->size / chunk_sz;
-    chunk_remainder = mem->bo->size % chunk_sz;
+    chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
+    chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
     to = cl_mem_map(mem);
     for (j = 0; j < chunk_n; ++j) {
       char name[256];
@@ -328,11 +305,9 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
     cl_mem_unmap(mem);
   }
 error:
-#endif
   return err;
-
 }
-#endif /* USE_FULSIM */
+#endif
 
 extern cl_int cl_command_queue_ND_range_gen7(cl_command_queue, cl_kernel, const size_t *, const size_t *, const size_t *);
 
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 076f3983..75df8dd9 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -195,7 +195,7 @@ typedef void* (cl_buffer_get_virtual_cb)(cl_buffer);
 extern cl_buffer_get_virtual_cb *cl_buffer_get_virtual;
 
 /* Get the size of the buffer */
-typedef void* (cl_buffer_get_size_cb)(cl_buffer);
+typedef size_t (cl_buffer_get_size_cb)(cl_buffer);
 extern cl_buffer_get_size_cb *cl_buffer_get_size;
 
 /* Pin a buffer */
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index 78d6cb7f..d487fc06 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -340,7 +340,7 @@ intel_driver_get_ver(struct intel_driver *drv)
   return drv->gen_ver;
 }
 
-static uint32_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; }
+static size_t drm_intel_bo_get_size(drm_intel_bo *bo) { return bo->size; } /* accessor; presumably backs cl_buffer_get_size - confirm */
 static void* drm_intel_bo_get_virtual(drm_intel_bo *bo) { return bo->virtual; }
 
 LOCAL void
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 4de28506..c2a3745a 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -486,23 +486,29 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *state, int32_t index, dri_bo* obj_
                     obj_bo);
 }
 
+/* Use two 1GB surfaces (surface states 0 and 1) to map the 2GB address space */
 static void
-intel_gpgpu_map_address_space(intel_gpgpu_t *state,
-                              int32_t index,
-                              uint32_t size,
-                              uint32_t cchint)
+intel_gpgpu_map_address_space(intel_gpgpu_t *state)
 {
   surface_heap_t *heap = state->surface_heap_b.bo->virtual;
-  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
-  const uint32_t size_ss = size - 1;
-  memset(ss, 0, sizeof(*ss));
-  ss->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss->ss0.surface_format = I965_SURFACEFORMAT_RAW;
-  ss->ss1.base_addr = 0;
-  ss->ss2.width  = size_ss & 0x7f;               /* bits 6:0 of size_ss */
-  ss->ss2.height = (size_ss & 0x1fff80) >> 7;    /* bits 20:7 of size_ss */
-  ss->ss3.depth  = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */
-  ss->ss5.cache_control = cc_llc_l3;
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[0];
+  gen7_surface_state_t *ss1 = (gen7_surface_state_t *) heap->surface[1];
+  const uint32_t sz = (1<<30) - 1;
+  /* sz is 1GB minus one; its bits are split over width/height/depth below */
+  memset(ss0, 0, sizeof(gen7_surface_state_t));
+  memset(ss1, 0, sizeof(gen7_surface_state_t));
+  ss1->ss0.surface_type = ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss1->ss0.surface_format = ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss0->ss1.base_addr = 0;       /* first surface starts at address 0 */
+  ss1->ss1.base_addr = 1<<30;   /* second surface starts at 1GB */
+  ss1->ss2.width  = ss0->ss2.width  = sz & 127;          /* bits 6:0 of sz */
+  ss1->ss2.height = ss0->ss2.height = (sz >> 7) & 16383; /* bits 20:7 of sz */
+  ss1->ss3.depth  = ss0->ss3.depth  = (sz >> 21) & 1023; /* bits 30:21 of sz */
+  ss1->ss5.cache_control = ss0->ss5.cache_control = cc_llc_l3;
+  heap->binding_table[0] = offsetof(surface_heap_t, surface)
+                         + 0 * sizeof(gen7_surface_state_t); /* offset of state 0 inside the heap */
+  heap->binding_table[1] = offsetof(surface_heap_t, surface)
+                         + 1 * sizeof(gen7_surface_state_t); /* offset of state 1 inside the heap */
 }
 
 static void
@@ -628,46 +634,60 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
 }
 
 static void
-intel_gpgpu_upload_constants(intel_gpgpu_t *state, const void* data, uint32_t size)
+intel_gpgpu_upload_constants(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
   unsigned char *constant_buffer = NULL;
+  cl_gpgpu_kernel *k = gpgpu->ker; /* set by intel_gpgpu_states_setup; not NULL-checked here */
+  uint32_t i, j;
 
-  dri_bo_map(state->curbe_b.bo, 1);
-  assert(state->curbe_b.bo->virtual);
-  constant_buffer = (unsigned char *) state->curbe_b.bo->virtual;
+  /* Upload the data first: copy size bytes of data into the mapped curbe */
+  dri_bo_map(gpgpu->curbe_b.bo, 1);
+  assert(gpgpu->curbe_b.bo->virtual);
+  constant_buffer = (unsigned char *) gpgpu->curbe_b.bo->virtual;
   memcpy(constant_buffer, data, size);
-  dri_bo_unmap(state->curbe_b.bo);
+  dri_bo_unmap(gpgpu->curbe_b.bo);
+
+  /* Now emit the flat address space relocations: one per (thread, bound buffer) pair */
+  for (i = 0; i < k->thread_n; ++i)
+    for (j = 0; j < gpgpu->binded_n; ++j)
+      drm_intel_bo_emit_reloc(gpgpu->curbe_b.bo,
+                              gpgpu->binded_offset[j]+i*k->cst_sz, /* slot j in thread i's cst_sz slice */
+                              gpgpu->binded_buf[j],
+                              0,
+                              I915_GEM_DOMAIN_RENDER,
+                              I915_GEM_DOMAIN_RENDER); /* NOTE(review): result of emit_reloc is ignored */
 }
 
 static void
-intel_gpgpu_upload_samplers(intel_gpgpu_t *state, const void *data, uint32_t n)
+intel_gpgpu_upload_samplers(intel_gpgpu_t *gpgpu, const void *data, uint32_t n)
 {
   if (n) {
     const size_t sz = n * sizeof(gen6_sampler_state_t);
-    memcpy(state->sampler_state_b.bo->virtual, data, sz);
+    memcpy(gpgpu->sampler_state_b.bo->virtual, data, sz); /* assumes the bo is mapped - TODO confirm */
   }
 }
 
 static void
-intel_gpgpu_states_setup(intel_gpgpu_t *state, cl_gpgpu_kernel *kernel)
+intel_gpgpu_states_setup(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
-  state->ker = kernel;
-  intel_gpgpu_build_idrt(state, kernel);
-  dri_bo_unmap(state->surface_heap_b.bo);
-  dri_bo_unmap(state->sampler_state_b.bo);
+  gpgpu->ker = kernel; /* read back later by intel_gpgpu_upload_constants */
+  intel_gpgpu_build_idrt(gpgpu, kernel);
+  intel_gpgpu_map_address_space(gpgpu); /* writes the surface heap, so it must precede the unmap below */
+  dri_bo_unmap(gpgpu->surface_heap_b.bo);
+  dri_bo_unmap(gpgpu->sampler_state_b.bo);
 }
 
 static void
-intel_gpgpu_set_perf_counters(intel_gpgpu_t *state, cl_buffer *perf)
+intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf)
 {
-  if (state->perf_b.bo)
-    drm_intel_bo_unreference(state->perf_b.bo);
+  if (gpgpu->perf_b.bo)
+    drm_intel_bo_unreference(gpgpu->perf_b.bo); /* release the previously stored perf bo */
   drm_intel_bo_reference((drm_intel_bo*) perf);
-  state->perf_b.bo = (drm_intel_bo*) perf;
+  gpgpu->perf_b.bo = (drm_intel_bo*) perf; /* NOTE(review): unref happens before ref; confirm perf is never the stored bo */
 }
 
 static void
-intel_gpgpu_walker(intel_gpgpu_t *state,
+intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
@@ -680,27 +700,27 @@ intel_gpgpu_walker(intel_gpgpu_t *state,
     global_wk_sz[2] / local_wk_sz[2]
   };
   assert(simd_sz == 8 || simd_sz == 16);
-  BEGIN_BATCH(state->batch, 11);
-  OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
-  OUT_BATCH(state->batch, 0);                        /* kernel index == 0 */
+  BEGIN_BATCH(gpgpu->batch, 11);
+  OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 9);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
   if (simd_sz == 16)
-    OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
+    OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
   else
-    OUT_BATCH(state->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
-  OUT_BATCH(state->batch, global_wk_off[0]);
-  OUT_BATCH(state->batch, global_wk_dim[0]);
-  OUT_BATCH(state->batch, global_wk_off[1]);
-  OUT_BATCH(state->batch, global_wk_dim[1]);
-  OUT_BATCH(state->batch, global_wk_off[2]);
-  OUT_BATCH(state->batch, global_wk_dim[2]);
-  OUT_BATCH(state->batch, ~0x0);
-  OUT_BATCH(state->batch, ~0x0);
-  ADVANCE_BATCH(state->batch);
-
-  BEGIN_BATCH(state->batch, 2);
-  OUT_BATCH(state->batch, CMD_MEDIA_STATE_FLUSH | 0);
-  OUT_BATCH(state->batch, 0);                        /* kernel index == 0 */
-  ADVANCE_BATCH(state->batch);
+    OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8  | thread max */
+  OUT_BATCH(gpgpu->batch, global_wk_off[0]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[0]);
+  OUT_BATCH(gpgpu->batch, global_wk_off[1]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[1]);
+  OUT_BATCH(gpgpu->batch, global_wk_off[2]);
+  OUT_BATCH(gpgpu->batch, global_wk_dim[2]);
+  OUT_BATCH(gpgpu->batch, ~0x0);
+  OUT_BATCH(gpgpu->batch, ~0x0);
+  ADVANCE_BATCH(gpgpu->batch);
+
+  BEGIN_BATCH(gpgpu->batch, 2);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  ADVANCE_BATCH(gpgpu->batch);
 }
 
 LOCAL void
diff --git a/src/sim/sim_driver.c b/src/sim/sim_driver.c
index aa634f4b..c9839832 100644
--- a/src/sim/sim_driver.c
+++ b/src/sim/sim_driver.c
@@ -163,11 +163,11 @@ sim_buffer_get_virtual(sim_buffer buf)
   return buf->data;
 }
 
-static void*
+static size_t
 sim_buffer_get_size(sim_buffer buf)
 {
   if (UNLIKELY(buf == NULL)) return 0;
-  return buf->data;
+  return buf->sz; /* the sz field; a NULL buf already returned 0 above */
 }
 
 static int