Diffstat (limited to 'src')
 src/cl_command_queue.c     |  74
 src/cl_device_id.c         |  56
 src/cl_gen6_device.h       |  64
 src/cl_gen7_device.h       |  30
 src/cl_gt_device.h         |  77
 src/cl_kernel.c            |   6
 src/intel/cl_device_data.h |  18
 src/intel/genx_defines.h   |   1
 src/intel/genx_gpgpu.c     | 182
 src/intel/genx_gpgpu.h     |   2
 src/intel/intel_driver.c   |  14
11 files changed, 360 insertions, 164 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index a03526cf..f7a27405 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -131,8 +131,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, for (i = 0; i < k->arg_info_n; ++i) { if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER) continue; - - /* XXX 64 comes from the patch list format. May change */ assert(k->arg_info[i].offset % SURFACE_SZ == 0); index = k->arg_info[i].offset / SURFACE_SZ; mem = (cl_mem) k->args[k->arg_info[i].arg_index]; @@ -362,8 +360,8 @@ error: static char* cl_kernel_create_cst_buffer(cl_kernel k, cl_uint work_dim, - const size_t *global_work_size, - const size_t *local_work_size) + const size_t *global_wk_sz, + const size_t *local_wk_sz) { cl_curbe_patch_info_t *info = NULL; const size_t sz = k->patch.curbe.sz; @@ -376,24 +374,24 @@ cl_kernel_create_cst_buffer(cl_kernel k, /* Global work group size */ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size+1, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size+2, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); /* Local work group size */ key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size+1, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size+2, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); exit: return data; @@ -411,10 +409,17 @@ cl_run_fulsim(void) const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE"); if (run_it == NULL || strcmp(run_it, "1")) return; +#if EMULATE_GEN == 6 /* SNB */ if (debug_mode == NULL || strcmp(debug_mode, "1")) system("wine AubLoad.exe dump.aub -device sbrB0"); else system("wine AubLoad.exe dump.aub -device sbrB0 -debug"); +#elif EMULATE_GEN == 7 + if (debug_mode == NULL || strcmp(debug_mode, "1")) + system("wine AubLoad.exe dump.aub -device ivb2"); + else + system("wine AubLoad.exe dump.aub -device ivb2 -debug"); +#endif } #endif /* USE_FULSIM */ @@ -423,8 +428,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue, cl_kernel ker, cl_uint work_dim, const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size) + const size_t *global_wk_sz, + const size_t *local_wk_sz) { cl_context ctx = queue->ctx; genx_gpgpu_state_t *gpgpu = queue->gpgpu; @@ -458,13 +463,13 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Total number of elements in the work group */ for (i = 0; i < work_dim; ++i) if 
((&ker->patch.exec_env.required_wgr_sz_x)[i] && - (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_work_size[i]) { + (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) { err = CL_INVALID_WORK_ITEM_SIZE; goto error; } - wrk_grp_sz = local_work_size[0]; + wrk_grp_sz = local_wk_sz[0]; for (i = 1; i < work_dim; ++i) - wrk_grp_sz *= local_work_size[i]; + wrk_grp_sz *= local_wk_sz[i]; FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16"); if (wrk_grp_sz > ctx->device->max_work_group_size) { err = CL_INVALID_WORK_ITEM_SIZE; @@ -472,9 +477,9 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } /* Directly from the user defined values */ - header.local_sz[0] = local_work_size[0]; - header.local_sz[1] = local_work_size[1]; - header.local_sz[2] = local_work_size[2]; + header.local_sz[0] = local_wk_sz[0]; + header.local_sz[1] = local_wk_sz[1]; + header.local_sz[2] = local_wk_sz[2]; offset[0] = header.grp_n[0] = 0; offset[1] = header.grp_n[1] = 0; offset[2] = header.grp_n[2] = 0; @@ -483,7 +488,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* offsets are evenly divided by the local sizes */ if (global_work_offset) for (i = 0; i < work_dim; ++i) - offset[i] = global_work_offset[i]/local_work_size[i]; + offset[i] = global_work_offset[i]/local_wk_sz[i]; /* Compute the local size per wg and the offsets for each local buffer */ cl_kernel_handle_local_memory(ker, &header); @@ -506,20 +511,17 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Fill the constant buffer */ if (cst_sz > 0) { - char *completed_cst = NULL; + char *data = NULL; assert(ker->cst_buffer); - completed_cst = cl_kernel_create_cst_buffer(ker, - work_dim, - global_work_size, - local_work_size); - gpgpu_upload_constants(gpgpu, completed_cst, cst_sz); - cl_free(completed_cst); + data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz); + gpgpu_upload_constants(gpgpu, data, cst_sz); + cl_free(data); } wrk_grp_n = 1; for (i = 0; i < work_dim; ++i) { TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t))); - grp_end[i] = offset[i] + global_work_size[i] / local_work_size[i]; + grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i]; wrk_grp_n *= grp_end[i]-offset[i]; } thread_n = wrk_grp_sz / 16; @@ -528,16 +530,16 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Start a new batch buffer */ gpgpu_batch_reset(gpgpu, batch_sz); gpgpu_batch_start(gpgpu); - +#if 1 /* Push all media objects. We implement three paths to make it (a bit) faster. * Local IDs are shared from work group to work group. 
We allocate once the * buffers and reuse them */ if (work_dim == 3) { curr = 0; - for (i = 0; i < local_work_size[0]; ++i) - for (j = 0; j < local_work_size[1]; ++j) - for (k = 0; k < local_work_size[2]; ++k, ++curr) { + for (i = 0; i < local_wk_sz[0]; ++i) + for (j = 0; j < local_wk_sz[1]; ++j) + for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { ((uint16_t*) ids[0])[curr] = i; ((uint16_t*) ids[1])[curr] = j; ((uint16_t*) ids[2])[curr] = k; @@ -553,8 +555,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } else if (work_dim == 2) { curr = 0; - for (i = 0; i < local_work_size[0]; ++i) - for (j = 0; j < local_work_size[1]; ++j, ++curr) { + for (i = 0; i < local_wk_sz[0]; ++i) + for (j = 0; j < local_wk_sz[1]; ++j, ++curr) { ((uint16_t*) ids[0])[curr] = i; ((uint16_t*) ids[1])[curr] = j; } @@ -567,7 +569,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } } else { - for (i = 0; i < local_work_size[0]; ++i) + for (i = 0; i < local_wk_sz[0]; ++i) ((uint16_t*) ids[0])[i] = i; for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) { if (ker->patch.exec_env.has_barriers) @@ -576,7 +578,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, barrierID = (barrierID + 1) % 16; } } - +#endif gpgpu_batch_end(gpgpu, 0); gpgpu_flush(gpgpu); @@ -616,9 +618,9 @@ cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem) #if USE_FULSIM cl_context ctx = queue->ctx; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo); #endif /* USE_FULSIM */ + queue->fulsim_out = mem; if (queue->fulsim_out != NULL) { cl_mem_delete(queue->fulsim_out); diff --git a/src/cl_device_id.c b/src/cl_device_id.c index a846adfb..25106ae0 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -29,46 +29,75 @@ #include <stdio.h> #include <string.h> -static struct _cl_device_id intel_gt2_device = { +static struct _cl_device_id intel_snb_gt2_device = { .max_compute_unit = 60, .max_work_item_sizes = {512, 512, 512}, .max_work_group_size = 512, .max_clock_frequency = 1350, - /* Common fields between GT1 and GT2 */ #include "cl_gen6_device.h" }; -static struct _cl_device_id intel_gt1_device = { +static struct _cl_device_id intel_snb_gt1_device = { .max_compute_unit = 24, .max_work_item_sizes = {256, 256, 256}, .max_work_group_size = 256, .max_clock_frequency = 1000, - /* Common fields between GT1 and GT2 */ #include "cl_gen6_device.h" }; +static struct _cl_device_id intel_ivb_gt2_device = { + .max_compute_unit = 128, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, + + #include "cl_gen7_device.h" +}; + +static struct _cl_device_id intel_ivb_gt1_device = { + .max_compute_unit = 64, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, + + #include "cl_gen7_device.h" +}; + LOCAL cl_device_id cl_get_gt_device(void) { cl_device_id ret = NULL; int device_id = cl_intel_get_device_id(); - if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 || + if (device_id == PCI_CHIP_IVYBRIDGE_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_M_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_S_GT1) { + intel_ivb_gt1_device.vendor_id = device_id; + intel_ivb_gt1_device.platform = intel_platform; + ret = &intel_ivb_gt1_device; + } + else if (device_id == PCI_CHIP_IVYBRIDGE_GT2 || + device_id == PCI_CHIP_IVYBRIDGE_M_GT2) { + intel_ivb_gt2_device.vendor_id = device_id; + intel_ivb_gt2_device.platform = intel_platform; + ret = &intel_ivb_gt2_device; + } + else if (device_id == 
PCI_CHIP_SANDYBRIDGE_GT1 || device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 || device_id == PCI_CHIP_SANDYBRIDGE_S_GT) { - intel_gt1_device.vendor_id = device_id; - intel_gt1_device.platform = intel_platform; - ret = &intel_gt1_device; + intel_snb_gt1_device.vendor_id = device_id; + intel_snb_gt1_device.platform = intel_platform; + ret = &intel_snb_gt1_device; } else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2 || device_id == PCI_CHIP_SANDYBRIDGE_M_GT2 || device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS || device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) { - intel_gt2_device.vendor_id = device_id; - intel_gt2_device.platform = intel_platform; - ret = &intel_gt2_device; + intel_snb_gt2_device.vendor_id = device_id; + intel_snb_gt2_device.platform = intel_platform; + ret = &intel_snb_gt2_device; } return ret; } @@ -130,7 +159,10 @@ cl_get_device_info(cl_device_id device, void * param_value, size_t * param_value_size_ret) { - if (UNLIKELY(device != &intel_gt1_device && device != &intel_gt2_device)) + if (UNLIKELY(device != &intel_snb_gt1_device && + device != &intel_snb_gt2_device && + device != &intel_ivb_gt1_device && + device != &intel_ivb_gt2_device)) return CL_INVALID_DEVICE; if (UNLIKELY(param_value == NULL)) return CL_INVALID_VALUE; diff --git a/src/cl_gen6_device.h b/src/cl_gen6_device.h index 32c01c2f..b09121fd 100644 --- a/src/cl_gen6_device.h +++ b/src/cl_gen6_device.h @@ -17,68 +17,14 @@ * Author: Benjamin Segovia <benjamin.segovia@intel.com> */ -/* Common fields for both GT1 and GT2 devices. Fields which are not shared are - * set in cl_device_id_object.c which basically deals with OpenCL devices +/* Common fields for both SNB devices (either GT1 or GT2) */ -.device_type = CL_DEVICE_TYPE_GPU, -.vendor_id = 0, /* == device_id (set when requested) */ -.max_work_item_dimensions = 3, -.preferred_vector_width_char = 16, -.preferred_vector_width_short = 16, -.preferred_vector_width_int = 16, -.preferred_vector_width_long = 16, -.preferred_vector_width_float = 16, -.preferred_vector_width_double = 0, -.preferred_vector_width_half = 0, -.native_vector_width_char = 16, -.native_vector_width_short = 16, -.native_vector_width_int = 16, -.native_vector_width_long = 16, -.native_vector_width_float = 16, -.native_vector_width_double = 16, -.native_vector_width_half = 16, -.address_bits = 32, -.max_mem_alloc_size = 128 * 1024 * 1024, -.image_support = CL_FALSE, -.max_read_image_args = 0, -.max_write_image_args = 0, -.image2d_max_width = 0, -.image2d_max_height = 0, -.image3d_max_width = 0, -.image3d_max_height = 0, -.image3d_max_depth = 0, -.max_samplers = 0, -.max_parameter_size = 256, /* Gen6 */ -.mem_base_addr_align = sizeof(cl_uint) * 8, -.min_data_type_align_size = sizeof(cl_uint), -.single_fp_config = 0, /* XXX */ -.global_mem_cache_type = CL_READ_WRITE_CACHE, +.max_parameter_size = 256, .global_mem_cache_line_size = 128, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.global_mem_size = 4, -.max_constant_buffer_size = 64 << 10, -.max_constant_args = 8, -.local_mem_type = CL_GLOBAL, /* Gen6 */ -.local_mem_size = 16 << 10, /* Gen6 */ -.error_correction_support = CL_FALSE, -.host_unified_memory = CL_FALSE, -.profiling_timer_resolution = 80, /* ns */ -.endian_little = CL_TRUE, -.available = CL_TRUE, -.compiler_available = CL_FALSE, /* XXX */ -.execution_capabilities = CL_EXEC_KERNEL, -.queue_properties = CL_QUEUE_PROFILING_ENABLE, -.platform = NULL, /* == intel_platform (set when requested) */ +.local_mem_type = CL_GLOBAL, +.local_mem_size = 16 << 10, .gfx_id = IGFX_GEN6_CORE, -#define 
DECL_INFO_STRING(FIELD, STRING) \ - .FIELD = STRING, \ - .JOIN(FIELD,_sz) = sizeof(STRING) + 1, -DECL_INFO_STRING(name, "Intel HD Graphics Family") -DECL_INFO_STRING(vendor, "Intel") -DECL_INFO_STRING(version, "OpenCL 1.10") -DECL_INFO_STRING(profile, "FULL_PROFILE") -DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10") -DECL_INFO_STRING(extensions, "") -#undef DECL_INFO_STRING +#include "cl_gt_device.h" diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h new file mode 100644 index 00000000..75c4e3f0 --- /dev/null +++ b/src/cl_gen7_device.h @@ -0,0 +1,30 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + * Author: Benjamin Segovia <benjamin.segovia@intel.com> + */ + +/* Common fields for both SNB devices (either GT1 or GT2) + */ +.max_parameter_size = 256, +.global_mem_cache_line_size = 128, /* XXX */ +.global_mem_cache_size = 8 << 10, /* XXX */ +.local_mem_type = CL_GLOBAL, +.local_mem_size = 64 << 10, +.gfx_id = IGFX_GEN7_CORE, + +#include "cl_gt_device.h" + diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h new file mode 100644 index 00000000..d66d6ead --- /dev/null +++ b/src/cl_gt_device.h @@ -0,0 +1,77 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Author: Benjamin Segovia <benjamin.segovia@intel.com> + */ + +/* Common fields for both all GT devices (IVB / SNB) */ +.device_type = CL_DEVICE_TYPE_GPU, +.vendor_id = 0, /* == device_id (set when requested) */ +.max_work_item_dimensions = 3, +.preferred_vector_width_char = 16, +.preferred_vector_width_short = 16, +.preferred_vector_width_int = 16, +.preferred_vector_width_long = 16, +.preferred_vector_width_float = 16, +.preferred_vector_width_double = 0, +.preferred_vector_width_half = 0, +.native_vector_width_char = 16, +.native_vector_width_short = 16, +.native_vector_width_int = 16, +.native_vector_width_long = 16, +.native_vector_width_float = 16, +.native_vector_width_double = 16, +.native_vector_width_half = 16, +.address_bits = 32, +.max_mem_alloc_size = 128 * 1024 * 1024, +.image_support = CL_FALSE, +.max_read_image_args = 0, +.max_write_image_args = 0, +.image2d_max_width = 0, +.image2d_max_height = 0, +.image3d_max_width = 0, +.image3d_max_height = 0, +.image3d_max_depth = 0, +.max_samplers = 0, +.mem_base_addr_align = sizeof(cl_uint) * 8, +.min_data_type_align_size = sizeof(cl_uint), +.single_fp_config = 0, /* XXX */ +.global_mem_cache_type = CL_READ_WRITE_CACHE, +.global_mem_size = 4, +.max_constant_buffer_size = 64 << 10, +.max_constant_args = 8, +.error_correction_support = CL_FALSE, +.host_unified_memory = CL_FALSE, +.profiling_timer_resolution = 80, /* ns */ +.endian_little = CL_TRUE, +.available = CL_TRUE, +.compiler_available = CL_FALSE, /* XXX */ +.execution_capabilities = CL_EXEC_KERNEL, +.queue_properties = CL_QUEUE_PROFILING_ENABLE, +.platform = NULL, /* == intel_platform (set when requested) */ + +#define DECL_INFO_STRING(FIELD, STRING) \ + .FIELD = STRING, \ + .JOIN(FIELD,_sz) = sizeof(STRING) + 1, +DECL_INFO_STRING(name, "Intel HD Graphics Family") +DECL_INFO_STRING(vendor, "Intel") +DECL_INFO_STRING(version, "OpenCL 1.10") +DECL_INFO_STRING(profile, "FULL_PROFILE") +DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10") +DECL_INFO_STRING(extensions, "") +#undef DECL_INFO_STRING + + diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 5c07b9bd..20c0f427 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -347,6 +347,10 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env); ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload); + case PATCH_TOKEN_DATA_PARAMETER_STREAM: + info->curbe.sz = *(uint32_t *) patch; + info->curbe.offset = 0; + break; case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT: case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT: { @@ -382,6 +386,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES: case DATA_PARAMETER_LOCAL_WORK_SIZE: case DATA_PARAMETER_GLOBAL_WORK_SIZE: + case DATA_PARAMETER_GLOBAL_WORK_OFFSET: case DATA_PARAMETER_NUM_WORK_GROUPS: case DATA_PARAMETER_WORK_DIMENSIONS: case DATA_PARAMETER_IMAGE_WIDTH: @@ -389,6 +394,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case DATA_PARAMETER_IMAGE_DEPTH: case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE: case DATA_PARAMETER_IMAGE_CHANNEL_ORDER: + case DATA_PARAMETER_NUM_HARDWARE_THREADS: { curbe_key = cl_curbe_key(data->type, data->index, data->src_offset); curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key); diff --git a/src/intel/cl_device_data.h b/src/intel/cl_device_data.h index b2acee95..b7faef16 100644 --- a/src/intel/cl_device_data.h +++ b/src/intel/cl_device_data.h @@ -62,6 +62,24 @@ devid == 
PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \ devid == PCI_CHIP_SANDYBRIDGE_S_GT) +#define PCI_CHIP_IVYBRIDGE_GT1 0x0152 /* Desktop */ +#define PCI_CHIP_IVYBRIDGE_GT2 0x0162 +#define PCI_CHIP_IVYBRIDGE_M_GT1 0x0156 /* Mobile */ +#define PCI_CHIP_IVYBRIDGE_M_GT2 0x0166 +#define PCI_CHIP_IVYBRIDGE_S_GT1 0x015a /* Server */ + +#define IS_IVB_GT1(devid) \ + (devid == PCI_CHIP_IVYBRIDGE_GT1 || \ + devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \ + devid == PCI_CHIP_IVYBRIDGE_S_GT1) + +#define IS_IVB_GT2(devid) \ + (devid == PCI_CHIP_IVYBRIDGE_GT2 || \ + devid == PCI_CHIP_IVYBRIDGE_M_GT2) + +#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid)) +#define IS_GEN7(devid) IS_IVYBRIDGE(devid) + #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ diff --git a/src/intel/genx_defines.h b/src/intel/genx_defines.h index 819fcbd7..af0e3db1 100644 --- a/src/intel/genx_defines.h +++ b/src/intel/genx_defines.h @@ -239,6 +239,7 @@ #define I965_SURFACEFORMAT_R16G16B16_SNORM 0x19D #define I965_SURFACEFORMAT_R16G16B16_SSCALED 0x19E #define I965_SURFACEFORMAT_R16G16B16_USCALED 0x19F +#define I965_SURFACEFORMAT_RAW 0x1FF #define I965_CULLMODE_BOTH 0 #define I965_CULLMODE_NONE 1 diff --git a/src/intel/genx_gpgpu.c b/src/intel/genx_gpgpu.c index 1685b0fd..adf73f9a 100644 --- a/src/intel/genx_gpgpu.c +++ b/src/intel/genx_gpgpu.c @@ -107,8 +107,77 @@ typedef struct gen6_surface_state uint32_t vertical_alignment:1; uint32_t x_offset:7; } ss5; + + uint32_t ss6; /* unused */ + uint32_t ss7; /* unused */ } gen6_surface_state_t; +typedef struct gen7_surface_state +{ + struct { + uint32_t cube_pos_z:1; + uint32_t cube_neg_z:1; + uint32_t cube_pos_y:1; + uint32_t cube_neg_y:1; + uint32_t cube_pos_x:1; + uint32_t cube_neg_x:1; + uint32_t media_boundary_pixel_mode:2; + uint32_t render_cache_rw_mode:1; + uint32_t pad1:1; + uint32_t surface_array_spacing:1; + uint32_t vertical_line_stride_offset:1; + uint32_t vertical_line_stride:1; + uint32_t tile_walk:1; + uint32_t tiled_surface:1; + uint32_t horizontal_alignment:1; + uint32_t vertical_alignment:2; + uint32_t surface_format:9; + uint32_t pad0:1; + uint32_t surface_array:1; + uint32_t surface_type:3; + } ss0; + + struct { + uint32_t base_addr; + } ss1; + + struct { + uint32_t width:14; + uint32_t pad1:2; + uint32_t height:14; + uint32_t pad0:2; + } ss2; + + struct { + uint32_t pitch:18; + uint32_t pad0:3; + uint32_t depth:11; + } ss3; + + uint32_t ss4; + + struct { + uint32_t mip_count:4; + uint32_t surface_min_load:4; + uint32_t pad2:6; + uint32_t coherence_type:1; + uint32_t stateless_force_write_thru:1; + uint32_t surface_object_control_state:4; + uint32_t y_offset:4; + uint32_t pad0:1; + uint32_t x_offset:7; + } ss5; + + uint32_t ss6; /* unused */ + uint32_t ss7; /* unused */ + +} gen7_surface_state_t; + +#define GEN7_CACHED_IN_LLC 3 + +STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t)); +static const size_t surface_state_sz = sizeof(gen6_surface_state_t); + typedef struct gen6_vfe_state_inline { struct { @@ -343,13 +412,16 @@ gpgpu_set_base_address(genx_gpgpu_state_t *state) OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ /* If we output an AUB file, we limit the total size to 64MB */ #if USE_FULSIM - OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound - Ignore Check */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper 
Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */ #else OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); #endif - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound - Ignore Check */ - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound - Ignore Check */ - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound - Ignore Check */ ADVANCE_BATCH(state->batch); } @@ -383,11 +455,10 @@ gpgpu_load_constant_buffer(genx_gpgpu_state_t *state) BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ OUT_BATCH(state->batch, 0); /* mbz */ - OUT_BATCH(state->batch, state->urb.size_cs_entry* - state->urb.num_cs_entries*32); - OUT_RELOC(state->batch, state->curbe_b.bo, - I915_GEM_DOMAIN_INSTRUCTION, 0, - 0); + OUT_BATCH(state->batch, + state->urb.size_cs_entry* + state->urb.num_cs_entries*32); + OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(state->batch); } @@ -398,9 +469,7 @@ gpgpu_load_idrt(genx_gpgpu_state_t *state) OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ OUT_BATCH(state->batch, 0); /* mbz */ OUT_BATCH(state->batch, state->idrt_b.num*32); - OUT_RELOC(state->batch, state->idrt_b.bo, - I915_GEM_DOMAIN_INSTRUCTION, 0, - 0); + OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(state->batch); } @@ -469,7 +538,7 @@ gpgpu_flush(genx_gpgpu_state_t *state) LOCAL void gpgpu_state_init(genx_gpgpu_state_t *state, - uint32_t max_thr, + uint32_t max_threads, uint32_t size_vfe_entry, uint32_t num_vfe_entries, uint32_t size_cs_entry, @@ -478,8 +547,6 @@ gpgpu_state_init(genx_gpgpu_state_t *state, dri_bo *bo; int32_t i; - assert(max_thr > 0 && max_thr < MAX_THREADS); - /* URB */ state->urb.vfe_start = 0; state->urb.num_vfe_entries = num_vfe_entries; @@ -487,6 +554,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state, state->urb.num_cs_entries = num_cs_entries; state->urb.size_cs_entry = size_cs_entry; state->urb.cs_start = state->urb.vfe_start + state->urb.num_vfe_entries * state->urb.size_vfe_entry; + state->max_threads = max_threads; /* constant buffer */ if(state->curbe_b.bo) @@ -494,7 +562,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state, uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * (512/8); size_cb = (size_cb + (4096 - 1)) & (~(4096-1)); /* roundup to 4K */ bo = dri_bo_alloc(state->drv->bufmgr, - "constant buffer", + "CONSTANT_BUFFER", size_cb, 64); assert(bo); @@ -511,13 +579,13 @@ gpgpu_state_init(genx_gpgpu_state_t *state, if(state->binding_table_b.bo) dri_bo_unreference(state->binding_table_b.bo); bo = dri_bo_alloc(state->drv->bufmgr, - "binding table", + "SS_SURF_BIND", MAX_SURFACES * sizeof(uint32_t), 32); assert(bo); state->binding_table_b.bo = bo; - /* interface descriptor remapping table */ + /* IDRT */ if(state->idrt_b.bo) dri_bo_unreference(state->idrt_b.bo); bo = dri_bo_alloc(state->drv->bufmgr, @@ -565,14 +633,11 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state, state->surface_state_b[index].bo = NULL; } - bo = dri_bo_alloc(state->drv->bufmgr, - "surface state", - sizeof(gen6_surface_state_t), - 32); + bo = 
dri_bo_alloc(state->drv->bufmgr, "surface state", surface_state_sz, 32); assert(bo); dri_bo_map(bo, 1); assert(bo->virtual); - ss = (gen6_surface_state_t *)bo->virtual; + ss = (gen6_surface_state_t*) bo->virtual; memset(ss, 0, sizeof(*ss)); ss->ss0.surface_type = I965_SURFACE_2D; ss->ss0.surface_format = format; @@ -584,9 +649,8 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state, ss->ss3.pitch = (w*4) - 1; /* TEMP patch */ /* TODO: parse GFDT bit as well */ - if(state->drv->gen_ver == 6) { + if(state->drv->gen_ver == 6) ss->ss5.cache_control = cchint; - } if (is_dst) { write_domain = I915_GEM_DOMAIN_RENDER; @@ -704,9 +768,6 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state, uint32_t size, uint32_t cchint) { - uint32_t size_ss = ((size + 0xf) >> 4)-1; /* ceil(size/16) - 1 */ - - gen6_surface_state_t *ss; dri_bo *bo; uint32_t write_domain, read_domain; @@ -718,40 +779,51 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state, state->surface_state_b[index].bo = NULL; } - bo = dri_bo_alloc(state->drv->bufmgr, - "surface state", - sizeof(gen6_surface_state_t), 32); + bo = dri_bo_alloc(state->drv->bufmgr, "SS_SURFACE", surface_state_sz, 32); assert(bo); dri_bo_map(bo, 1); assert(bo->virtual); - ss = (gen6_surface_state_t *)bo->virtual; - memset(ss, 0, sizeof(*ss)); - - ss->ss0.surface_type = I965_SURFACE_BUFFER; - ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT; - ss->ss0.vert_line_stride = 0; - ss->ss0.vert_line_stride_ofs = 0; - ss->ss1.base_addr = obj_bo->offset + offset; - ss->ss2.width = (size_ss & 0x7f); /* bits 6:0 of size_ss */ - ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */ - ss->ss3.pitch = 16-1; - ss->ss3.depth = (size_ss >> 20); /* bits 26:20 of size_ss */ - - /* TODO: parse GFDT bit as well */ - if(state->drv->gen_ver==6) - ss->ss5.cache_control = cchint; - write_domain = I915_GEM_DOMAIN_RENDER; read_domain = I915_GEM_DOMAIN_RENDER; - dri_bo_emit_reloc(bo, - read_domain, - write_domain, - offset, - offsetof(gen6_surface_state_t, ss1), - obj_bo); - dri_bo_unmap(bo); + if(state->drv->gen_ver == 6) { + gen6_surface_state_t *ss = (gen6_surface_state_t *) bo->virtual; + const uint32_t size_ss = ((size+0xf) >> 4) - 1; /* ceil(size/16) - 1 */ + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_BUFFER; + ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT; + ss->ss1.base_addr = obj_bo->offset + offset; + ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */ + ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */ + ss->ss3.pitch = 0xf; + ss->ss3.depth = size_ss >> 20; /* bits 26:20 of size_ss */ + ss->ss5.cache_control = cchint; + dri_bo_emit_reloc(bo, + read_domain, + write_domain, + offset, + offsetof(gen6_surface_state_t, ss1), + obj_bo); + } else if (state->drv->gen_ver == 7) { + gen7_surface_state_t *ss = (gen7_surface_state_t *) bo->virtual; + const uint32_t size_ss = size - 1; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_BUFFER; + ss->ss0.surface_format = I965_SURFACEFORMAT_RAW; + ss->ss1.base_addr = obj_bo->offset + offset; + ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */ + ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */ + ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */ + ss->ss5.surface_object_control_state = GEN7_CACHED_IN_LLC; + dri_bo_emit_reloc(bo, + read_domain, + write_domain, + offset, + offsetof(gen7_surface_state_t, ss1), + obj_bo); + } + dri_bo_unmap(bo); assert(index < (int) MAX_SURFACES); state->surface_state_b[index].bo = bo; 
} diff --git a/src/intel/genx_gpgpu.h b/src/intel/genx_gpgpu.h index 21868858..d2636049 100644 --- a/src/intel/genx_gpgpu.h +++ b/src/intel/genx_gpgpu.h @@ -101,7 +101,7 @@ extern void gpgpu_bind_buf(genx_gpgpu_state_t*, /* Configure state, size in 512-bit units */ extern void gpgpu_state_init(genx_gpgpu_state_t*, - uint32_t max_thr, + uint32_t max_threads, uint32_t size_vfe_entry, uint32_t num_vfe_entries, uint32_t size_cs_entry, diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index e44e675f..b4e87351 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -114,12 +114,24 @@ intel_driver_init(intel_driver_t *driver, int dev_fd) assert(res); intel_driver_memman_init(driver); - if (IS_GEN6(driver->device_id)) +#if EMULATE_GEN + driver->gen_ver = EMULATE_GEN; + if (EMULATE_GEN == 7) + driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */ + else if (EMULATE_GEN == 6) + driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */ + else + FATAL ("Unsupported Gen for emulation"); +#else + if (IS_GEN7(driver->device_id)) + driver->gen_ver = 7; + else if (IS_GEN6(driver->device_id)) driver->gen_ver = 6; else if(IS_IGDNG(driver->device_id)) driver->gen_ver = 5; else driver->gen_ver = 4; +#endif /* EMULATE_GEN */ } LOCAL int
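
The cl_run_fulsim() hunk above switches the AubLoad target on EMULATE_GEN: Gen6 builds run the "sbrB0" device model, Gen7 builds run "ivb2", and OCL_FULSIM_DEBUG_MODE=1 appends -debug. A minimal standalone sketch of that command-line selection (the helper name and the hard-coded gen value are illustrative, not driver API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build the AubLoad invocation the way cl_run_fulsim() selects it. */
static void build_fulsim_cmd(int gen, int debug, char *cmd, size_t n)
{
  const char *device = (gen == 7) ? "ivb2" : "sbrB0";
  snprintf(cmd, n, "wine AubLoad.exe dump.aub -device %s%s",
           device, debug ? " -debug" : "");
}

int main(void)
{
  const char *dbg = getenv("OCL_FULSIM_DEBUG_MODE");
  char cmd[128];
  build_fulsim_cmd(7, dbg != NULL && strcmp(dbg, "1") == 0, cmd, sizeof(cmd));
  puts(cmd); /* cl_run_fulsim() passes the equivalent string to system() */
  return 0;
}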
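
cl_command_queue_ND_kernel() derives its per-enqueue numbers from the user sizes: the work-group size is the product of local_wk_sz[], it must be a multiple of 16 (the kernel is dispatched 16 work items per hardware thread, hence thread_n = wrk_grp_sz / 16), and one media object is pushed per work group. A self-contained sketch of that arithmetic with made-up sizes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint32_t work_dim = 2;
  const size_t global_wk_sz[3] = {1024, 256, 1};
  const size_t local_wk_sz[3]  = {  16,  16, 1};
  size_t wrk_grp_sz = local_wk_sz[0];
  size_t wrk_grp_n = 1;

  for (uint32_t i = 1; i < work_dim; ++i)
    wrk_grp_sz *= local_wk_sz[i];
  assert(wrk_grp_sz % 16 == 0);            /* FATAL_IF in the driver */

  for (uint32_t i = 0; i < work_dim; ++i)
    wrk_grp_n *= global_wk_sz[i] / local_wk_sz[i];

  const size_t thread_n = wrk_grp_sz / 16; /* hardware threads per work group */
  printf("%zu work groups, %zu items each, %zu threads each\n",
         wrk_grp_n, wrk_grp_sz, thread_n);
  return 0;
}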
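
The three work_dim paths in the same function share one local-ID buffer per dimension across all work groups: the uint16 arrays are filled once per enqueue (cl_malloc of wrk_grp_sz entries per dimension) and reused, with only header.grp_n[] changing from group to group. A standalone sketch of the 3D fill for a 4x2x2 work group (sizes fixed only to keep the example short):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const size_t local_wk_sz[3] = {4, 2, 2};
  uint16_t ids[3][4 * 2 * 2];  /* one array per dimension, one slot per work item */
  size_t i, j, k, curr = 0;

  for (i = 0; i < local_wk_sz[0]; ++i)
    for (j = 0; j < local_wk_sz[1]; ++j)
      for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
        ids[0][curr] = (uint16_t) i;
        ids[1][curr] = (uint16_t) j;
        ids[2][curr] = (uint16_t) k;
      }

  for (curr = 0; curr < 4 * 2 * 2; ++curr)
    printf("work item %2zu -> local id (%u, %u, %u)\n",
           curr, ids[0][curr], ids[1][curr], ids[2][curr]);
  return 0;
}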
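
In the new Gen7 path of gpgpu_bind_buf(), a buffer is bound as a RAW surface and its length minus one is spread across the width/height/depth fields. The sketch below packs and re-assembles a size using the bit layout stated in the hunk's comments (width = bits 6:0, height = bits 20:7, depth = bits 27:21); note that the hunk's depth expression masks 0xffe00000 and shifts by 20, which does not quite match its own comment, and the sketch follows the comment. Struct and function names are invented for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t width, height, depth; } raw_dims_t;

/* Split (size - 1) into the three surface-state fields. */
static raw_dims_t pack_gen7_raw_buffer(uint32_t size_in_bytes)
{
  const uint32_t size_ss = size_in_bytes - 1;
  raw_dims_t d;
  d.width  =  size_ss        & 0x7f;    /* bits  6:0  */
  d.height = (size_ss >> 7)  & 0x3fff;  /* bits 20:7  */
  d.depth  = (size_ss >> 21) & 0x7f;    /* bits 27:21 */
  return d;
}

int main(void)
{
  const uint32_t size = 1 << 20;  /* a 1 MB buffer */
  const raw_dims_t d = pack_gen7_raw_buffer(size);
  printf("width=%u height=%u depth=%u\n", d.width, d.height, d.depth);
  assert(((d.depth << 21) | (d.height << 7) | d.width) == size - 1);
  return 0;
}

The Gen6 path keeps the older encoding: the surface is typed R32G32B32A32_FLOAT with a 16-byte pitch, so its size_ss counts 16-byte elements (ceil(size/16) - 1) rather than bytes.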