summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/cl_command_queue.c74
-rw-r--r--src/cl_device_id.c56
-rw-r--r--src/cl_gen6_device.h64
-rw-r--r--src/cl_gen7_device.h30
-rw-r--r--src/cl_gt_device.h77
-rw-r--r--src/cl_kernel.c6
-rw-r--r--src/intel/cl_device_data.h18
-rw-r--r--src/intel/genx_defines.h1
-rw-r--r--src/intel/genx_gpgpu.c182
-rw-r--r--src/intel/genx_gpgpu.h2
-rw-r--r--src/intel/intel_driver.c14
11 files changed, 360 insertions, 164 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index a03526cf..f7a27405 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -131,8 +131,6 @@ cl_command_queue_bind_surface(cl_command_queue queue,
for (i = 0; i < k->arg_info_n; ++i) {
if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER)
continue;
-
- /* XXX 64 comes from the patch list format. May change */
assert(k->arg_info[i].offset % SURFACE_SZ == 0);
index = k->arg_info[i].offset / SURFACE_SZ;
mem = (cl_mem) k->args[k->arg_info[i].arg_index];
@@ -362,8 +360,8 @@ error:
static char*
cl_kernel_create_cst_buffer(cl_kernel k,
cl_uint work_dim,
- const size_t *global_work_size,
- const size_t *local_work_size)
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
{
cl_curbe_patch_info_t *info = NULL;
const size_t sz = k->patch.curbe.sz;
@@ -376,24 +374,24 @@ cl_kernel_create_cst_buffer(cl_kernel k,
/* Global work group size */
key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_work_size, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t));
key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_work_size+1, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t));
key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], global_work_size+2, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t));
/* Local work group size */
key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_work_size, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t));
key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_work_size+1, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t));
key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8);
if ((info = cl_kernel_get_curbe_info(k, key)) != NULL)
- memcpy(data+info->offsets[0], local_work_size+2, sizeof(uint32_t));
+ memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t));
exit:
return data;
@@ -411,10 +409,17 @@ cl_run_fulsim(void)
const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE");
if (run_it == NULL || strcmp(run_it, "1"))
return;
+#if EMULATE_GEN == 6 /* SNB */
if (debug_mode == NULL || strcmp(debug_mode, "1"))
system("wine AubLoad.exe dump.aub -device sbrB0");
else
system("wine AubLoad.exe dump.aub -device sbrB0 -debug");
+#elif EMULATE_GEN == 7
+ if (debug_mode == NULL || strcmp(debug_mode, "1"))
+ system("wine AubLoad.exe dump.aub -device ivb2");
+ else
+ system("wine AubLoad.exe dump.aub -device ivb2 -debug");
+#endif
}
#endif /* USE_FULSIM */
@@ -423,8 +428,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
cl_kernel ker,
cl_uint work_dim,
const size_t *global_work_offset,
- const size_t *global_work_size,
- const size_t *local_work_size)
+ const size_t *global_wk_sz,
+ const size_t *local_wk_sz)
{
cl_context ctx = queue->ctx;
genx_gpgpu_state_t *gpgpu = queue->gpgpu;
@@ -458,13 +463,13 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
/* Total number of elements in the work group */
for (i = 0; i < work_dim; ++i)
if ((&ker->patch.exec_env.required_wgr_sz_x)[i] &&
- (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_work_size[i]) {
+ (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) {
err = CL_INVALID_WORK_ITEM_SIZE;
goto error;
}
- wrk_grp_sz = local_work_size[0];
+ wrk_grp_sz = local_wk_sz[0];
for (i = 1; i < work_dim; ++i)
- wrk_grp_sz *= local_work_size[i];
+ wrk_grp_sz *= local_wk_sz[i];
FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16");
if (wrk_grp_sz > ctx->device->max_work_group_size) {
err = CL_INVALID_WORK_ITEM_SIZE;
@@ -472,9 +477,9 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
}
/* Directly from the user defined values */
- header.local_sz[0] = local_work_size[0];
- header.local_sz[1] = local_work_size[1];
- header.local_sz[2] = local_work_size[2];
+ header.local_sz[0] = local_wk_sz[0];
+ header.local_sz[1] = local_wk_sz[1];
+ header.local_sz[2] = local_wk_sz[2];
offset[0] = header.grp_n[0] = 0;
offset[1] = header.grp_n[1] = 0;
offset[2] = header.grp_n[2] = 0;
@@ -483,7 +488,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
/* offsets are evenly divided by the local sizes */
if (global_work_offset)
for (i = 0; i < work_dim; ++i)
- offset[i] = global_work_offset[i]/local_work_size[i];
+ offset[i] = global_work_offset[i]/local_wk_sz[i];
/* Compute the local size per wg and the offsets for each local buffer */
cl_kernel_handle_local_memory(ker, &header);
@@ -506,20 +511,17 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
/* Fill the constant buffer */
if (cst_sz > 0) {
- char *completed_cst = NULL;
+ char *data = NULL;
assert(ker->cst_buffer);
- completed_cst = cl_kernel_create_cst_buffer(ker,
- work_dim,
- global_work_size,
- local_work_size);
- gpgpu_upload_constants(gpgpu, completed_cst, cst_sz);
- cl_free(completed_cst);
+ data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz);
+ gpgpu_upload_constants(gpgpu, data, cst_sz);
+ cl_free(data);
}
wrk_grp_n = 1;
for (i = 0; i < work_dim; ++i) {
TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t)));
- grp_end[i] = offset[i] + global_work_size[i] / local_work_size[i];
+ grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i];
wrk_grp_n *= grp_end[i]-offset[i];
}
thread_n = wrk_grp_sz / 16;
@@ -528,16 +530,16 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
/* Start a new batch buffer */
gpgpu_batch_reset(gpgpu, batch_sz);
gpgpu_batch_start(gpgpu);
-
+#if 1
/* Push all media objects. We implement three paths to make it (a bit) faster.
* Local IDs are shared from work group to work group. We allocate once the
* buffers and reuse them
*/
if (work_dim == 3) {
curr = 0;
- for (i = 0; i < local_work_size[0]; ++i)
- for (j = 0; j < local_work_size[1]; ++j)
- for (k = 0; k < local_work_size[2]; ++k, ++curr) {
+ for (i = 0; i < local_wk_sz[0]; ++i)
+ for (j = 0; j < local_wk_sz[1]; ++j)
+ for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
((uint16_t*) ids[0])[curr] = i;
((uint16_t*) ids[1])[curr] = j;
((uint16_t*) ids[2])[curr] = k;
@@ -553,8 +555,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
}
else if (work_dim == 2) {
curr = 0;
- for (i = 0; i < local_work_size[0]; ++i)
- for (j = 0; j < local_work_size[1]; ++j, ++curr) {
+ for (i = 0; i < local_wk_sz[0]; ++i)
+ for (j = 0; j < local_wk_sz[1]; ++j, ++curr) {
((uint16_t*) ids[0])[curr] = i;
((uint16_t*) ids[1])[curr] = j;
}
@@ -567,7 +569,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
}
}
else {
- for (i = 0; i < local_work_size[0]; ++i)
+ for (i = 0; i < local_wk_sz[0]; ++i)
((uint16_t*) ids[0])[i] = i;
for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) {
if (ker->patch.exec_env.has_barriers)
@@ -576,7 +578,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue,
barrierID = (barrierID + 1) % 16;
}
}
-
+#endif
gpgpu_batch_end(gpgpu, 0);
gpgpu_flush(gpgpu);
@@ -616,9 +618,9 @@ cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem)
#if USE_FULSIM
cl_context ctx = queue->ctx;
drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
-
drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
#endif /* USE_FULSIM */
+
queue->fulsim_out = mem;
if (queue->fulsim_out != NULL) {
cl_mem_delete(queue->fulsim_out);
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index a846adfb..25106ae0 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -29,46 +29,75 @@
#include <stdio.h>
#include <string.h>
-static struct _cl_device_id intel_gt2_device = {
+static struct _cl_device_id intel_snb_gt2_device = {
.max_compute_unit = 60,
.max_work_item_sizes = {512, 512, 512},
.max_work_group_size = 512,
.max_clock_frequency = 1350,
- /* Common fields between GT1 and GT2 */
#include "cl_gen6_device.h"
};
-static struct _cl_device_id intel_gt1_device = {
+static struct _cl_device_id intel_snb_gt1_device = {
.max_compute_unit = 24,
.max_work_item_sizes = {256, 256, 256},
.max_work_group_size = 256,
.max_clock_frequency = 1000,
- /* Common fields between GT1 and GT2 */
#include "cl_gen6_device.h"
};
+static struct _cl_device_id intel_ivb_gt2_device = {
+ .max_compute_unit = 128,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+
+ #include "cl_gen7_device.h"
+};
+
+static struct _cl_device_id intel_ivb_gt1_device = {
+ .max_compute_unit = 64,
+ .max_work_item_sizes = {512, 512, 512},
+ .max_work_group_size = 512,
+ .max_clock_frequency = 1000,
+
+ #include "cl_gen7_device.h"
+};
+
LOCAL cl_device_id
cl_get_gt_device(void)
{
cl_device_id ret = NULL;
int device_id = cl_intel_get_device_id();
- if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 ||
+ if (device_id == PCI_CHIP_IVYBRIDGE_GT1 ||
+ device_id == PCI_CHIP_IVYBRIDGE_M_GT1 ||
+ device_id == PCI_CHIP_IVYBRIDGE_S_GT1) {
+ intel_ivb_gt1_device.vendor_id = device_id;
+ intel_ivb_gt1_device.platform = intel_platform;
+ ret = &intel_ivb_gt1_device;
+ }
+ else if (device_id == PCI_CHIP_IVYBRIDGE_GT2 ||
+ device_id == PCI_CHIP_IVYBRIDGE_M_GT2) {
+ intel_ivb_gt2_device.vendor_id = device_id;
+ intel_ivb_gt2_device.platform = intel_platform;
+ ret = &intel_ivb_gt2_device;
+ }
+ else if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 ||
device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 ||
device_id == PCI_CHIP_SANDYBRIDGE_S_GT) {
- intel_gt1_device.vendor_id = device_id;
- intel_gt1_device.platform = intel_platform;
- ret = &intel_gt1_device;
+ intel_snb_gt1_device.vendor_id = device_id;
+ intel_snb_gt1_device.platform = intel_platform;
+ ret = &intel_snb_gt1_device;
}
else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2 ||
device_id == PCI_CHIP_SANDYBRIDGE_M_GT2 ||
device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS ||
device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) {
- intel_gt2_device.vendor_id = device_id;
- intel_gt2_device.platform = intel_platform;
- ret = &intel_gt2_device;
+ intel_snb_gt2_device.vendor_id = device_id;
+ intel_snb_gt2_device.platform = intel_platform;
+ ret = &intel_snb_gt2_device;
}
return ret;
}
@@ -130,7 +159,10 @@ cl_get_device_info(cl_device_id device,
void * param_value,
size_t * param_value_size_ret)
{
- if (UNLIKELY(device != &intel_gt1_device && device != &intel_gt2_device))
+ if (UNLIKELY(device != &intel_snb_gt1_device &&
+ device != &intel_snb_gt2_device &&
+ device != &intel_ivb_gt1_device &&
+ device != &intel_ivb_gt2_device))
return CL_INVALID_DEVICE;
if (UNLIKELY(param_value == NULL))
return CL_INVALID_VALUE;
diff --git a/src/cl_gen6_device.h b/src/cl_gen6_device.h
index 32c01c2f..b09121fd 100644
--- a/src/cl_gen6_device.h
+++ b/src/cl_gen6_device.h
@@ -17,68 +17,14 @@
* Author: Benjamin Segovia <benjamin.segovia@intel.com>
*/
-/* Common fields for both GT1 and GT2 devices. Fields which are not shared are
- * set in cl_device_id_object.c which basically deals with OpenCL devices
+/* Common fields for both SNB devices (either GT1 or GT2)
*/
-.device_type = CL_DEVICE_TYPE_GPU,
-.vendor_id = 0, /* == device_id (set when requested) */
-.max_work_item_dimensions = 3,
-.preferred_vector_width_char = 16,
-.preferred_vector_width_short = 16,
-.preferred_vector_width_int = 16,
-.preferred_vector_width_long = 16,
-.preferred_vector_width_float = 16,
-.preferred_vector_width_double = 0,
-.preferred_vector_width_half = 0,
-.native_vector_width_char = 16,
-.native_vector_width_short = 16,
-.native_vector_width_int = 16,
-.native_vector_width_long = 16,
-.native_vector_width_float = 16,
-.native_vector_width_double = 16,
-.native_vector_width_half = 16,
-.address_bits = 32,
-.max_mem_alloc_size = 128 * 1024 * 1024,
-.image_support = CL_FALSE,
-.max_read_image_args = 0,
-.max_write_image_args = 0,
-.image2d_max_width = 0,
-.image2d_max_height = 0,
-.image3d_max_width = 0,
-.image3d_max_height = 0,
-.image3d_max_depth = 0,
-.max_samplers = 0,
-.max_parameter_size = 256, /* Gen6 */
-.mem_base_addr_align = sizeof(cl_uint) * 8,
-.min_data_type_align_size = sizeof(cl_uint),
-.single_fp_config = 0, /* XXX */
-.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.max_parameter_size = 256,
.global_mem_cache_line_size = 128, /* XXX */
.global_mem_cache_size = 8 << 10, /* XXX */
-.global_mem_size = 4,
-.max_constant_buffer_size = 64 << 10,
-.max_constant_args = 8,
-.local_mem_type = CL_GLOBAL, /* Gen6 */
-.local_mem_size = 16 << 10, /* Gen6 */
-.error_correction_support = CL_FALSE,
-.host_unified_memory = CL_FALSE,
-.profiling_timer_resolution = 80, /* ns */
-.endian_little = CL_TRUE,
-.available = CL_TRUE,
-.compiler_available = CL_FALSE, /* XXX */
-.execution_capabilities = CL_EXEC_KERNEL,
-.queue_properties = CL_QUEUE_PROFILING_ENABLE,
-.platform = NULL, /* == intel_platform (set when requested) */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 16 << 10,
.gfx_id = IGFX_GEN6_CORE,
-#define DECL_INFO_STRING(FIELD, STRING) \
- .FIELD = STRING, \
- .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
-DECL_INFO_STRING(name, "Intel HD Graphics Family")
-DECL_INFO_STRING(vendor, "Intel")
-DECL_INFO_STRING(version, "OpenCL 1.10")
-DECL_INFO_STRING(profile, "FULL_PROFILE")
-DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
-DECL_INFO_STRING(extensions, "")
-#undef DECL_INFO_STRING
+#include "cl_gt_device.h"
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
new file mode 100644
index 00000000..75c4e3f0
--- /dev/null
+++ b/src/cl_gen7_device.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+/* Common fields for both IVB devices (either GT1 or GT2)
+ */
+.max_parameter_size = 256,
+.global_mem_cache_line_size = 128, /* XXX */
+.global_mem_cache_size = 8 << 10, /* XXX */
+.local_mem_type = CL_GLOBAL,
+.local_mem_size = 64 << 10,
+.gfx_id = IGFX_GEN7_CORE,
+
+#include "cl_gt_device.h"
+
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
new file mode 100644
index 00000000..d66d6ead
--- /dev/null
+++ b/src/cl_gt_device.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+/* Common fields for all GT devices (IVB / SNB) */
+.device_type = CL_DEVICE_TYPE_GPU,
+.vendor_id = 0, /* == device_id (set when requested) */
+.max_work_item_dimensions = 3,
+.preferred_vector_width_char = 16,
+.preferred_vector_width_short = 16,
+.preferred_vector_width_int = 16,
+.preferred_vector_width_long = 16,
+.preferred_vector_width_float = 16,
+.preferred_vector_width_double = 0,
+.preferred_vector_width_half = 0,
+.native_vector_width_char = 16,
+.native_vector_width_short = 16,
+.native_vector_width_int = 16,
+.native_vector_width_long = 16,
+.native_vector_width_float = 16,
+.native_vector_width_double = 16,
+.native_vector_width_half = 16,
+.address_bits = 32,
+.max_mem_alloc_size = 128 * 1024 * 1024,
+.image_support = CL_FALSE,
+.max_read_image_args = 0,
+.max_write_image_args = 0,
+.image2d_max_width = 0,
+.image2d_max_height = 0,
+.image3d_max_width = 0,
+.image3d_max_height = 0,
+.image3d_max_depth = 0,
+.max_samplers = 0,
+.mem_base_addr_align = sizeof(cl_uint) * 8,
+.min_data_type_align_size = sizeof(cl_uint),
+.single_fp_config = 0, /* XXX */
+.global_mem_cache_type = CL_READ_WRITE_CACHE,
+.global_mem_size = 4,
+.max_constant_buffer_size = 64 << 10,
+.max_constant_args = 8,
+.error_correction_support = CL_FALSE,
+.host_unified_memory = CL_FALSE,
+.profiling_timer_resolution = 80, /* ns */
+.endian_little = CL_TRUE,
+.available = CL_TRUE,
+.compiler_available = CL_FALSE, /* XXX */
+.execution_capabilities = CL_EXEC_KERNEL,
+.queue_properties = CL_QUEUE_PROFILING_ENABLE,
+.platform = NULL, /* == intel_platform (set when requested) */
+
+#define DECL_INFO_STRING(FIELD, STRING) \
+ .FIELD = STRING, \
+ .JOIN(FIELD,_sz) = sizeof(STRING) + 1,
+DECL_INFO_STRING(name, "Intel HD Graphics Family")
+DECL_INFO_STRING(vendor, "Intel")
+DECL_INFO_STRING(version, "OpenCL 1.10")
+DECL_INFO_STRING(profile, "FULL_PROFILE")
+DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10")
+DECL_INFO_STRING(extensions, "")
+#undef DECL_INFO_STRING
+
+
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 5c07b9bd..20c0f427 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -347,6 +347,10 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env);
ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload);
+ case PATCH_TOKEN_DATA_PARAMETER_STREAM:
+ info->curbe.sz = *(uint32_t *) patch;
+ info->curbe.offset = 0;
+ break;
case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT:
case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT:
{
@@ -382,6 +386,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES:
case DATA_PARAMETER_LOCAL_WORK_SIZE:
case DATA_PARAMETER_GLOBAL_WORK_SIZE:
+ case DATA_PARAMETER_GLOBAL_WORK_OFFSET:
case DATA_PARAMETER_NUM_WORK_GROUPS:
case DATA_PARAMETER_WORK_DIMENSIONS:
case DATA_PARAMETER_IMAGE_WIDTH:
@@ -389,6 +394,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz)
case DATA_PARAMETER_IMAGE_DEPTH:
case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE:
case DATA_PARAMETER_IMAGE_CHANNEL_ORDER:
+ case DATA_PARAMETER_NUM_HARDWARE_THREADS:
{
curbe_key = cl_curbe_key(data->type, data->index, data->src_offset);
curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key);
diff --git a/src/intel/cl_device_data.h b/src/intel/cl_device_data.h
index b2acee95..b7faef16 100644
--- a/src/intel/cl_device_data.h
+++ b/src/intel/cl_device_data.h
@@ -62,6 +62,24 @@
devid == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \
devid == PCI_CHIP_SANDYBRIDGE_S_GT)
+#define PCI_CHIP_IVYBRIDGE_GT1 0x0152 /* Desktop */
+#define PCI_CHIP_IVYBRIDGE_GT2 0x0162
+#define PCI_CHIP_IVYBRIDGE_M_GT1 0x0156 /* Mobile */
+#define PCI_CHIP_IVYBRIDGE_M_GT2 0x0166
+#define PCI_CHIP_IVYBRIDGE_S_GT1 0x015a /* Server */
+
+#define IS_IVB_GT1(devid) \
+ (devid == PCI_CHIP_IVYBRIDGE_GT1 || \
+ devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \
+ devid == PCI_CHIP_IVYBRIDGE_S_GT1)
+
+#define IS_IVB_GT2(devid) \
+ (devid == PCI_CHIP_IVYBRIDGE_GT2 || \
+ devid == PCI_CHIP_IVYBRIDGE_M_GT2)
+
+#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid))
+#define IS_GEN7(devid) IS_IVYBRIDGE(devid)
+
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
diff --git a/src/intel/genx_defines.h b/src/intel/genx_defines.h
index 819fcbd7..af0e3db1 100644
--- a/src/intel/genx_defines.h
+++ b/src/intel/genx_defines.h
@@ -239,6 +239,7 @@
#define I965_SURFACEFORMAT_R16G16B16_SNORM 0x19D
#define I965_SURFACEFORMAT_R16G16B16_SSCALED 0x19E
#define I965_SURFACEFORMAT_R16G16B16_USCALED 0x19F
+#define I965_SURFACEFORMAT_RAW 0x1FF
#define I965_CULLMODE_BOTH 0
#define I965_CULLMODE_NONE 1
diff --git a/src/intel/genx_gpgpu.c b/src/intel/genx_gpgpu.c
index 1685b0fd..adf73f9a 100644
--- a/src/intel/genx_gpgpu.c
+++ b/src/intel/genx_gpgpu.c
@@ -107,8 +107,77 @@ typedef struct gen6_surface_state
uint32_t vertical_alignment:1;
uint32_t x_offset:7;
} ss5;
+
+ uint32_t ss6; /* unused */
+ uint32_t ss7; /* unused */
} gen6_surface_state_t;
+typedef struct gen7_surface_state
+{
+ struct {
+ uint32_t cube_pos_z:1;
+ uint32_t cube_neg_z:1;
+ uint32_t cube_pos_y:1;
+ uint32_t cube_neg_y:1;
+ uint32_t cube_pos_x:1;
+ uint32_t cube_neg_x:1;
+ uint32_t media_boundary_pixel_mode:2;
+ uint32_t render_cache_rw_mode:1;
+ uint32_t pad1:1;
+ uint32_t surface_array_spacing:1;
+ uint32_t vertical_line_stride_offset:1;
+ uint32_t vertical_line_stride:1;
+ uint32_t tile_walk:1;
+ uint32_t tiled_surface:1;
+ uint32_t horizontal_alignment:1;
+ uint32_t vertical_alignment:2;
+ uint32_t surface_format:9;
+ uint32_t pad0:1;
+ uint32_t surface_array:1;
+ uint32_t surface_type:3;
+ } ss0;
+
+ struct {
+ uint32_t base_addr;
+ } ss1;
+
+ struct {
+ uint32_t width:14;
+ uint32_t pad1:2;
+ uint32_t height:14;
+ uint32_t pad0:2;
+ } ss2;
+
+ struct {
+ uint32_t pitch:18;
+ uint32_t pad0:3;
+ uint32_t depth:11;
+ } ss3;
+
+ uint32_t ss4;
+
+ struct {
+ uint32_t mip_count:4;
+ uint32_t surface_min_load:4;
+ uint32_t pad2:6;
+ uint32_t coherence_type:1;
+ uint32_t stateless_force_write_thru:1;
+ uint32_t surface_object_control_state:4;
+ uint32_t y_offset:4;
+ uint32_t pad0:1;
+ uint32_t x_offset:7;
+ } ss5;
+
+ uint32_t ss6; /* unused */
+ uint32_t ss7; /* unused */
+
+} gen7_surface_state_t;
+
+#define GEN7_CACHED_IN_LLC 3
+
+STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
+static const size_t surface_state_sz = sizeof(gen6_surface_state_t);
+
typedef struct gen6_vfe_state_inline
{
struct {
@@ -343,13 +412,16 @@ gpgpu_set_base_address(genx_gpgpu_state_t *state)
OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */
/* If we output an AUB file, we limit the total size to 64MB */
#if USE_FULSIM
- OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound - Ignore Check */
+ OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+ OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+ OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+ OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
#else
OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
+ OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY);
#endif
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound - Ignore Check */
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound - Ignore Check */
- OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound - Ignore Check */
ADVANCE_BATCH(state->batch);
}
@@ -383,11 +455,10 @@ gpgpu_load_constant_buffer(genx_gpgpu_state_t *state)
BEGIN_BATCH(state->batch, 4);
OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */
OUT_BATCH(state->batch, 0); /* mbz */
- OUT_BATCH(state->batch, state->urb.size_cs_entry*
- state->urb.num_cs_entries*32);
- OUT_RELOC(state->batch, state->curbe_b.bo,
- I915_GEM_DOMAIN_INSTRUCTION, 0,
- 0);
+ OUT_BATCH(state->batch,
+ state->urb.size_cs_entry*
+ state->urb.num_cs_entries*32);
+ OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
ADVANCE_BATCH(state->batch);
}
@@ -398,9 +469,7 @@ gpgpu_load_idrt(genx_gpgpu_state_t *state)
OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
OUT_BATCH(state->batch, 0); /* mbz */
OUT_BATCH(state->batch, state->idrt_b.num*32);
- OUT_RELOC(state->batch, state->idrt_b.bo,
- I915_GEM_DOMAIN_INSTRUCTION, 0,
- 0);
+ OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0);
ADVANCE_BATCH(state->batch);
}
@@ -469,7 +538,7 @@ gpgpu_flush(genx_gpgpu_state_t *state)
LOCAL void
gpgpu_state_init(genx_gpgpu_state_t *state,
- uint32_t max_thr,
+ uint32_t max_threads,
uint32_t size_vfe_entry,
uint32_t num_vfe_entries,
uint32_t size_cs_entry,
@@ -478,8 +547,6 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
dri_bo *bo;
int32_t i;
- assert(max_thr > 0 && max_thr < MAX_THREADS);
-
/* URB */
state->urb.vfe_start = 0;
state->urb.num_vfe_entries = num_vfe_entries;
@@ -487,6 +554,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
state->urb.num_cs_entries = num_cs_entries;
state->urb.size_cs_entry = size_cs_entry;
state->urb.cs_start = state->urb.vfe_start + state->urb.num_vfe_entries * state->urb.size_vfe_entry;
+ state->max_threads = max_threads;
/* constant buffer */
if(state->curbe_b.bo)
@@ -494,7 +562,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * (512/8);
size_cb = (size_cb + (4096 - 1)) & (~(4096-1)); /* roundup to 4K */
bo = dri_bo_alloc(state->drv->bufmgr,
- "constant buffer",
+ "CONSTANT_BUFFER",
size_cb,
64);
assert(bo);
@@ -511,13 +579,13 @@ gpgpu_state_init(genx_gpgpu_state_t *state,
if(state->binding_table_b.bo)
dri_bo_unreference(state->binding_table_b.bo);
bo = dri_bo_alloc(state->drv->bufmgr,
- "binding table",
+ "SS_SURF_BIND",
MAX_SURFACES * sizeof(uint32_t),
32);
assert(bo);
state->binding_table_b.bo = bo;
- /* interface descriptor remapping table */
+ /* IDRT */
if(state->idrt_b.bo)
dri_bo_unreference(state->idrt_b.bo);
bo = dri_bo_alloc(state->drv->bufmgr,
@@ -565,14 +633,11 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state,
state->surface_state_b[index].bo = NULL;
}
- bo = dri_bo_alloc(state->drv->bufmgr,
- "surface state",
- sizeof(gen6_surface_state_t),
- 32);
+ bo = dri_bo_alloc(state->drv->bufmgr, "surface state", surface_state_sz, 32);
assert(bo);
dri_bo_map(bo, 1);
assert(bo->virtual);
- ss = (gen6_surface_state_t *)bo->virtual;
+ ss = (gen6_surface_state_t*) bo->virtual;
memset(ss, 0, sizeof(*ss));
ss->ss0.surface_type = I965_SURFACE_2D;
ss->ss0.surface_format = format;
@@ -584,9 +649,8 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state,
ss->ss3.pitch = (w*4) - 1; /* TEMP patch */
/* TODO: parse GFDT bit as well */
- if(state->drv->gen_ver == 6) {
+ if(state->drv->gen_ver == 6)
ss->ss5.cache_control = cchint;
- }
if (is_dst) {
write_domain = I915_GEM_DOMAIN_RENDER;
@@ -704,9 +768,6 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state,
uint32_t size,
uint32_t cchint)
{
- uint32_t size_ss = ((size + 0xf) >> 4)-1; /* ceil(size/16) - 1 */
-
- gen6_surface_state_t *ss;
dri_bo *bo;
uint32_t write_domain, read_domain;
@@ -718,40 +779,51 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state,
state->surface_state_b[index].bo = NULL;
}
- bo = dri_bo_alloc(state->drv->bufmgr,
- "surface state",
- sizeof(gen6_surface_state_t), 32);
+ bo = dri_bo_alloc(state->drv->bufmgr, "SS_SURFACE", surface_state_sz, 32);
assert(bo);
dri_bo_map(bo, 1);
assert(bo->virtual);
- ss = (gen6_surface_state_t *)bo->virtual;
- memset(ss, 0, sizeof(*ss));
-
- ss->ss0.surface_type = I965_SURFACE_BUFFER;
- ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
- ss->ss0.vert_line_stride = 0;
- ss->ss0.vert_line_stride_ofs = 0;
- ss->ss1.base_addr = obj_bo->offset + offset;
- ss->ss2.width = (size_ss & 0x7f); /* bits 6:0 of size_ss */
- ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */
- ss->ss3.pitch = 16-1;
- ss->ss3.depth = (size_ss >> 20); /* bits 26:20 of size_ss */
-
- /* TODO: parse GFDT bit as well */
- if(state->drv->gen_ver==6)
- ss->ss5.cache_control = cchint;
-
write_domain = I915_GEM_DOMAIN_RENDER;
read_domain = I915_GEM_DOMAIN_RENDER;
- dri_bo_emit_reloc(bo,
- read_domain,
- write_domain,
- offset,
- offsetof(gen6_surface_state_t, ss1),
- obj_bo);
- dri_bo_unmap(bo);
+ if(state->drv->gen_ver == 6) {
+ gen6_surface_state_t *ss = (gen6_surface_state_t *) bo->virtual;
+ const uint32_t size_ss = ((size+0xf) >> 4) - 1; /* ceil(size/16) - 1 */
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT;
+ ss->ss1.base_addr = obj_bo->offset + offset;
+ ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */
+ ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */
+ ss->ss3.pitch = 0xf;
+ ss->ss3.depth = size_ss >> 20; /* bits 26:20 of size_ss */
+ ss->ss5.cache_control = cchint;
+ dri_bo_emit_reloc(bo,
+ read_domain,
+ write_domain,
+ offset,
+ offsetof(gen6_surface_state_t, ss1),
+ obj_bo);
+ } else if (state->drv->gen_ver == 7) {
+ gen7_surface_state_t *ss = (gen7_surface_state_t *) bo->virtual;
+ const uint32_t size_ss = size - 1;
+ memset(ss, 0, sizeof(*ss));
+ ss->ss0.surface_type = I965_SURFACE_BUFFER;
+ ss->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+ ss->ss1.base_addr = obj_bo->offset + offset;
+ ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */
+ ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */
+ ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */
+ ss->ss5.surface_object_control_state = GEN7_CACHED_IN_LLC;
+ dri_bo_emit_reloc(bo,
+ read_domain,
+ write_domain,
+ offset,
+ offsetof(gen7_surface_state_t, ss1),
+ obj_bo);
+ }
+ dri_bo_unmap(bo);
assert(index < (int) MAX_SURFACES);
state->surface_state_b[index].bo = bo;
}
diff --git a/src/intel/genx_gpgpu.h b/src/intel/genx_gpgpu.h
index 21868858..d2636049 100644
--- a/src/intel/genx_gpgpu.h
+++ b/src/intel/genx_gpgpu.h
@@ -101,7 +101,7 @@ extern void gpgpu_bind_buf(genx_gpgpu_state_t*,
/* Configure state, size in 512-bit units */
extern void gpgpu_state_init(genx_gpgpu_state_t*,
- uint32_t max_thr,
+ uint32_t max_threads,
uint32_t size_vfe_entry,
uint32_t num_vfe_entries,
uint32_t size_cs_entry,
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index e44e675f..b4e87351 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -114,12 +114,24 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
assert(res);
intel_driver_memman_init(driver);
- if (IS_GEN6(driver->device_id))
+#if EMULATE_GEN
+ driver->gen_ver = EMULATE_GEN;
+ if (EMULATE_GEN == 7)
+ driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */
+ else if (EMULATE_GEN == 6)
+ driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */
+ else
+ FATAL ("Unsupported Gen for emulation");
+#else
+ if (IS_GEN7(driver->device_id))
+ driver->gen_ver = 7;
+ else if (IS_GEN6(driver->device_id))
driver->gen_ver = 6;
else if(IS_IGDNG(driver->device_id))
driver->gen_ver = 5;
else
driver->gen_ver = 4;
+#endif /* EMULATE_GEN */
}
LOCAL int