Diffstat (limited to 'src')
 src/cl_command_queue.c     |  74
 src/cl_device_id.c         |  56
 src/cl_gen6_device.h       |  64
 src/cl_gen7_device.h       |  30
 src/cl_gt_device.h         |  77
 src/cl_kernel.c            |   6
 src/intel/cl_device_data.h |  18
 src/intel/genx_defines.h   |   1
 src/intel/genx_gpgpu.c     | 182
 src/intel/genx_gpgpu.h     |   2
 src/intel/intel_driver.c   |  14
11 files changed, 360 insertions, 164 deletions
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index a03526cf..f7a27405 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -131,8 +131,6 @@ cl_command_queue_bind_surface(cl_command_queue queue, for (i = 0; i < k->arg_info_n; ++i) { if (k->arg_info[i].type != OCLRT_ARG_TYPE_BUFFER) continue; - - /* XXX 64 comes from the patch list format. May change */ assert(k->arg_info[i].offset % SURFACE_SZ == 0); index = k->arg_info[i].offset / SURFACE_SZ; mem = (cl_mem) k->args[k->arg_info[i].arg_index]; @@ -362,8 +360,8 @@ error: static char* cl_kernel_create_cst_buffer(cl_kernel k, cl_uint work_dim, - const size_t *global_work_size, - const size_t *local_work_size) + const size_t *global_wk_sz, + const size_t *local_wk_sz) { cl_curbe_patch_info_t *info = NULL; const size_t sz = k->patch.curbe.sz; @@ -376,24 +374,24 @@ cl_kernel_create_cst_buffer(cl_kernel k, /* Global work group size */ key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 4); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size+1, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz+1, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_GLOBAL_WORK_SIZE, 0, 8); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], global_work_size+2, sizeof(uint32_t)); + memcpy(data+info->offsets[0], global_wk_sz+2, sizeof(uint32_t)); /* Local work group size */ key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 0); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 4); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size+1, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz+1, sizeof(uint32_t)); key = cl_curbe_key(DATA_PARAMETER_LOCAL_WORK_SIZE, 0, 8); if ((info = cl_kernel_get_curbe_info(k, key)) != NULL) - memcpy(data+info->offsets[0], local_work_size+2, sizeof(uint32_t)); + memcpy(data+info->offsets[0], local_wk_sz+2, sizeof(uint32_t)); exit: return data; @@ -411,10 +409,17 @@ cl_run_fulsim(void) const char *debug_mode = getenv("OCL_FULSIM_DEBUG_MODE"); if (run_it == NULL || strcmp(run_it, "1")) return; +#if EMULATE_GEN == 6 /* SNB */ if (debug_mode == NULL || strcmp(debug_mode, "1")) system("wine AubLoad.exe dump.aub -device sbrB0"); else system("wine AubLoad.exe dump.aub -device sbrB0 -debug"); +#elif EMULATE_GEN == 7 + if (debug_mode == NULL || strcmp(debug_mode, "1")) + system("wine AubLoad.exe dump.aub -device ivb2"); + else + system("wine AubLoad.exe dump.aub -device ivb2 -debug"); +#endif } #endif /* USE_FULSIM */ @@ -423,8 +428,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue, cl_kernel ker, cl_uint work_dim, const size_t *global_work_offset, - const size_t *global_work_size, - const size_t *local_work_size) + const size_t *global_wk_sz, + const size_t *local_wk_sz) { cl_context ctx = queue->ctx; genx_gpgpu_state_t *gpgpu = queue->gpgpu; @@ -458,13 +463,13 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Total number of elements in the work group */ for (i = 0; i < work_dim; ++i) if 
((&ker->patch.exec_env.required_wgr_sz_x)[i] && - (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_work_size[i]) { + (&ker->patch.exec_env.required_wgr_sz_x)[i] != local_wk_sz[i]) { err = CL_INVALID_WORK_ITEM_SIZE; goto error; } - wrk_grp_sz = local_work_size[0]; + wrk_grp_sz = local_wk_sz[0]; for (i = 1; i < work_dim; ++i) - wrk_grp_sz *= local_work_size[i]; + wrk_grp_sz *= local_wk_sz[i]; FATAL_IF (wrk_grp_sz % 16, "Work group size must be a multiple of 16"); if (wrk_grp_sz > ctx->device->max_work_group_size) { err = CL_INVALID_WORK_ITEM_SIZE; @@ -472,9 +477,9 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } /* Directly from the user defined values */ - header.local_sz[0] = local_work_size[0]; - header.local_sz[1] = local_work_size[1]; - header.local_sz[2] = local_work_size[2]; + header.local_sz[0] = local_wk_sz[0]; + header.local_sz[1] = local_wk_sz[1]; + header.local_sz[2] = local_wk_sz[2]; offset[0] = header.grp_n[0] = 0; offset[1] = header.grp_n[1] = 0; offset[2] = header.grp_n[2] = 0; @@ -483,7 +488,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* offsets are evenly divided by the local sizes */ if (global_work_offset) for (i = 0; i < work_dim; ++i) - offset[i] = global_work_offset[i]/local_work_size[i]; + offset[i] = global_work_offset[i]/local_wk_sz[i]; /* Compute the local size per wg and the offsets for each local buffer */ cl_kernel_handle_local_memory(ker, &header); @@ -506,20 +511,17 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Fill the constant buffer */ if (cst_sz > 0) { - char *completed_cst = NULL; + char *data = NULL; assert(ker->cst_buffer); - completed_cst = cl_kernel_create_cst_buffer(ker, - work_dim, - global_work_size, - local_work_size); - gpgpu_upload_constants(gpgpu, completed_cst, cst_sz); - cl_free(completed_cst); + data = cl_kernel_create_cst_buffer(ker,work_dim,global_wk_sz,local_wk_sz); + gpgpu_upload_constants(gpgpu, data, cst_sz); + cl_free(data); } wrk_grp_n = 1; for (i = 0; i < work_dim; ++i) { TRY_ALLOC (ids[i], (cl_local_id_t*) cl_malloc(wrk_grp_sz*sizeof(uint16_t))); - grp_end[i] = offset[i] + global_work_size[i] / local_work_size[i]; + grp_end[i] = offset[i] + global_wk_sz[i] / local_wk_sz[i]; wrk_grp_n *= grp_end[i]-offset[i]; } thread_n = wrk_grp_sz / 16; @@ -528,16 +530,16 @@ cl_command_queue_ND_kernel(cl_command_queue queue, /* Start a new batch buffer */ gpgpu_batch_reset(gpgpu, batch_sz); gpgpu_batch_start(gpgpu); - +#if 1 /* Push all media objects. We implement three paths to make it (a bit) faster. * Local IDs are shared from work group to work group. 
We allocate once the * buffers and reuse them */ if (work_dim == 3) { curr = 0; - for (i = 0; i < local_work_size[0]; ++i) - for (j = 0; j < local_work_size[1]; ++j) - for (k = 0; k < local_work_size[2]; ++k, ++curr) { + for (i = 0; i < local_wk_sz[0]; ++i) + for (j = 0; j < local_wk_sz[1]; ++j) + for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { ((uint16_t*) ids[0])[curr] = i; ((uint16_t*) ids[1])[curr] = j; ((uint16_t*) ids[2])[curr] = k; @@ -553,8 +555,8 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } else if (work_dim == 2) { curr = 0; - for (i = 0; i < local_work_size[0]; ++i) - for (j = 0; j < local_work_size[1]; ++j, ++curr) { + for (i = 0; i < local_wk_sz[0]; ++i) + for (j = 0; j < local_wk_sz[1]; ++j, ++curr) { ((uint16_t*) ids[0])[curr] = i; ((uint16_t*) ids[1])[curr] = j; } @@ -567,7 +569,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, } } else { - for (i = 0; i < local_work_size[0]; ++i) + for (i = 0; i < local_wk_sz[0]; ++i) ((uint16_t*) ids[0])[i] = i; for (header.grp_n[0] = offset[0]; header.grp_n[0] < grp_end[0]; ++header.grp_n[0]) { if (ker->patch.exec_env.has_barriers) @@ -576,7 +578,7 @@ cl_command_queue_ND_kernel(cl_command_queue queue, barrierID = (barrierID + 1) % 16; } } - +#endif gpgpu_batch_end(gpgpu, 0); gpgpu_flush(gpgpu); @@ -616,9 +618,9 @@ cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem) #if USE_FULSIM cl_context ctx = queue->ctx; drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo); #endif /* USE_FULSIM */ + queue->fulsim_out = mem; if (queue->fulsim_out != NULL) { cl_mem_delete(queue->fulsim_out); diff --git a/src/cl_device_id.c b/src/cl_device_id.c index a846adfb..25106ae0 100644 --- a/src/cl_device_id.c +++ b/src/cl_device_id.c @@ -29,46 +29,75 @@ #include <stdio.h> #include <string.h> -static struct _cl_device_id intel_gt2_device = { +static struct _cl_device_id intel_snb_gt2_device = { .max_compute_unit = 60, .max_work_item_sizes = {512, 512, 512}, .max_work_group_size = 512, .max_clock_frequency = 1350, - /* Common fields between GT1 and GT2 */ #include "cl_gen6_device.h" }; -static struct _cl_device_id intel_gt1_device = { +static struct _cl_device_id intel_snb_gt1_device = { .max_compute_unit = 24, .max_work_item_sizes = {256, 256, 256}, .max_work_group_size = 256, .max_clock_frequency = 1000, - /* Common fields between GT1 and GT2 */ #include "cl_gen6_device.h" }; +static struct _cl_device_id intel_ivb_gt2_device = { + .max_compute_unit = 128, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, + + #include "cl_gen7_device.h" +}; + +static struct _cl_device_id intel_ivb_gt1_device = { + .max_compute_unit = 64, + .max_work_item_sizes = {512, 512, 512}, + .max_work_group_size = 512, + .max_clock_frequency = 1000, + + #include "cl_gen7_device.h" +}; + LOCAL cl_device_id cl_get_gt_device(void) { cl_device_id ret = NULL; int device_id = cl_intel_get_device_id(); - if (device_id == PCI_CHIP_SANDYBRIDGE_GT1 || + if (device_id == PCI_CHIP_IVYBRIDGE_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_M_GT1 || + device_id == PCI_CHIP_IVYBRIDGE_S_GT1) { + intel_ivb_gt1_device.vendor_id = device_id; + intel_ivb_gt1_device.platform = intel_platform; + ret = &intel_ivb_gt1_device; + } + else if (device_id == PCI_CHIP_IVYBRIDGE_GT2 || + device_id == PCI_CHIP_IVYBRIDGE_M_GT2) { + intel_ivb_gt2_device.vendor_id = device_id; + intel_ivb_gt2_device.platform = intel_platform; + ret = &intel_ivb_gt2_device; + } + else if (device_id == 
PCI_CHIP_SANDYBRIDGE_GT1 || device_id == PCI_CHIP_SANDYBRIDGE_M_GT1 || device_id == PCI_CHIP_SANDYBRIDGE_S_GT) { - intel_gt1_device.vendor_id = device_id; - intel_gt1_device.platform = intel_platform; - ret = &intel_gt1_device; + intel_snb_gt1_device.vendor_id = device_id; + intel_snb_gt1_device.platform = intel_platform; + ret = &intel_snb_gt1_device; } else if (device_id == PCI_CHIP_SANDYBRIDGE_GT2 || device_id == PCI_CHIP_SANDYBRIDGE_M_GT2 || device_id == PCI_CHIP_SANDYBRIDGE_GT2_PLUS || device_id == PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS) { - intel_gt2_device.vendor_id = device_id; - intel_gt2_device.platform = intel_platform; - ret = &intel_gt2_device; + intel_snb_gt2_device.vendor_id = device_id; + intel_snb_gt2_device.platform = intel_platform; + ret = &intel_snb_gt2_device; } return ret; } @@ -130,7 +159,10 @@ cl_get_device_info(cl_device_id device, void * param_value, size_t * param_value_size_ret) { - if (UNLIKELY(device != &intel_gt1_device && device != &intel_gt2_device)) + if (UNLIKELY(device != &intel_snb_gt1_device && + device != &intel_snb_gt2_device && + device != &intel_ivb_gt1_device && + device != &intel_ivb_gt2_device)) return CL_INVALID_DEVICE; if (UNLIKELY(param_value == NULL)) return CL_INVALID_VALUE; diff --git a/src/cl_gen6_device.h b/src/cl_gen6_device.h index 32c01c2f..b09121fd 100644 --- a/src/cl_gen6_device.h +++ b/src/cl_gen6_device.h @@ -17,68 +17,14 @@ * Author: Benjamin Segovia <benjamin.segovia@intel.com> */ -/* Common fields for both GT1 and GT2 devices. Fields which are not shared are - * set in cl_device_id_object.c which basically deals with OpenCL devices +/* Common fields for both SNB devices (either GT1 or GT2) */ -.device_type = CL_DEVICE_TYPE_GPU, -.vendor_id = 0, /* == device_id (set when requested) */ -.max_work_item_dimensions = 3, -.preferred_vector_width_char = 16, -.preferred_vector_width_short = 16, -.preferred_vector_width_int = 16, -.preferred_vector_width_long = 16, -.preferred_vector_width_float = 16, -.preferred_vector_width_double = 0, -.preferred_vector_width_half = 0, -.native_vector_width_char = 16, -.native_vector_width_short = 16, -.native_vector_width_int = 16, -.native_vector_width_long = 16, -.native_vector_width_float = 16, -.native_vector_width_double = 16, -.native_vector_width_half = 16, -.address_bits = 32, -.max_mem_alloc_size = 128 * 1024 * 1024, -.image_support = CL_FALSE, -.max_read_image_args = 0, -.max_write_image_args = 0, -.image2d_max_width = 0, -.image2d_max_height = 0, -.image3d_max_width = 0, -.image3d_max_height = 0, -.image3d_max_depth = 0, -.max_samplers = 0, -.max_parameter_size = 256, /* Gen6 */ -.mem_base_addr_align = sizeof(cl_uint) * 8, -.min_data_type_align_size = sizeof(cl_uint), -.single_fp_config = 0, /* XXX */ -.global_mem_cache_type = CL_READ_WRITE_CACHE, +.max_parameter_size = 256, .global_mem_cache_line_size = 128, /* XXX */ .global_mem_cache_size = 8 << 10, /* XXX */ -.global_mem_size = 4, -.max_constant_buffer_size = 64 << 10, -.max_constant_args = 8, -.local_mem_type = CL_GLOBAL, /* Gen6 */ -.local_mem_size = 16 << 10, /* Gen6 */ -.error_correction_support = CL_FALSE, -.host_unified_memory = CL_FALSE, -.profiling_timer_resolution = 80, /* ns */ -.endian_little = CL_TRUE, -.available = CL_TRUE, -.compiler_available = CL_FALSE, /* XXX */ -.execution_capabilities = CL_EXEC_KERNEL, -.queue_properties = CL_QUEUE_PROFILING_ENABLE, -.platform = NULL, /* == intel_platform (set when requested) */ +.local_mem_type = CL_GLOBAL, +.local_mem_size = 16 << 10, .gfx_id = IGFX_GEN6_CORE, -#define 
DECL_INFO_STRING(FIELD, STRING) \ - .FIELD = STRING, \ - .JOIN(FIELD,_sz) = sizeof(STRING) + 1, -DECL_INFO_STRING(name, "Intel HD Graphics Family") -DECL_INFO_STRING(vendor, "Intel") -DECL_INFO_STRING(version, "OpenCL 1.10") -DECL_INFO_STRING(profile, "FULL_PROFILE") -DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10") -DECL_INFO_STRING(extensions, "") -#undef DECL_INFO_STRING +#include "cl_gt_device.h" diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h new file mode 100644 index 00000000..75c4e3f0 --- /dev/null +++ b/src/cl_gen7_device.h @@ -0,0 +1,30 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. + * + * Author: Benjamin Segovia <benjamin.segovia@intel.com> + */ + +/* Common fields for both SNB devices (either GT1 or GT2) + */ +.max_parameter_size = 256, +.global_mem_cache_line_size = 128, /* XXX */ +.global_mem_cache_size = 8 << 10, /* XXX */ +.local_mem_type = CL_GLOBAL, +.local_mem_size = 64 << 10, +.gfx_id = IGFX_GEN7_CORE, + +#include "cl_gt_device.h" + diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h new file mode 100644 index 00000000..d66d6ead --- /dev/null +++ b/src/cl_gt_device.h @@ -0,0 +1,77 @@ +/* + * Copyright © 2012 Intel Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Author: Benjamin Segovia <benjamin.segovia@intel.com> + */ + +/* Common fields for both all GT devices (IVB / SNB) */ +.device_type = CL_DEVICE_TYPE_GPU, +.vendor_id = 0, /* == device_id (set when requested) */ +.max_work_item_dimensions = 3, +.preferred_vector_width_char = 16, +.preferred_vector_width_short = 16, +.preferred_vector_width_int = 16, +.preferred_vector_width_long = 16, +.preferred_vector_width_float = 16, +.preferred_vector_width_double = 0, +.preferred_vector_width_half = 0, +.native_vector_width_char = 16, +.native_vector_width_short = 16, +.native_vector_width_int = 16, +.native_vector_width_long = 16, +.native_vector_width_float = 16, +.native_vector_width_double = 16, +.native_vector_width_half = 16, +.address_bits = 32, +.max_mem_alloc_size = 128 * 1024 * 1024, +.image_support = CL_FALSE, +.max_read_image_args = 0, +.max_write_image_args = 0, +.image2d_max_width = 0, +.image2d_max_height = 0, +.image3d_max_width = 0, +.image3d_max_height = 0, +.image3d_max_depth = 0, +.max_samplers = 0, +.mem_base_addr_align = sizeof(cl_uint) * 8, +.min_data_type_align_size = sizeof(cl_uint), +.single_fp_config = 0, /* XXX */ +.global_mem_cache_type = CL_READ_WRITE_CACHE, +.global_mem_size = 4, +.max_constant_buffer_size = 64 << 10, +.max_constant_args = 8, +.error_correction_support = CL_FALSE, +.host_unified_memory = CL_FALSE, +.profiling_timer_resolution = 80, /* ns */ +.endian_little = CL_TRUE, +.available = CL_TRUE, +.compiler_available = CL_FALSE, /* XXX */ +.execution_capabilities = CL_EXEC_KERNEL, +.queue_properties = CL_QUEUE_PROFILING_ENABLE, +.platform = NULL, /* == intel_platform (set when requested) */ + +#define DECL_INFO_STRING(FIELD, STRING) \ + .FIELD = STRING, \ + .JOIN(FIELD,_sz) = sizeof(STRING) + 1, +DECL_INFO_STRING(name, "Intel HD Graphics Family") +DECL_INFO_STRING(vendor, "Intel") +DECL_INFO_STRING(version, "OpenCL 1.10") +DECL_INFO_STRING(profile, "FULL_PROFILE") +DECL_INFO_STRING(opencl_c_version, "OpenCL 1.10") +DECL_INFO_STRING(extensions, "") +#undef DECL_INFO_STRING + + diff --git a/src/cl_kernel.c b/src/cl_kernel.c index 5c07b9bd..20c0f427 100644 --- a/src/cl_kernel.c +++ b/src/cl_kernel.c @@ -347,6 +347,10 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) ASSOC_ITEM (EXECUTION_ENVIRONMENT, exec_env, exec_env); ASSOC_ITEM (THREAD_PAYLOAD, thread_payload, thread_payload); + case PATCH_TOKEN_DATA_PARAMETER_STREAM: + info->curbe.sz = *(uint32_t *) patch; + info->curbe.offset = 0; + break; case PATCH_TOKEN_CONSTANT_MEMORY_KERNEL_ARGUMENT: case PATCH_TOKEN_GLOBAL_MEMORY_KERNEL_ARGUMENT: { @@ -382,6 +386,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_ARGUMENT_SIZES: case DATA_PARAMETER_LOCAL_WORK_SIZE: case DATA_PARAMETER_GLOBAL_WORK_SIZE: + case DATA_PARAMETER_GLOBAL_WORK_OFFSET: case DATA_PARAMETER_NUM_WORK_GROUPS: case DATA_PARAMETER_WORK_DIMENSIONS: case DATA_PARAMETER_IMAGE_WIDTH: @@ -389,6 +394,7 @@ cl_kernel_setup_patch_list(cl_kernel k, const char *patch, size_t sz) case DATA_PARAMETER_IMAGE_DEPTH: case DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE: case DATA_PARAMETER_IMAGE_CHANNEL_ORDER: + case DATA_PARAMETER_NUM_HARDWARE_THREADS: { curbe_key = cl_curbe_key(data->type, data->index, data->src_offset); curbe_info = cl_kernel_get_curbe_info_list(k, curbe_key); diff --git a/src/intel/cl_device_data.h b/src/intel/cl_device_data.h index b2acee95..b7faef16 100644 --- a/src/intel/cl_device_data.h +++ b/src/intel/cl_device_data.h @@ -62,6 +62,24 @@ devid == 
PCI_CHIP_SANDYBRIDGE_M_GT2_PLUS || \ devid == PCI_CHIP_SANDYBRIDGE_S_GT) +#define PCI_CHIP_IVYBRIDGE_GT1 0x0152 /* Desktop */ +#define PCI_CHIP_IVYBRIDGE_GT2 0x0162 +#define PCI_CHIP_IVYBRIDGE_M_GT1 0x0156 /* Mobile */ +#define PCI_CHIP_IVYBRIDGE_M_GT2 0x0166 +#define PCI_CHIP_IVYBRIDGE_S_GT1 0x015a /* Server */ + +#define IS_IVB_GT1(devid) \ + (devid == PCI_CHIP_IVYBRIDGE_GT1 || \ + devid == PCI_CHIP_IVYBRIDGE_M_GT1 || \ + devid == PCI_CHIP_IVYBRIDGE_S_GT1) + +#define IS_IVB_GT2(devid) \ + (devid == PCI_CHIP_IVYBRIDGE_GT2 || \ + devid == PCI_CHIP_IVYBRIDGE_M_GT2) + +#define IS_IVYBRIDGE(devid) (IS_IVB_GT1(devid) || IS_IVB_GT2(devid)) +#define IS_GEN7(devid) IS_IVYBRIDGE(devid) + #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ diff --git a/src/intel/genx_defines.h b/src/intel/genx_defines.h index 819fcbd7..af0e3db1 100644 --- a/src/intel/genx_defines.h +++ b/src/intel/genx_defines.h @@ -239,6 +239,7 @@ #define I965_SURFACEFORMAT_R16G16B16_SNORM 0x19D #define I965_SURFACEFORMAT_R16G16B16_SSCALED 0x19E #define I965_SURFACEFORMAT_R16G16B16_USCALED 0x19F +#define I965_SURFACEFORMAT_RAW 0x1FF #define I965_CULLMODE_BOTH 0 #define I965_CULLMODE_NONE 1 diff --git a/src/intel/genx_gpgpu.c b/src/intel/genx_gpgpu.c index 1685b0fd..adf73f9a 100644 --- a/src/intel/genx_gpgpu.c +++ b/src/intel/genx_gpgpu.c @@ -107,8 +107,77 @@ typedef struct gen6_surface_state uint32_t vertical_alignment:1; uint32_t x_offset:7; } ss5; + + uint32_t ss6; /* unused */ + uint32_t ss7; /* unused */ } gen6_surface_state_t; +typedef struct gen7_surface_state +{ + struct { + uint32_t cube_pos_z:1; + uint32_t cube_neg_z:1; + uint32_t cube_pos_y:1; + uint32_t cube_neg_y:1; + uint32_t cube_pos_x:1; + uint32_t cube_neg_x:1; + uint32_t media_boundary_pixel_mode:2; + uint32_t render_cache_rw_mode:1; + uint32_t pad1:1; + uint32_t surface_array_spacing:1; + uint32_t vertical_line_stride_offset:1; + uint32_t vertical_line_stride:1; + uint32_t tile_walk:1; + uint32_t tiled_surface:1; + uint32_t horizontal_alignment:1; + uint32_t vertical_alignment:2; + uint32_t surface_format:9; + uint32_t pad0:1; + uint32_t surface_array:1; + uint32_t surface_type:3; + } ss0; + + struct { + uint32_t base_addr; + } ss1; + + struct { + uint32_t width:14; + uint32_t pad1:2; + uint32_t height:14; + uint32_t pad0:2; + } ss2; + + struct { + uint32_t pitch:18; + uint32_t pad0:3; + uint32_t depth:11; + } ss3; + + uint32_t ss4; + + struct { + uint32_t mip_count:4; + uint32_t surface_min_load:4; + uint32_t pad2:6; + uint32_t coherence_type:1; + uint32_t stateless_force_write_thru:1; + uint32_t surface_object_control_state:4; + uint32_t y_offset:4; + uint32_t pad0:1; + uint32_t x_offset:7; + } ss5; + + uint32_t ss6; /* unused */ + uint32_t ss7; /* unused */ + +} gen7_surface_state_t; + +#define GEN7_CACHED_IN_LLC 3 + +STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t)); +static const size_t surface_state_sz = sizeof(gen6_surface_state_t); + typedef struct gen6_vfe_state_inline { struct { @@ -343,13 +412,16 @@ gpgpu_set_base_address(genx_gpgpu_state_t *state) OUT_BATCH(state->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ /* If we output an AUB file, we limit the total size to 64MB */ #if USE_FULSIM - OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound - Ignore Check */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper 
Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */ + OUT_BATCH(state->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */ #else OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); #endif - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound - Ignore Check */ - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound - Ignore Check */ - OUT_BATCH(state->batch, 0 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound - Ignore Check */ ADVANCE_BATCH(state->batch); } @@ -383,11 +455,10 @@ gpgpu_load_constant_buffer(genx_gpgpu_state_t *state) BEGIN_BATCH(state->batch, 4); OUT_BATCH(state->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ OUT_BATCH(state->batch, 0); /* mbz */ - OUT_BATCH(state->batch, state->urb.size_cs_entry* - state->urb.num_cs_entries*32); - OUT_RELOC(state->batch, state->curbe_b.bo, - I915_GEM_DOMAIN_INSTRUCTION, 0, - 0); + OUT_BATCH(state->batch, + state->urb.size_cs_entry* + state->urb.num_cs_entries*32); + OUT_RELOC(state->batch, state->curbe_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(state->batch); } @@ -398,9 +469,7 @@ gpgpu_load_idrt(genx_gpgpu_state_t *state) OUT_BATCH(state->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ OUT_BATCH(state->batch, 0); /* mbz */ OUT_BATCH(state->batch, state->idrt_b.num*32); - OUT_RELOC(state->batch, state->idrt_b.bo, - I915_GEM_DOMAIN_INSTRUCTION, 0, - 0); + OUT_RELOC(state->batch, state->idrt_b.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 0); ADVANCE_BATCH(state->batch); } @@ -469,7 +538,7 @@ gpgpu_flush(genx_gpgpu_state_t *state) LOCAL void gpgpu_state_init(genx_gpgpu_state_t *state, - uint32_t max_thr, + uint32_t max_threads, uint32_t size_vfe_entry, uint32_t num_vfe_entries, uint32_t size_cs_entry, @@ -478,8 +547,6 @@ gpgpu_state_init(genx_gpgpu_state_t *state, dri_bo *bo; int32_t i; - assert(max_thr > 0 && max_thr < MAX_THREADS); - /* URB */ state->urb.vfe_start = 0; state->urb.num_vfe_entries = num_vfe_entries; @@ -487,6 +554,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state, state->urb.num_cs_entries = num_cs_entries; state->urb.size_cs_entry = size_cs_entry; state->urb.cs_start = state->urb.vfe_start + state->urb.num_vfe_entries * state->urb.size_vfe_entry; + state->max_threads = max_threads; /* constant buffer */ if(state->curbe_b.bo) @@ -494,7 +562,7 @@ gpgpu_state_init(genx_gpgpu_state_t *state, uint32_t size_cb = state->urb.num_cs_entries * state->urb.size_cs_entry * (512/8); size_cb = (size_cb + (4096 - 1)) & (~(4096-1)); /* roundup to 4K */ bo = dri_bo_alloc(state->drv->bufmgr, - "constant buffer", + "CONSTANT_BUFFER", size_cb, 64); assert(bo); @@ -511,13 +579,13 @@ gpgpu_state_init(genx_gpgpu_state_t *state, if(state->binding_table_b.bo) dri_bo_unreference(state->binding_table_b.bo); bo = dri_bo_alloc(state->drv->bufmgr, - "binding table", + "SS_SURF_BIND", MAX_SURFACES * sizeof(uint32_t), 32); assert(bo); state->binding_table_b.bo = bo; - /* interface descriptor remapping table */ + /* IDRT */ if(state->idrt_b.bo) dri_bo_unreference(state->idrt_b.bo); bo = dri_bo_alloc(state->drv->bufmgr, @@ -565,14 +633,11 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state, state->surface_state_b[index].bo = NULL; } - bo = dri_bo_alloc(state->drv->bufmgr, - "surface state", - sizeof(gen6_surface_state_t), - 32); + bo = 
dri_bo_alloc(state->drv->bufmgr, "surface state", surface_state_sz, 32); assert(bo); dri_bo_map(bo, 1); assert(bo->virtual); - ss = (gen6_surface_state_t *)bo->virtual; + ss = (gen6_surface_state_t*) bo->virtual; memset(ss, 0, sizeof(*ss)); ss->ss0.surface_type = I965_SURFACE_2D; ss->ss0.surface_format = format; @@ -584,9 +649,8 @@ gpgpu_bind_surf_2d(genx_gpgpu_state_t *state, ss->ss3.pitch = (w*4) - 1; /* TEMP patch */ /* TODO: parse GFDT bit as well */ - if(state->drv->gen_ver == 6) { + if(state->drv->gen_ver == 6) ss->ss5.cache_control = cchint; - } if (is_dst) { write_domain = I915_GEM_DOMAIN_RENDER; @@ -704,9 +768,6 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state, uint32_t size, uint32_t cchint) { - uint32_t size_ss = ((size + 0xf) >> 4)-1; /* ceil(size/16) - 1 */ - - gen6_surface_state_t *ss; dri_bo *bo; uint32_t write_domain, read_domain; @@ -718,40 +779,51 @@ gpgpu_bind_buf(genx_gpgpu_state_t *state, state->surface_state_b[index].bo = NULL; } - bo = dri_bo_alloc(state->drv->bufmgr, - "surface state", - sizeof(gen6_surface_state_t), 32); + bo = dri_bo_alloc(state->drv->bufmgr, "SS_SURFACE", surface_state_sz, 32); assert(bo); dri_bo_map(bo, 1); assert(bo->virtual); - ss = (gen6_surface_state_t *)bo->virtual; - memset(ss, 0, sizeof(*ss)); - - ss->ss0.surface_type = I965_SURFACE_BUFFER; - ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT; - ss->ss0.vert_line_stride = 0; - ss->ss0.vert_line_stride_ofs = 0; - ss->ss1.base_addr = obj_bo->offset + offset; - ss->ss2.width = (size_ss & 0x7f); /* bits 6:0 of size_ss */ - ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */ - ss->ss3.pitch = 16-1; - ss->ss3.depth = (size_ss >> 20); /* bits 26:20 of size_ss */ - - /* TODO: parse GFDT bit as well */ - if(state->drv->gen_ver==6) - ss->ss5.cache_control = cchint; - write_domain = I915_GEM_DOMAIN_RENDER; read_domain = I915_GEM_DOMAIN_RENDER; - dri_bo_emit_reloc(bo, - read_domain, - write_domain, - offset, - offsetof(gen6_surface_state_t, ss1), - obj_bo); - dri_bo_unmap(bo); + if(state->drv->gen_ver == 6) { + gen6_surface_state_t *ss = (gen6_surface_state_t *) bo->virtual; + const uint32_t size_ss = ((size+0xf) >> 4) - 1; /* ceil(size/16) - 1 */ + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_BUFFER; + ss->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_FLOAT; + ss->ss1.base_addr = obj_bo->offset + offset; + ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */ + ss->ss2.height = (size_ss >> 7) & 0x1fff; /* bits 19:7 of size_ss */ + ss->ss3.pitch = 0xf; + ss->ss3.depth = size_ss >> 20; /* bits 26:20 of size_ss */ + ss->ss5.cache_control = cchint; + dri_bo_emit_reloc(bo, + read_domain, + write_domain, + offset, + offsetof(gen6_surface_state_t, ss1), + obj_bo); + } else if (state->drv->gen_ver == 7) { + gen7_surface_state_t *ss = (gen7_surface_state_t *) bo->virtual; + const uint32_t size_ss = size - 1; + memset(ss, 0, sizeof(*ss)); + ss->ss0.surface_type = I965_SURFACE_BUFFER; + ss->ss0.surface_format = I965_SURFACEFORMAT_RAW; + ss->ss1.base_addr = obj_bo->offset + offset; + ss->ss2.width = size_ss & 0x7f; /* bits 6:0 of size_ss */ + ss->ss2.height = (size_ss & 0x1fff80) >> 7; /* bits 20:7 of size_ss */ + ss->ss3.depth = (size_ss & 0xffe00000) >> 20; /* bits 27:21 of size_ss */ + ss->ss5.surface_object_control_state = GEN7_CACHED_IN_LLC; + dri_bo_emit_reloc(bo, + read_domain, + write_domain, + offset, + offsetof(gen7_surface_state_t, ss1), + obj_bo); + } + dri_bo_unmap(bo); assert(index < (int) MAX_SURFACES); state->surface_state_b[index].bo = bo; 
} diff --git a/src/intel/genx_gpgpu.h b/src/intel/genx_gpgpu.h index 21868858..d2636049 100644 --- a/src/intel/genx_gpgpu.h +++ b/src/intel/genx_gpgpu.h @@ -101,7 +101,7 @@ extern void gpgpu_bind_buf(genx_gpgpu_state_t*, /* Configure state, size in 512-bit units */ extern void gpgpu_state_init(genx_gpgpu_state_t*, - uint32_t max_thr, + uint32_t max_threads, uint32_t size_vfe_entry, uint32_t num_vfe_entries, uint32_t size_cs_entry, diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index e44e675f..b4e87351 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -114,12 +114,24 @@ intel_driver_init(intel_driver_t *driver, int dev_fd) assert(res); intel_driver_memman_init(driver); - if (IS_GEN6(driver->device_id)) +#if EMULATE_GEN + driver->gen_ver = EMULATE_GEN; + if (EMULATE_GEN == 7) + driver->device_id = PCI_CHIP_IVYBRIDGE_GT2; /* we pick GT2 for IVB */ + else if (EMULATE_GEN == 6) + driver->device_id = PCI_CHIP_SANDYBRIDGE_GT2; /* we pick GT2 for SNB */ + else + FATAL ("Unsupported Gen for emulation"); +#else + if (IS_GEN7(driver->device_id)) + driver->gen_ver = 7; + else if (IS_GEN6(driver->device_id)) driver->gen_ver = 6; else if(IS_IGDNG(driver->device_id)) driver->gen_ver = 5; else driver->gen_ver = 4; +#endif /* EMULATE_GEN */ } LOCAL int
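
The cl_run_fulsim() hunk above switches the AubLoad target on EMULATE_GEN: Gen6 builds run the "sbrB0" device model, Gen7 builds run "ivb2", and OCL_FULSIM_DEBUG_MODE=1 appends -debug. A minimal standalone sketch of that command-line selection (the helper name and the hard-coded gen value are illustrative, not driver API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build the AubLoad invocation the way cl_run_fulsim() selects it. */
static void build_fulsim_cmd(int gen, int debug, char *cmd, size_t n)
{
  const char *device = (gen == 7) ? "ivb2" : "sbrB0";
  snprintf(cmd, n, "wine AubLoad.exe dump.aub -device %s%s",
           device, debug ? " -debug" : "");
}

int main(void)
{
  const char *dbg = getenv("OCL_FULSIM_DEBUG_MODE");
  char cmd[128];
  build_fulsim_cmd(7, dbg != NULL && strcmp(dbg, "1") == 0, cmd, sizeof(cmd));
  puts(cmd); /* cl_run_fulsim() passes the equivalent string to system() */
  return 0;
}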
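
cl_command_queue_ND_kernel() derives its per-enqueue numbers from the user sizes: the work-group size is the product of local_wk_sz[], it must be a multiple of 16 (the kernel is dispatched 16 work items per hardware thread, hence thread_n = wrk_grp_sz / 16), and one media object is pushed per work group. A self-contained sketch of that arithmetic with made-up sizes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const uint32_t work_dim = 2;
  const size_t global_wk_sz[3] = {1024, 256, 1};
  const size_t local_wk_sz[3]  = {  16,  16, 1};
  size_t wrk_grp_sz = local_wk_sz[0];
  size_t wrk_grp_n = 1;

  for (uint32_t i = 1; i < work_dim; ++i)
    wrk_grp_sz *= local_wk_sz[i];
  assert(wrk_grp_sz % 16 == 0);            /* FATAL_IF in the driver */

  for (uint32_t i = 0; i < work_dim; ++i)
    wrk_grp_n *= global_wk_sz[i] / local_wk_sz[i];

  const size_t thread_n = wrk_grp_sz / 16; /* hardware threads per work group */
  printf("%zu work groups, %zu items each, %zu threads each\n",
         wrk_grp_n, wrk_grp_sz, thread_n);
  return 0;
}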
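
The three work_dim paths in the same function share one local-ID buffer per dimension across all work groups: the uint16 arrays are filled once per enqueue (cl_malloc of wrk_grp_sz entries per dimension) and reused, with only header.grp_n[] changing from group to group. A standalone sketch of the 3D fill for a 4x2x2 work group (sizes fixed only to keep the example short):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
  const size_t local_wk_sz[3] = {4, 2, 2};
  uint16_t ids[3][4 * 2 * 2];  /* one array per dimension, one slot per work item */
  size_t i, j, k, curr = 0;

  for (i = 0; i < local_wk_sz[0]; ++i)
    for (j = 0; j < local_wk_sz[1]; ++j)
      for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
        ids[0][curr] = (uint16_t) i;
        ids[1][curr] = (uint16_t) j;
        ids[2][curr] = (uint16_t) k;
      }

  for (curr = 0; curr < 4 * 2 * 2; ++curr)
    printf("work item %2zu -> local id (%u, %u, %u)\n",
           curr, ids[0][curr], ids[1][curr], ids[2][curr]);
  return 0;
}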
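
In the new Gen7 path of gpgpu_bind_buf(), a buffer is bound as a RAW surface and its length minus one is spread across the width/height/depth fields. The sketch below packs and re-assembles a size using the bit layout stated in the hunk's comments (width = bits 6:0, height = bits 20:7, depth = bits 27:21); note that the hunk's depth expression masks 0xffe00000 and shifts by 20, which does not quite match its own comment, and the sketch follows the comment. Struct and function names are invented for the example.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t width, height, depth; } raw_dims_t;

/* Split (size - 1) into the three surface-state fields. */
static raw_dims_t pack_gen7_raw_buffer(uint32_t size_in_bytes)
{
  const uint32_t size_ss = size_in_bytes - 1;
  raw_dims_t d;
  d.width  =  size_ss        & 0x7f;    /* bits  6:0  */
  d.height = (size_ss >> 7)  & 0x3fff;  /* bits 20:7  */
  d.depth  = (size_ss >> 21) & 0x7f;    /* bits 27:21 */
  return d;
}

int main(void)
{
  const uint32_t size = 1 << 20;  /* a 1 MB buffer */
  const raw_dims_t d = pack_gen7_raw_buffer(size);
  printf("width=%u height=%u depth=%u\n", d.width, d.height, d.depth);
  assert(((d.depth << 21) | (d.height << 7) | d.width) == size - 1);
  return 0;
}

The Gen6 path keeps the older encoding: the surface is typed R32G32B32A32_FLOAT with a 16-byte pitch, so its size_ss counts 16-byte elements (ceil(size/16) - 1) rather than bytes.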