diff options
author | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-27 13:20:11 +0200 |
---|---|---|
committer | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-27 13:20:11 +0200 |
commit | 82d497c0aa6b8fb439f709eecbbc99d1e515579f (patch) | |
tree | 08d67dc89fad3b971189755b06ca9f98977b28aa | |
parent | a038e07d3998624e0b9679c837fa008187d37e3d (diff) |
Replace small malloc'ed vectors of 3 size_t with static allocation.
This provides a huge speed boost: the test kernel now takes 1.72s on my
machine instead of 2.66s (a 1.54x speed boost).
-rw-r--r-- | src/core/config.h.cmake | 2 | ||||
-rw-r--r-- | src/core/cpu/device.cpp | 16 | ||||
-rw-r--r-- | src/core/cpu/kernel.cpp | 30 | ||||
-rw-r--r-- | src/core/cpu/kernel.h | 11 | ||||
-rw-r--r-- | src/core/events.cpp | 25 | ||||
-rw-r--r-- | src/core/events.h | 7 |
6 files changed, 30 insertions, 61 deletions
diff --git a/src/core/config.h.cmake b/src/core/config.h.cmake index 3556820..ccf87b7 100644 --- a/src/core/config.h.cmake +++ b/src/core/config.h.cmake @@ -4,4 +4,6 @@ #define LLVM_VERSION "@LLVM_VERSION@" #define COAL_VERSION "@Coal_VERSION@" +#define MAX_WORK_DIMS 3 + #endif diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp index eb18ec3..97c6973 100644 --- a/src/core/cpu/device.cpp +++ b/src/core/cpu/device.cpp @@ -265,7 +265,7 @@ cl_int CPUDevice::info(cl_device_info param_name, cl_device_exec_capabilities cl_device_exec_capabilities_var; cl_command_queue_properties cl_command_queue_properties_var; cl_platform_id cl_platform_id_var; - size_t three_size_t[3]; + size_t work_dims[MAX_WORK_DIMS]; }; switch (param_name) @@ -283,8 +283,7 @@ cl_int CPUDevice::info(cl_device_info param_name, break; case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: - // Spec minimum - SIMPLE_ASSIGN(cl_uint, 3); + SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS); break; case CL_DEVICE_MAX_WORK_GROUP_SIZE: @@ -292,11 +291,12 @@ cl_int CPUDevice::info(cl_device_info param_name, break; case CL_DEVICE_MAX_WORK_ITEM_SIZES: - three_size_t[0] = TYPE_MAX(size_t); - three_size_t[1] = TYPE_MAX(size_t); - three_size_t[2] = TYPE_MAX(size_t); - value_length = 3 * sizeof(size_t); - value = &three_size_t; + for (int i=0; i<MAX_WORK_DIMS; ++i) + { + work_dims[i] = TYPE_MAX(size_t); + } + value_length = MAX_WORK_DIMS * sizeof(size_t); + value = &work_dims; break; case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR: diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp index 0962929..3c7c11a 100644 --- a/src/core/cpu/kernel.cpp +++ b/src/core/cpu/kernel.cpp @@ -330,20 +330,13 @@ llvm::Function *CPUKernel::callFunction(std::vector<void *> &freeLocal) * CPUKernelEvent */ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) -: p_device(device), p_event(event), p_current_work_group(0), - p_max_work_groups(0), p_current_wg(0), p_finished_wg(0) +: p_device(device), p_event(event), p_current_wg(0), 
p_finished_wg(0) { // Mutex pthread_mutex_init(&p_mutex, 0); - // Tables - p_table_sizes = event->work_dim() * sizeof(size_t); - - p_current_work_group = (size_t *)std::malloc(p_table_sizes); - p_max_work_groups = (size_t *)std::malloc(p_table_sizes); - // Set current work group to (0, 0, ..., 0) - std::memset(p_current_work_group, 0, p_table_sizes); + std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t)); // Populate p_max_work_groups p_num_wg = 1; @@ -360,9 +353,6 @@ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event) CPUKernelEvent::~CPUKernelEvent() { pthread_mutex_destroy(&p_mutex); - - std::free(p_current_work_group); - std::free(p_max_work_groups); } bool CPUKernelEvent::reserve() @@ -421,18 +411,11 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance() CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelEvent *cpu_event, const size_t *work_group_index) -: p_kernel(kernel), p_event(event), p_index(0), p_current(0), p_maxs(0), - p_cpu_event(cpu_event), p_global_id(0) +: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event) { - p_table_sizes = event->work_dim() * sizeof(size_t); - - p_index = (size_t *)std::malloc(p_table_sizes); - p_current = (size_t *)std::malloc(p_table_sizes); - p_maxs = (size_t *)std::malloc(p_table_sizes); - p_global_id = (size_t *)std::malloc(p_table_sizes); // Set index - std::memcpy(p_index, work_group_index, p_table_sizes); + std::memcpy(p_index, work_group_index, event->work_dim() * sizeof(size_t)); // Set maxs and global id for (unsigned int i=0; i<event->work_dim(); ++i) @@ -447,11 +430,6 @@ CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelWorkGroup::~CPUKernelWorkGroup() { - std::free(p_index); - std::free(p_current); - std::free(p_maxs); - std::free(p_global_id); - p_cpu_event->workGroupFinished(); } diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index b20d2e5..7de36b1 100644 --- a/src/core/cpu/kernel.h +++ 
b/src/core/cpu/kernel.h @@ -2,6 +2,7 @@ #define __CPU_KERNEL_H__ #include "../deviceinterface.h" +#include "config.h" #include <llvm/ExecutionEngine/GenericValue.h> #include <vector> @@ -74,8 +75,10 @@ class CPUKernelWorkGroup CPUKernel *p_kernel; CPUKernelEvent *p_cpu_event; KernelEvent *p_event; - size_t *p_index, *p_current, *p_maxs, *p_global_id; - size_t p_table_sizes; + size_t p_index[MAX_WORK_DIMS], + p_current[MAX_WORK_DIMS], + p_maxs[MAX_WORK_DIMS], + p_global_id[MAX_WORK_DIMS]; }; class CPUKernelEvent @@ -93,8 +96,8 @@ class CPUKernelEvent private: CPUDevice *p_device; KernelEvent *p_event; - size_t *p_current_work_group, *p_max_work_groups; - size_t p_table_sizes; + size_t p_current_work_group[MAX_WORK_DIMS], + p_max_work_groups[MAX_WORK_DIMS]; size_t p_current_wg, p_finished_wg, p_num_wg; pthread_mutex_t p_mutex; }; diff --git a/src/core/events.cpp b/src/core/events.cpp index a591128..e7d8650 100644 --- a/src/core/events.cpp +++ b/src/core/events.cpp @@ -369,8 +369,7 @@ KernelEvent::KernelEvent(CommandQueue *parent, const Event **event_wait_list, cl_int *errcode_ret) : Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret), - p_kernel(kernel), p_work_dim(work_dim), p_global_work_offset(0), - p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0) + p_kernel(kernel), p_work_dim(work_dim) { *errcode_ret = CL_SUCCESS; @@ -385,7 +384,7 @@ KernelEvent::KernelEvent(CommandQueue *parent, DeviceInterface *device; Context *k_ctx, *q_ctx; size_t max_work_group_size; - cl_uint max_dims; + cl_uint max_dims = 0; *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), &device, 0); @@ -399,14 +398,12 @@ KernelEvent::KernelEvent(CommandQueue *parent, &max_work_group_size, 0); *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(size_t), &max_dims, 0); + *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES, + max_dims * sizeof(size_t), p_max_work_item_sizes, 0); if (*errcode_ret != CL_SUCCESS) 
return; - p_max_work_item_sizes = (size_t *)std::malloc(max_dims * sizeof(size_t)); - *errcode_ret = device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES, - max_dims * sizeof(size_t), p_max_work_item_sizes, 0); - p_dev_kernel = kernel->deviceDependentKernel(device); if (!p_dev_kernel) @@ -437,10 +434,6 @@ KernelEvent::KernelEvent(CommandQueue *parent, } // Populate work_offset, work_size and local_work_size - p_global_work_offset = (size_t *)std::malloc(work_dim * sizeof(size_t)); - p_global_work_size = (size_t *)std::malloc(work_dim * sizeof(size_t)); - p_local_work_size = (size_t *)std::malloc(work_dim * sizeof(size_t)); - size_t work_group_size = 1; for (int i=0; i<work_dim; ++i) @@ -576,17 +569,7 @@ KernelEvent::KernelEvent(CommandQueue *parent, KernelEvent::~KernelEvent() { - if (p_global_work_offset) - std::free(p_global_work_offset); - - if (p_global_work_size) - std::free(p_global_work_size); - - if (p_local_work_size) - std::free(p_local_work_size); - if (p_max_work_item_sizes) - std::free(p_max_work_item_sizes); } cl_uint KernelEvent::work_dim() const diff --git a/src/core/events.h b/src/core/events.h index d16e26b..8c4f985 100644 --- a/src/core/events.h +++ b/src/core/events.h @@ -2,6 +2,7 @@ #define __EVENTS_H__ #include "commandqueue.h" +#include "config.h" #include <vector> @@ -244,8 +245,10 @@ class KernelEvent : public Event private: cl_uint p_work_dim; - size_t *p_global_work_offset, *p_global_work_size, *p_local_work_size, - *p_max_work_item_sizes; + size_t p_global_work_offset[MAX_WORK_DIMS], + p_global_work_size[MAX_WORK_DIMS], + p_local_work_size[MAX_WORK_DIMS], + p_max_work_item_sizes[MAX_WORK_DIMS]; Kernel *p_kernel; DeviceKernel *p_dev_kernel; }; |