summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Denis Steckelmacher <steckdenis@yahoo.fr> 2011-07-27 13:20:11 +0200
committer: Denis Steckelmacher <steckdenis@yahoo.fr> 2011-07-27 13:20:11 +0200
commit: 82d497c0aa6b8fb439f709eecbbc99d1e515579f (patch)
tree: 08d67dc89fad3b971189755b06ca9f98977b28aa
parent: a038e07d3998624e0b9679c837fa008187d37e3d (diff)
Replace small malloc'ed vectors of three size_t with static allocation.
It provides a huge speed boost: the test kernel now takes 1.72s on my machine instead of 2.66s (1.54x speed boost).
-rw-r--r-- src/core/config.h.cmake | 2
-rw-r--r-- src/core/cpu/device.cpp | 16
-rw-r--r-- src/core/cpu/kernel.cpp | 30
-rw-r--r-- src/core/cpu/kernel.h | 11
-rw-r--r-- src/core/events.cpp | 25
-rw-r--r-- src/core/events.h | 7
6 files changed, 30 insertions, 61 deletions
diff --git a/src/core/config.h.cmake b/src/core/config.h.cmake
index 3556820..ccf87b7 100644
--- a/src/core/config.h.cmake
+++ b/src/core/config.h.cmake
@@ -4,4 +4,6 @@
#define LLVM_VERSION "@LLVM_VERSION@"
#define COAL_VERSION "@Coal_VERSION@"
+#define MAX_WORK_DIMS 3
+
#endif
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp
index eb18ec3..97c6973 100644
--- a/src/core/cpu/device.cpp
+++ b/src/core/cpu/device.cpp
@@ -265,7 +265,7 @@ cl_int CPUDevice::info(cl_device_info param_name,
cl_device_exec_capabilities cl_device_exec_capabilities_var;
cl_command_queue_properties cl_command_queue_properties_var;
cl_platform_id cl_platform_id_var;
- size_t three_size_t[3];
+ size_t work_dims[MAX_WORK_DIMS];
};
switch (param_name)
@@ -283,8 +283,7 @@ cl_int CPUDevice::info(cl_device_info param_name,
break;
case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
- // Spec minimum
- SIMPLE_ASSIGN(cl_uint, 3);
+ SIMPLE_ASSIGN(cl_uint, MAX_WORK_DIMS);
break;
case CL_DEVICE_MAX_WORK_GROUP_SIZE:
@@ -292,11 +291,12 @@ cl_int CPUDevice::info(cl_device_info param_name,
break;
case CL_DEVICE_MAX_WORK_ITEM_SIZES:
- three_size_t[0] = TYPE_MAX(size_t);
- three_size_t[1] = TYPE_MAX(size_t);
- three_size_t[2] = TYPE_MAX(size_t);
- value_length = 3 * sizeof(size_t);
- value = &three_size_t;
+ for (int i=0; i<MAX_WORK_DIMS; ++i)
+ {
+ work_dims[i] = TYPE_MAX(size_t);
+ }
+ value_length = MAX_WORK_DIMS * sizeof(size_t);
+ value = &work_dims;
break;
case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 0962929..3c7c11a 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -330,20 +330,13 @@ llvm::Function *CPUKernel::callFunction(std::vector<void *> &freeLocal)
* CPUKernelEvent
*/
CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
-: p_device(device), p_event(event), p_current_work_group(0),
- p_max_work_groups(0), p_current_wg(0), p_finished_wg(0)
+: p_device(device), p_event(event), p_current_wg(0), p_finished_wg(0)
{
// Mutex
pthread_mutex_init(&p_mutex, 0);
- // Tables
- p_table_sizes = event->work_dim() * sizeof(size_t);
-
- p_current_work_group = (size_t *)std::malloc(p_table_sizes);
- p_max_work_groups = (size_t *)std::malloc(p_table_sizes);
-
// Set current work group to (0, 0, ..., 0)
- std::memset(p_current_work_group, 0, p_table_sizes);
+ std::memset(p_current_work_group, 0, event->work_dim() * sizeof(size_t));
// Populate p_max_work_groups
p_num_wg = 1;
@@ -360,9 +353,6 @@ CPUKernelEvent::CPUKernelEvent(CPUDevice *device, KernelEvent *event)
CPUKernelEvent::~CPUKernelEvent()
{
pthread_mutex_destroy(&p_mutex);
-
- std::free(p_current_work_group);
- std::free(p_max_work_groups);
}
bool CPUKernelEvent::reserve()
@@ -421,18 +411,11 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance()
CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
CPUKernelEvent *cpu_event,
const size_t *work_group_index)
-: p_kernel(kernel), p_event(event), p_index(0), p_current(0), p_maxs(0),
- p_cpu_event(cpu_event), p_global_id(0)
+: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event)
{
- p_table_sizes = event->work_dim() * sizeof(size_t);
-
- p_index = (size_t *)std::malloc(p_table_sizes);
- p_current = (size_t *)std::malloc(p_table_sizes);
- p_maxs = (size_t *)std::malloc(p_table_sizes);
- p_global_id = (size_t *)std::malloc(p_table_sizes);
// Set index
- std::memcpy(p_index, work_group_index, p_table_sizes);
+ std::memcpy(p_index, work_group_index, event->work_dim() * sizeof(size_t));
// Set maxs and global id
for (unsigned int i=0; i<event->work_dim(); ++i)
@@ -447,11 +430,6 @@ CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
CPUKernelWorkGroup::~CPUKernelWorkGroup()
{
- std::free(p_index);
- std::free(p_current);
- std::free(p_maxs);
- std::free(p_global_id);
-
p_cpu_event->workGroupFinished();
}
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index b20d2e5..7de36b1 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -2,6 +2,7 @@
#define __CPU_KERNEL_H__
#include "../deviceinterface.h"
+#include "config.h"
#include <llvm/ExecutionEngine/GenericValue.h>
#include <vector>
@@ -74,8 +75,10 @@ class CPUKernelWorkGroup
CPUKernel *p_kernel;
CPUKernelEvent *p_cpu_event;
KernelEvent *p_event;
- size_t *p_index, *p_current, *p_maxs, *p_global_id;
- size_t p_table_sizes;
+ size_t p_index[MAX_WORK_DIMS],
+ p_current[MAX_WORK_DIMS],
+ p_maxs[MAX_WORK_DIMS],
+ p_global_id[MAX_WORK_DIMS];
};
class CPUKernelEvent
@@ -93,8 +96,8 @@ class CPUKernelEvent
private:
CPUDevice *p_device;
KernelEvent *p_event;
- size_t *p_current_work_group, *p_max_work_groups;
- size_t p_table_sizes;
+ size_t p_current_work_group[MAX_WORK_DIMS],
+ p_max_work_groups[MAX_WORK_DIMS];
size_t p_current_wg, p_finished_wg, p_num_wg;
pthread_mutex_t p_mutex;
};
diff --git a/src/core/events.cpp b/src/core/events.cpp
index a591128..e7d8650 100644
--- a/src/core/events.cpp
+++ b/src/core/events.cpp
@@ -369,8 +369,7 @@ KernelEvent::KernelEvent(CommandQueue *parent,
const Event **event_wait_list,
cl_int *errcode_ret)
: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
- p_kernel(kernel), p_work_dim(work_dim), p_global_work_offset(0),
- p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0)
+ p_kernel(kernel), p_work_dim(work_dim)
{
*errcode_ret = CL_SUCCESS;
@@ -385,7 +384,7 @@ KernelEvent::KernelEvent(CommandQueue *parent,
DeviceInterface *device;
Context *k_ctx, *q_ctx;
size_t max_work_group_size;
- cl_uint max_dims;
+ cl_uint max_dims = 0;
*errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *),
&device, 0);
@@ -399,14 +398,12 @@ KernelEvent::KernelEvent(CommandQueue *parent,
&max_work_group_size, 0);
*errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(size_t),
&max_dims, 0);
+ *errcode_ret |= device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES,
+ max_dims * sizeof(size_t), p_max_work_item_sizes, 0);
if (*errcode_ret != CL_SUCCESS)
return;
- p_max_work_item_sizes = (size_t *)std::malloc(max_dims * sizeof(size_t));
- *errcode_ret = device->info(CL_DEVICE_MAX_WORK_ITEM_SIZES,
- max_dims * sizeof(size_t), p_max_work_item_sizes, 0);
-
p_dev_kernel = kernel->deviceDependentKernel(device);
if (!p_dev_kernel)
@@ -437,10 +434,6 @@ KernelEvent::KernelEvent(CommandQueue *parent,
}
// Populate work_offset, work_size and local_work_size
- p_global_work_offset = (size_t *)std::malloc(work_dim * sizeof(size_t));
- p_global_work_size = (size_t *)std::malloc(work_dim * sizeof(size_t));
- p_local_work_size = (size_t *)std::malloc(work_dim * sizeof(size_t));
-
size_t work_group_size = 1;
for (int i=0; i<work_dim; ++i)
@@ -576,17 +569,7 @@ KernelEvent::KernelEvent(CommandQueue *parent,
KernelEvent::~KernelEvent()
{
- if (p_global_work_offset)
- std::free(p_global_work_offset);
-
- if (p_global_work_size)
- std::free(p_global_work_size);
-
- if (p_local_work_size)
- std::free(p_local_work_size);
- if (p_max_work_item_sizes)
- std::free(p_max_work_item_sizes);
}
cl_uint KernelEvent::work_dim() const
diff --git a/src/core/events.h b/src/core/events.h
index d16e26b..8c4f985 100644
--- a/src/core/events.h
+++ b/src/core/events.h
@@ -2,6 +2,7 @@
#define __EVENTS_H__
#include "commandqueue.h"
+#include "config.h"
#include <vector>
@@ -244,8 +245,10 @@ class KernelEvent : public Event
private:
cl_uint p_work_dim;
- size_t *p_global_work_offset, *p_global_work_size, *p_local_work_size,
- *p_max_work_item_sizes;
+ size_t p_global_work_offset[MAX_WORK_DIMS],
+ p_global_work_size[MAX_WORK_DIMS],
+ p_local_work_size[MAX_WORK_DIMS],
+ p_max_work_item_sizes[MAX_WORK_DIMS];
Kernel *p_kernel;
DeviceKernel *p_dev_kernel;
};