diff options
author | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-27 13:51:15 +0200 |
---|---|---|
committer | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-27 13:51:15 +0200 |
commit | a1f3d501a7a0d73fc7594d5f0c3381744ed867a8 (patch) | |
tree | 9473ab7b45114f5c3b1e112efece840eb15afb0f | |
parent | 82d497c0aa6b8fb439f709eecbbc99d1e515579f (diff) |
Optimizations and code factoring.
-rw-r--r-- | src/core/cpu/device.cpp | 58 | ||||
-rw-r--r-- | src/core/cpu/device.h | 7 | ||||
-rw-r--r-- | src/core/cpu/kernel.cpp | 33 | ||||
-rw-r--r-- | src/core/cpu/kernel.h | 1 | ||||
-rw-r--r-- | src/core/deviceinterface.h | 2 | ||||
-rw-r--r-- | src/core/events.cpp | 70 | ||||
-rw-r--r-- | src/core/events.h | 4 |
7 files changed, 90 insertions, 85 deletions
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp index 97c6973..907cdf0 100644 --- a/src/core/cpu/device.cpp +++ b/src/core/cpu/device.cpp @@ -38,6 +38,30 @@ void CPUDevice::init() pthread_cond_init(&p_events_cond, 0); pthread_mutex_init(&p_events_mutex, 0); + // Get info about the system + p_cores = sysconf(_SC_NPROCESSORS_ONLN); + p_cpu_mhz = 0.0f; + + std::filebuf fb; + fb.open("/proc/cpuinfo", std::ios::in); + std::istream is(&fb); + + while (!is.eof()) + { + std::string key, value; + + std::getline(is, key, ':'); + is.ignore(1); + std::getline(is, value); + + if (key.compare(0, 7, "cpu MHz") == 0) + { + std::istringstream ss(value); + ss >> p_cpu_mhz; + break; + } + } + // Create worker threads p_workers = (pthread_t *)std::malloc(numCPUs() * sizeof(pthread_t)); @@ -198,38 +222,14 @@ Event *CPUDevice::getEvent(bool &stop) return event; } -unsigned int CPUDevice::numCPUs() +unsigned int CPUDevice::numCPUs() const { - if (p_cores) return p_cores; - - return (p_cores = sysconf(_SC_NPROCESSORS_ONLN)); + return p_cores; } -float CPUDevice::cpuMhz() +float CPUDevice::cpuMhz() const { - std::filebuf fb; - fb.open("/proc/cpuinfo", std::ios::in); - std::istream is(&fb); - - float cpuMhz = 0.0; - - while (!is.eof()) - { - std::string key, value; - - std::getline(is, key, ':'); - is.ignore(1); - std::getline(is, value); - - if (key.compare(0, 7, "cpu MHz") == 0) - { - std::istringstream ss(value); - ss >> cpuMhz; - break; - } - } - - return cpuMhz; + return p_cpu_mhz; } // From inner parentheses to outher ones : @@ -248,7 +248,7 @@ float CPUDevice::cpuMhz() cl_int CPUDevice::info(cl_device_info param_name, size_t param_value_size, void *param_value, - size_t *param_value_size_ret) + size_t *param_value_size_ret) const { void *value = 0; size_t value_length = 0; diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h index 7869476..fd38ff7 100644 --- a/src/core/cpu/device.h +++ b/src/core/cpu/device.h @@ -25,7 +25,7 @@ class CPUDevice : public DeviceInterface cl_int info(cl_device_info param_name, size_t param_value_size, void *param_value, - size_t *param_value_size_ret); + size_t *param_value_size_ret) const; DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs); DeviceProgram *createDeviceProgram(Program *program); @@ -38,11 +38,12 @@ class CPUDevice : public DeviceInterface void pushEvent(Event *event); Event *getEvent(bool &stop); - unsigned int numCPUs(); - float cpuMhz(); + unsigned int numCPUs() const; + float cpuMhz() const; private: unsigned int p_cores, p_num_events; + float p_cpu_mhz; pthread_t *p_workers; std::list<Event *> p_events; diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp index 3c7c11a..a898522 100644 --- a/src/core/cpu/kernel.cpp +++ b/src/core/cpu/kernel.cpp @@ -132,7 +132,7 @@ size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim, // Don't let the loop go up to global_work_size, the overhead would be // too huge - if (divisor > cpus * 32) + if (divisor > global_work_size || divisor > cpus * 32) { divisor = 1; // Not parallel but has no CommandQueue overhead break; @@ -411,20 +411,21 @@ CPUKernelWorkGroup *CPUKernelEvent::takeInstance() CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelEvent *cpu_event, const size_t *work_group_index) -: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event) +: p_kernel(kernel), p_event(event), p_cpu_event(cpu_event), + p_work_dim(event->work_dim()) { // Set index - std::memcpy(p_index, work_group_index, event->work_dim() * sizeof(size_t)); + std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t)); // Set maxs and global id - for (unsigned int i=0; i<event->work_dim(); ++i) + for (unsigned int i=0; i<p_work_dim; ++i) { p_maxs[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n // Set global id - p_global_id[i] = (p_index[i] * p_event->local_work_size(i)) - + p_event->global_work_offset(i); + p_global_id[i] = (p_index[i] * event->local_work_size(i)) + + event->global_work_offset(i); } } @@ -436,7 +437,7 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup() bool CPUKernelWorkGroup::run() { // Set current pos to 0 - std::memset(p_current, 0, p_event->work_dim() * sizeof(size_t)); + std::memset(p_current, 0, p_work_dim * sizeof(size_t)); // Get the kernel function to call bool free_after = p_kernel->kernel()->needsLocalAllocation(); @@ -459,7 +460,7 @@ bool CPUKernelWorkGroup::run() { // Simply call the "call function", it and the builtins will do the rest kernel_func_addr(); - } while (!incVec(p_event->work_dim(), p_current, p_maxs)); + } while (!incVec(p_work_dim, p_current, p_maxs)); // We may have some cleanup to do if (free_after) @@ -478,12 +479,12 @@ bool CPUKernelWorkGroup::run() cl_uint CPUKernelWorkGroup::getWorkDim() const { - return p_event->work_dim(); + return p_work_dim; } size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 0; return p_global_id[dimindx] + p_current[dimindx]; @@ -491,7 +492,7 @@ size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx >p_work_dim) return 1; return p_event->global_work_size(dimindx); @@ -499,7 +500,7 @@ size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 1; return p_event->local_work_size(dimindx); @@ -507,7 +508,7 @@ size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 0; return p_current[dimindx]; @@ -515,7 +516,7 @@ size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 1; return (p_event->global_work_size(dimindx) / @@ -524,7 +525,7 @@ size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 0; return p_index[dimindx]; @@ -532,7 +533,7 @@ size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const { - if (dimindx > p_event->work_dim()) + if (dimindx > p_work_dim) return 0; return p_event->global_work_offset(dimindx); diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index 7de36b1..91d1dfd 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -75,6 +75,7 @@ class CPUKernelWorkGroup CPUKernel *p_kernel; CPUKernelEvent *p_cpu_event; KernelEvent *p_event; + cl_uint p_work_dim; size_t p_index[MAX_WORK_DIMS], p_current[MAX_WORK_DIMS], p_maxs[MAX_WORK_DIMS], diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h index d19e87a..361df2e 100644 --- a/src/core/deviceinterface.h +++ b/src/core/deviceinterface.h @@ -31,7 +31,7 @@ class DeviceInterface virtual cl_int info(cl_device_info param_name, size_t param_value_size, void *param_value, - size_t *param_value_size_ret) = 0; + size_t *param_value_size_ret) const = 0; virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0; virtual DeviceProgram *createDeviceProgram(Program *program) = 0; diff --git a/src/core/events.cpp b/src/core/events.cpp index e7d8650..7dfdac8 100644 --- a/src/core/events.cpp +++ b/src/core/events.cpp @@ -42,29 +42,16 @@ BufferEvent::BufferEvent(CommandQueue *parent, // Alignment of SubBuffers DeviceInterface *device = 0; - cl_uint align; *errcode_ret = parent->info(CL_QUEUE_DEVICE, sizeof(DeviceInterface *), &device, 0); - if (errcode_ret != CL_SUCCESS) return; + if (errcode_ret != CL_SUCCESS) + return; - if (buffer->type() == MemObject::SubBuffer) + if (!isSubBufferAligned(buffer, device)) { - *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), - &align, 0); - - if (*errcode_ret != CL_SUCCESS) return; - - size_t mask = 0; - - for (int i=0; i<align; ++i) - mask = 1 | (mask << 1); - - if (((SubBuffer *)buffer)->offset() | mask) - { - *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; - return; - } + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; } // Allocate the buffer for the device @@ -80,6 +67,32 @@ MemObject *BufferEvent::buffer() const return p_buffer; } +bool BufferEvent::isSubBufferAligned(const MemObject *buffer, + const DeviceInterface *device) +{ + cl_uint align; + cl_int rs; + + if (buffer->type() != MemObject::SubBuffer) + return true; + + rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), + &align, 0); + + if (rs != CL_SUCCESS) + return false; + + size_t mask = 0; + + for (int i=0; i<align; ++i) + mask = 1 | (mask << 1); + + if (((SubBuffer *)buffer)->offset() | mask) + return false; + + return true; +} + ReadWriteBufferEvent::ReadWriteBufferEvent(CommandQueue *parent, MemObject *buffer, size_t offset, @@ -502,25 +515,10 @@ KernelEvent::KernelEvent(CommandQueue *parent, { const MemObject *buffer = *(const MemObject **)(a.value(0)); - if (buffer->type() == MemObject::SubBuffer) + if (!BufferEvent::isSubBufferAligned(buffer, device)) { - cl_uint align; - *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), - &align, 0); - - if (*errcode_ret != CL_SUCCESS) - return; - - size_t mask = 0; - - for (int i=0; i<align; ++i) - mask = 1 | (mask << 1); - - if (((SubBuffer *)buffer)->offset() | mask) - { - *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; - return; - } + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; } } else if (a.kind() == Kernel::Arg::Image2D) diff --git a/src/core/events.h b/src/core/events.h index 8c4f985..6702ff0 100644 --- a/src/core/events.h +++ b/src/core/events.h @@ -12,6 +12,7 @@ namespace Coal class MemObject; class Kernel; class DeviceKernel; +class DeviceInterface; class BufferEvent : public Event { @@ -24,6 +25,9 @@ class BufferEvent : public Event MemObject *buffer() const; + static bool isSubBufferAligned(const MemObject *buffer, + const DeviceInterface *device); + private: MemObject *p_buffer; }; |