summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Denis Steckelmacher <steckdenis@yahoo.fr> 2011-07-14 12:20:25 +0200
committer Denis Steckelmacher <steckdenis@yahoo.fr> 2011-07-14 12:20:25 +0200
commit e5c5e502334fc899b9cd8f3ea6e082c8bfb36d4d (patch)
tree 4824fec6fb58f8b92c93ac81a355f39d85a5afc6
parent a8bb88edc1d88ec35ca7fd47584badbee56dcf4b (diff)
WIP: Implement kernel launching
Refactor Kernel::Arg to be cleaner, initialize the JIT in src/core/cpu/program.cpp, and get the JIT kernel call function.
-rw-r--r-- src/core/commandqueue.cpp 7
-rw-r--r-- src/core/commandqueue.h 2
-rw-r--r-- src/core/cpu/device.cpp 24
-rw-r--r-- src/core/cpu/device.h 3
-rw-r--r-- src/core/cpu/kernel.cpp 206
-rw-r--r-- src/core/cpu/kernel.h 14
-rw-r--r-- src/core/cpu/program.cpp 30
-rw-r--r-- src/core/cpu/program.h 14
-rw-r--r-- src/core/cpu/worker.cpp 10
-rw-r--r-- src/core/deviceinterface.h 6
-rw-r--r-- src/core/events.cpp 103
-rw-r--r-- src/core/events.h 6
-rw-r--r-- src/core/kernel.cpp 272
-rw-r--r-- src/core/kernel.h 105
-rw-r--r-- src/core/program.cpp 18
-rw-r--r-- src/core/program.h 2
16 files changed, 624 insertions, 198 deletions
diff --git a/src/core/commandqueue.cpp b/src/core/commandqueue.cpp
index 85d51c4..d4aa99c 100644
--- a/src/core/commandqueue.cpp
+++ b/src/core/commandqueue.cpp
@@ -412,12 +412,9 @@ void Event::setReleaseParent(bool release)
p_release_parent = release;
}
-bool Event::isSingleShot() const
+bool Event::lastSlot() const
{
- // NDRangeKernel is a single event that can be executed on several execution
- // units. The other (buffer copying) must be executed in one part.
-
- return (type() != NDRangeKernel);
+ return true;
}
bool Event::isDummy() const
diff --git a/src/core/commandqueue.h b/src/core/commandqueue.h
index 1b4f82e..c2a9a1e 100644
--- a/src/core/commandqueue.h
+++ b/src/core/commandqueue.h
@@ -116,7 +116,7 @@ class Event
virtual ~Event();
virtual Type type() const = 0;
- bool isSingleShot() const; /*!< Cannot be split on several execution units */
+ virtual bool lastSlot() const; /*!< The last slot of a NDRange event will be executed */
bool isDummy() const; /*!< Doesn't do anything, it's just an event type */
void reference();
diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp
index 487b4b3..c9b90bc 100644
--- a/src/core/cpu/device.cpp
+++ b/src/core/cpu/device.cpp
@@ -9,6 +9,8 @@
#include "../commandqueue.h"
#include "../events.h"
#include "../memobject.h"
+#include "../kernel.h"
+#include "../program.h"
#include <cstring>
#include <cstdlib>
@@ -68,9 +70,10 @@ DeviceProgram *CPUDevice::createDeviceProgram(Program *program)
return (DeviceProgram *)new CPUProgram(this, program);
}
-DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel)
+DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel,
+ llvm::Function *function)
{
- return (DeviceKernel *)new CPUKernel(this, kernel);
+ return (DeviceKernel *)new CPUKernel(this, kernel, function);
}
cl_int CPUDevice::initEventDeviceData(Event *event)
@@ -92,6 +95,19 @@ cl_int CPUDevice::initEventDeviceData(Event *event)
// Nothing do to
break;
+ case Event::NDRangeKernel:
+ case Event::TaskKernel:
+ {
+ // Instantiate the JIT for the CPU program
+ KernelEvent *e = (KernelEvent *)event;
+ CPUProgram *prog =
+ (CPUProgram *)e->kernel()->program()->deviceDependentProgram(this);
+
+ if (!prog->initJIT())
+ return CL_INVALID_PROGRAM_EXECUTABLE;
+
+ break;
+ }
default:
break;
}
@@ -122,8 +138,8 @@ Event *CPUDevice::getEvent(bool &stop)
Event *event = p_events.front();
- // If event is single-shot, remove it
- if (event->isSingleShot())
+ // If the run of this event will finish it, remove it from the list
+ if (event->lastSlot())
{
p_num_events--;
p_events.pop_front();
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
index bd7e535..f19fcbc 100644
--- a/src/core/cpu/device.h
+++ b/src/core/cpu/device.h
@@ -27,7 +27,8 @@ class CPUDevice : public DeviceInterface
DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs);
DeviceProgram *createDeviceProgram(Program *program);
- DeviceKernel *createDeviceKernel(Kernel *kernel);
+ DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function);
cl_int initEventDeviceData(Event *event);
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 7e334e2..bb630d0 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -1,19 +1,35 @@
#include "kernel.h"
#include "device.h"
+#include "buffer.h"
#include "../kernel.h"
+#include "../memobject.h"
+
+#include <llvm/Function.h>
+#include <llvm/Constants.h>
+#include <llvm/ADT/APInt.h>
+#include <llvm/ADT/APFloat.h>
+#include <llvm/Support/Casting.h>
+#include <llvm/Instructions.h>
+#include <llvm/LLVMContext.h>
+#include <llvm/Module.h>
+
+#include <cstdlib>
+#include <iostream>
using namespace Coal;
-CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel)
-: DeviceKernel(), p_device(device), p_kernel(kernel)
+CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function)
+: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function),
+ p_call_function(0)
{
}
CPUKernel::~CPUKernel()
{
-
+ if (p_call_function)
+ p_call_function->eraseFromParent();
}
size_t CPUKernel::workGroupSize() const
@@ -78,3 +94,187 @@ size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
// Return the size
return global_work_size / divisor;
}
+
+llvm::Function *CPUKernel::function() const
+{
+ return p_function;
+}
+
+static llvm::Constant *getPointerConstant(llvm::LLVMContext &C,
+ const llvm::Type *type,
+ void *const *value)
+{
+ llvm::Constant *rs = 0;
+
+ if (sizeof(void *) == 4)
+ rs = llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), *(uint32_t *)value);
+ else
+ rs = llvm::ConstantInt::get(llvm::Type::getInt64Ty(C), *(uint64_t *)value);
+
+ // Cast to kernel's pointer type
+ rs = llvm::ConstantExpr::getIntToPtr(rs, type);
+
+ return rs;
+}
+
+llvm::Function *CPUKernel::callFunction()
+{
+ // If we can reuse the same function between work groups, do it
+ if (!p_kernel->needsLocalAllocation() && p_call_function)
+ return p_call_function;
+
+ // Create a LLVM function that calls the kernels with its arguments
+ // Code inspired from llvm/lib/ExecutionEngine/JIT/JIT.cpp
+ // Copyright The LLVM Compiler Infrastructure
+ const llvm::FunctionType *k_func_type = p_function->getFunctionType();
+ llvm::FunctionType *f_type =
+ llvm::FunctionType::get(p_function->getReturnType(), false);
+ llvm::Function *stub = llvm::Function::Create(f_type,
+ llvm::Function::InternalLinkage,
+ "", p_function->getParent());
+
+ // Insert a basic block
+ llvm::BasicBlock *block = llvm::BasicBlock::Create(p_function->getContext(),
+ "", stub);
+
+ llvm::SmallVector<llvm::Value *, 8> args;
+
+ // Add each kernel arg to args
+ for (int i=0; i<p_kernel->numArgs(); ++i)
+ {
+ const Kernel::Arg &a = p_kernel->arg(i);
+ llvm::Constant *arg_constant = 0;
+
+ // To handle vectors (float4, etc)
+ llvm::SmallVector<llvm::Constant *, 4> vec_elements;
+
+ // Explore the vector elements
+ for (unsigned short k=0; k<a.vecDim(); ++k)
+ {
+ const void *value = a.value(k);
+ llvm::Constant *C = 0;
+
+ switch (a.kind())
+ {
+ case Kernel::Arg::Int8:
+ C = llvm::ConstantInt::get(stub->getContext(),
+ llvm::APInt(8, *(uint8_t *)value));
+ break;
+
+ case Kernel::Arg::Int16:
+ C = llvm::ConstantInt::get(stub->getContext(),
+ llvm::APInt(16, *(uint16_t *)value));
+ break;
+
+ case Kernel::Arg::Int32:
+ C = llvm::ConstantInt::get(stub->getContext(),
+ llvm::APInt(32, *(uint32_t *)value));
+ break;
+
+ case Kernel::Arg::Int64:
+ C = llvm::ConstantInt::get(stub->getContext(),
+ llvm::APInt(64, *(uint64_t *)value));
+ break;
+
+ case Kernel::Arg::Float:
+ C = llvm::ConstantFP::get(stub->getContext(),
+ llvm::APFloat(*(float *)value));
+ break;
+
+ case Kernel::Arg::Double:
+ C = llvm::ConstantFP::get(stub->getContext(),
+ llvm::APFloat(*(double *)value));
+ break;
+
+ case Kernel::Arg::Buffer:
+ {
+ MemObject *buffer = *(MemObject **)value;
+
+ if (a.file() == Kernel::Arg::Local)
+ {
+ // Alloc a buffer and pass it to the kernel
+ // NOTE: Free this after use !
+ void *local_buffer = std::malloc(a.allocAtKernelRuntime());
+ C = getPointerConstant(stub->getContext(),
+ k_func_type->getParamType(i),
+ &local_buffer);
+ }
+ else
+ {
+ if (!buffer)
+ {
+ // We can do that, just send NULL
+ C = llvm::ConstantPointerNull::get(
+ llvm::cast<llvm::PointerType>(
+ k_func_type->getParamType(i)));
+ }
+ else
+ {
+ // Get the CPU buffer, allocate it and get its pointer
+ CPUBuffer *cpubuf =
+ (CPUBuffer *)buffer->deviceBuffer(p_device);
+ void *buf_ptr = 0;
+
+ if (!cpubuf->allocated())
+ cpubuf->allocate();
+
+ buf_ptr = cpubuf->data();
+
+ C = getPointerConstant(stub->getContext(),
+ k_func_type->getParamType(i),
+ &buf_ptr);
+ }
+ }
+
+ break;
+ }
+
+ case Kernel::Arg::Image2D:
+ case Kernel::Arg::Image3D:
+ // Assign a pointer to the image object, the instrinsic functions
+ // will handle them
+ C = getPointerConstant(stub->getContext(),
+ k_func_type->getParamType(i),
+ (void **)value);
+ break;
+
+ default:
+ break;
+ }
+
+ // Add the vector element
+ vec_elements.push_back(C);
+ }
+
+ // If the arg was a vector, handle it
+ if (a.vecDim() == 1)
+ {
+ arg_constant = vec_elements.front();
+ }
+ else
+ {
+ arg_constant = llvm::ConstantVector::get(vec_elements);
+ }
+
+ // Append the arg
+ args.push_back(arg_constant);
+ }
+
+ // Create the call instruction
+ llvm::CallInst *call_inst = llvm::CallInst::Create(p_function, args.begin(),
+ args.end(), "", block);
+ call_inst->setCallingConv(p_function->getCallingConv());
+ call_inst->setTailCall();
+
+ // Create a return instruction to end the stub
+ llvm::ReturnInst::Create(stub->getContext(), block);
+
+ // DEBUG
+ stub->getParent()->dump();
+
+ // Retain the function if it can be reused
+ if (!p_kernel->needsLocalAllocation())
+ p_call_function = stub;
+
+ return stub;
+}
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 6cc998a..d72f078 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -3,6 +3,14 @@
#include "../deviceinterface.h"
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <vector>
+
+namespace llvm
+{
+ class Function;
+}
+
namespace Coal
{
@@ -12,7 +20,7 @@ class Kernel;
class CPUKernel : public DeviceKernel
{
public:
- CPUKernel(CPUDevice *device, Kernel *kernel);
+ CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
~CPUKernel();
size_t workGroupSize() const;
@@ -22,9 +30,13 @@ class CPUKernel : public DeviceKernel
size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
size_t global_work_size) const;
+ llvm::Function *function() const;
+ llvm::Function *callFunction();
+
private:
CPUDevice *p_device;
Kernel *p_kernel;
+ llvm::Function *p_function, *p_call_function;
};
}
diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp
index 9d048c9..9a3a851 100644
--- a/src/core/cpu/program.cpp
+++ b/src/core/cpu/program.cpp
@@ -8,18 +8,20 @@
#include <llvm/Analysis/Verifier.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/IPO.h>
+#include <llvm/ExecutionEngine/ExecutionEngine.h>
using namespace Coal;
CPUProgram::CPUProgram(CPUDevice *device, Program *program)
-: DeviceProgram(), p_device(device), p_program(program)
+: DeviceProgram(), p_device(device), p_program(program), p_jit(0)
{
}
CPUProgram::~CPUProgram()
{
-
+ if (p_jit)
+ delete p_jit;
}
bool CPUProgram::linkStdLib() const
@@ -56,8 +58,30 @@ void CPUProgram::createOptimizationPasses(llvm::PassManager *manager, bool optim
}
}
-bool CPUProgram::build(const llvm::Module *module)
+bool CPUProgram::build(llvm::Module *module)
{
// Nothing to build
+ p_module = module;
+
+ return true;
+}
+
+bool CPUProgram::initJIT()
+{
+ if (p_jit)
+ return true;
+
+ if (!p_module)
+ return false;
+
+ // Create the JIT
+ p_jit = llvm::ExecutionEngine::create(p_module, false, 0,
+ llvm::CodeGenOpt::Default, false);
+
return true;
}
+
+llvm::ExecutionEngine *CPUProgram::jit() const
+{
+ return p_jit;
+}
diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h
index 8e61592..e0b7029 100644
--- a/src/core/cpu/program.h
+++ b/src/core/cpu/program.h
@@ -3,6 +3,12 @@
#include "../deviceinterface.h"
+namespace llvm
+{
+ class ExecutionEngine;
+ class Module;
+}
+
namespace Coal
{
@@ -17,11 +23,17 @@ class CPUProgram : public DeviceProgram
bool linkStdLib() const;
void createOptimizationPasses(llvm::PassManager *manager, bool optimize);
- bool build(const llvm::Module *module);
+ bool build(llvm::Module *module);
+
+ bool initJIT();
+ llvm::ExecutionEngine *jit() const;
private:
CPUDevice *p_device;
Program *p_program;
+
+ llvm::ExecutionEngine *p_jit;
+ llvm::Module *p_module;
};
}
diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp
index 87e10e2..9b18c47 100644
--- a/src/core/cpu/worker.cpp
+++ b/src/core/cpu/worker.cpp
@@ -1,10 +1,12 @@
#include "worker.h"
#include "device.h"
#include "buffer.h"
+#include "kernel.h"
#include "../commandqueue.h"
#include "../events.h"
#include "../memobject.h"
+#include "../kernel.h"
#include <cstring>
@@ -13,7 +15,7 @@ using namespace Coal;
void *worker(void *data)
{
CPUDevice *device = (CPUDevice *)data;
- bool stop = false, success;
+ bool stop = false, success, last_slot;
Event *event;
while (true)
@@ -29,6 +31,7 @@ void *worker(void *data)
CommandQueue *queue = 0;
cl_command_queue_properties queue_props = 0;
success = true;
+ last_slot = event->lastSlot();
event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0);
@@ -76,6 +79,9 @@ void *worker(void *data)
case Event::TaskKernel:
{
KernelEvent *e = (KernelEvent *)event;
+ CPUKernel *k = (CPUKernel *)e->kernel()->deviceDependentKernel(device);
+ e->setLastSlot(true);
+ k->callFunction();
break;
}
@@ -84,7 +90,7 @@ void *worker(void *data)
}
// Cleanups
- if (success)
+ if (success && last_slot)
{
event->setStatus(Event::Complete);
diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h
index b35ec20..bca8f13 100644
--- a/src/core/deviceinterface.h
+++ b/src/core/deviceinterface.h
@@ -7,6 +7,7 @@ namespace llvm
{
class PassManager;
class Module;
+ class Function;
}
namespace Coal
@@ -34,7 +35,8 @@ class DeviceInterface
virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0;
virtual DeviceProgram *createDeviceProgram(Program *program) = 0;
- virtual DeviceKernel *createDeviceKernel(Kernel *kernel) = 0;
+ virtual DeviceKernel *createDeviceKernel(Kernel *kernel,
+ llvm::Function *function) = 0;
virtual void pushEvent(Event *event) = 0;
@@ -65,7 +67,7 @@ class DeviceProgram
virtual bool linkStdLib() const = 0;
virtual void createOptimizationPasses(llvm::PassManager *manager,
bool optimize) = 0;
- virtual bool build(const llvm::Module *module) = 0;
+ virtual bool build(llvm::Module *module) = 0;
};
class DeviceKernel
diff --git a/src/core/events.cpp b/src/core/events.cpp
index 68ed00e..2332f4a 100644
--- a/src/core/events.cpp
+++ b/src/core/events.cpp
@@ -370,8 +370,14 @@ KernelEvent::KernelEvent(CommandQueue *parent,
cl_int *errcode_ret)
: Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret),
p_kernel(kernel), p_work_dim(work_dim), p_global_work_offset(0),
- p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0)
+ p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0),
+ p_last_slot(false)
{
+ *errcode_ret = CL_SUCCESS;
+
+ // Locking machinery
+ pthread_mutex_init(&p_mutex, 0);
+
// Sanity checks
if (!kernel)
{
@@ -495,17 +501,87 @@ KernelEvent::KernelEvent(CommandQueue *parent,
if (work_group_size > max_work_group_size)
{
*errcode_ret = CL_INVALID_WORK_GROUP_SIZE;
+ return;
}
// Check arguments (buffer alignment, image size, ...)
- *errcode_ret = kernel->checkArgsForDevice(device);
+ for (int i=0; i<kernel->numArgs(); ++i)
+ {
+ const Kernel::Arg &a = kernel->arg(i);
- if (*errcode_ret != CL_SUCCESS)
- return;
+ if (a.kind() == Kernel::Arg::Buffer)
+ {
+ const MemObject *buffer = *(const MemObject **)(a.value(0));
+
+ if (buffer->type() == MemObject::SubBuffer)
+ {
+ cl_uint align;
+ *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint),
+ &align, 0);
+
+ if (*errcode_ret != CL_SUCCESS)
+ return;
+
+ size_t mask = 0;
+
+ for (int i=0; i<align; ++i)
+ mask = 1 | (mask << 1);
+
+ if (((SubBuffer *)buffer)->offset() | mask)
+ {
+ *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET;
+ return;
+ }
+ }
+ }
+ else if (a.kind() == Kernel::Arg::Image2D)
+ {
+ const Image2D *image = *(const Image2D **)(a.value(0));
+ size_t maxWidth, maxHeight;
+
+ *errcode_ret = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH,
+ sizeof(size_t), &maxWidth, 0);
+ *errcode_ret |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT,
+ sizeof(size_t), &maxHeight, 0);
+
+ if (*errcode_ret != CL_SUCCESS)
+ return;
+
+ if (image->width() > maxWidth || image->height() > maxHeight)
+ {
+ *errcode_ret = CL_INVALID_IMAGE_SIZE;
+ return;
+ }
+ }
+ else if (a.kind() == Kernel::Arg::Image3D)
+ {
+ const Image3D *image = *(const Image3D **)a.value(0);
+ size_t maxWidth, maxHeight, maxDepth;
+
+ *errcode_ret = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH,
+ sizeof(size_t), &maxWidth, 0);
+ *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT,
+ sizeof(size_t), &maxHeight, 0);
+ *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH,
+ sizeof(size_t), &maxDepth, 0);
+
+ if (*errcode_ret != CL_SUCCESS)
+ return;
+
+ if (image->width() > maxWidth || image->height() > maxHeight ||
+ image->depth() > maxDepth)
+ {
+ *errcode_ret = CL_INVALID_IMAGE_SIZE;
+ return;
+ }
+ }
+ }
}
KernelEvent::~KernelEvent()
{
+ pthread_mutex_destroy(&p_mutex);
+
if (p_global_work_offset)
std::free(p_global_work_offset);
@@ -554,6 +630,25 @@ Event::Type KernelEvent::type() const
return Event::NDRangeKernel;
}
+bool KernelEvent::lastSlot() const
+{
+ bool rs;
+ KernelEvent *hack = (KernelEvent *)this;
+
+ pthread_mutex_lock(&hack->p_mutex);
+ rs = p_last_slot;
+ pthread_mutex_unlock(&hack->p_mutex);
+
+ return rs;
+}
+
+void KernelEvent::setLastSlot(bool last_slot)
+{
+ pthread_mutex_lock(&p_mutex);
+ p_last_slot = last_slot;
+ pthread_mutex_unlock(&p_mutex);
+}
+
static size_t one = 1;
TaskEvent::TaskEvent(CommandQueue *parent,
diff --git a/src/core/events.h b/src/core/events.h
index 46eb2c2..528bfcb 100644
--- a/src/core/events.h
+++ b/src/core/events.h
@@ -4,6 +4,7 @@
#include "commandqueue.h"
#include <vector>
+#include <pthread.h>
namespace Coal
{
@@ -169,12 +170,17 @@ class KernelEvent : public Event
virtual Type type() const;
+ bool lastSlot() const;
+ void setLastSlot(bool last_slot);
+
private:
cl_uint p_work_dim;
size_t *p_global_work_offset, *p_global_work_size, *p_local_work_size,
*p_max_work_item_sizes;
Kernel *p_kernel;
DeviceKernel *p_dev_kernel;
+ bool p_last_slot;
+ pthread_mutex_t p_mutex;
};
class TaskEvent : public KernelEvent
diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp
index 5df77f4..96e1c8d 100644
--- a/src/core/kernel.cpp
+++ b/src/core/kernel.cpp
@@ -7,6 +7,7 @@
#include <string>
#include <iostream>
#include <cstring>
+#include <cstdlib>
#include <llvm/Support/Casting.h>
#include <llvm/Module.h>
@@ -15,7 +16,7 @@
using namespace Coal;
Kernel::Kernel(Program *program)
-: p_program(program), p_references(1)
+: p_program(program), p_references(1), p_local_args(false)
{
clRetainProgram((cl_program)program); // TODO: Say a kernel is attached to the program (that becomes unalterable)
@@ -99,33 +100,35 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
for (int i=0; i<f->getNumParams(); ++i)
{
const llvm::Type *arg_type = f->getParamType(i);
- Arg a;
-
- a.kind = Arg::Invalid;
- a.vec_dim = 1;
- a.file = Arg::Private;
- a.kernel_alloc_size = 0;
- a.set = false;
+ Arg::Kind kind = Arg::Invalid;
+ Arg::File file = Arg::Private;
+ unsigned short vec_dim = 1;
if (arg_type->isPointerTy())
{
// It's a pointer, dereference it
const llvm::PointerType *p_type = llvm::cast<llvm::PointerType>(arg_type);
- a.file = (Arg::File)p_type->getAddressSpace();
+ file = (Arg::File)p_type->getAddressSpace();
arg_type = p_type->getElementType();
+ // If it's a __local argument, we'll have to allocate memory at run time
+ if (file == Arg::Local)
+ p_local_args = true;
+
// Get the name of the type to see if it's something like image2d, etc
std::string name = module->getTypeName(arg_type);
if (name == "image2d")
{
// TODO: Address space qualifiers for image types, and read_only
- a.kind = Arg::Image2D;
+ kind = Arg::Image2D;
+ file = Arg::Global;
}
else if (name == "image3d")
{
- a.kind = Arg::Image3D;
+ kind = Arg::Image3D;
+ file = Arg::Global;
}
else if (name == "sampler")
{
@@ -133,7 +136,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
}
else
{
- a.kind = Arg::Buffer;
+ kind = Arg::Buffer;
}
}
else
@@ -143,18 +146,18 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
// It's a vector, we need its element's type
const llvm::VectorType *v_type = llvm::cast<llvm::VectorType>(arg_type);
- a.vec_dim = v_type->getNumElements();
+ vec_dim = v_type->getNumElements();
arg_type = v_type->getElementType();
}
// Get type kind
if (arg_type->isFloatTy())
{
- a.kind = Arg::Float;
+ kind = Arg::Float;
}
else if (arg_type->isDoubleTy())
{
- a.kind = Arg::Double;
+ kind = Arg::Double;
}
else if (arg_type->isIntegerTy())
{
@@ -162,27 +165,30 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
if (i_type->getBitWidth() == 8)
{
- a.kind = Arg::Int8;
+ kind = Arg::Int8;
}
else if (i_type->getBitWidth() == 16)
{
- a.kind = Arg::Int16;
+ kind = Arg::Int16;
}
else if (i_type->getBitWidth() == 32)
{
- a.kind = Arg::Int32;
+ kind = Arg::Int32;
}
else if (i_type->getBitWidth() == 64)
{
- a.kind = Arg::Int64;
+ kind = Arg::Int64;
}
}
}
// Check if we recognized the type
- if (a.kind == Arg::Invalid)
+ if (kind == Arg::Invalid)
return CL_INVALID_KERNEL_DEFINITION;
+ // Create arg
+ Arg a(vec_dim, file, kind);
+
// If we also have a function registered, check for signature compliance
if (!append && a != p_args[i])
return CL_INVALID_KERNEL_DEFINITION;
@@ -192,7 +198,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function,
p_args.push_back(a);
}
- dep.kernel = device->createDeviceKernel(this);
+ dep.kernel = device->createDeviceKernel(this, dep.function);
p_device_dependent.push_back(dep);
return CL_SUCCESS;
@@ -205,31 +211,6 @@ llvm::Function *Kernel::function(DeviceInterface *device) const
return dep.function;
}
-size_t Kernel::Arg::valueSize() const
-{
- switch (kind)
- {
- case Invalid:
- return 0;
- case Int8:
- return 1;
- case Int16:
- return 2;
- case Int32:
- return 4;
- case Int64:
- return 8;
- case Float:
- return sizeof(cl_float);
- case Double:
- return sizeof(double);
- case Buffer:
- case Image2D:
- case Image3D:
- return sizeof(cl_mem);
- }
-}
-
cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
{
if (index > p_args.size())
@@ -238,7 +219,7 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
Arg &arg = p_args[index];
// Special case for __local pointers
- if (arg.file == Arg::Local)
+ if (arg.file() == Arg::Local)
{
if (size == 0)
return CL_INVALID_ARG_SIZE;
@@ -246,7 +227,7 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
if (value != 0)
return CL_INVALID_ARG_VALUE;
- arg.kernel_alloc_size = size;
+ arg.setAllocAtKernelRuntime(size);
return CL_SUCCESS;
}
@@ -258,17 +239,17 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
return CL_INVALID_ARG_SIZE;
// Check for null values
+ cl_mem null_mem = 0;
+
if (!value)
{
- switch (arg.kind)
+ switch (arg.kind())
{
case Arg::Buffer:
case Arg::Image2D:
case Arg::Image3D:
// Special case buffers : value can be 0 (or point to 0)
- arg.value.cl_mem_val = 0;
- arg.set = true;
- return CL_SUCCESS;
+ value = &null_mem;
// TODO samplers
default:
@@ -277,13 +258,22 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value)
}
// Copy the data
- std::memcpy(&arg.value, value, arg_size);
-
- arg.set = true;
+ arg.alloc();
+ arg.loadData(value);
return CL_SUCCESS;
}
+unsigned int Kernel::numArgs() const
+{
+ return p_args.size();
+}
+
+const Kernel::Arg &Kernel::arg(unsigned int index) const
+{
+ return p_args.at(index);
+}
+
Program *Kernel::program() const
{
return p_program;
@@ -293,79 +283,16 @@ bool Kernel::argsSpecified() const
{
for (int i=0; i<p_args.size(); ++i)
{
- if (!p_args[i].set)
+ if (!p_args[i].defined())
return false;
}
return true;
}
-cl_int Kernel::checkArgsForDevice(DeviceInterface *device) const
+bool Kernel::needsLocalAllocation() const
{
- const DeviceDependent &dep = deviceDependent(device);
- cl_int rs;
-
- for (int i=0; i<p_args.size(); ++i)
- {
- const Arg &a = p_args[i];
-
- if (a.kind == Arg::Buffer)
- {
- MemObject *buffer = (MemObject *)a.value.cl_mem_val;
-
- if (buffer->type() == MemObject::SubBuffer)
- {
- cl_uint align;
- rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint),
- &align, 0);
-
- if (rs != CL_SUCCESS) return rs;
-
- size_t mask = 0;
-
- for (int i=0; i<align; ++i)
- mask = 1 | (mask << 1);
-
- if (((SubBuffer *)buffer)->offset() | mask)
- return CL_MISALIGNED_SUB_BUFFER_OFFSET;
- }
- }
- else if (a.kind == Arg::Image2D)
- {
- Image2D *image = (Image2D *)a.value.cl_mem_val;
- size_t maxWidth, maxHeight;
-
- rs = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t),
- &maxWidth, 0);
- rs |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t),
- &maxHeight, 0);
-
- if (rs != CL_SUCCESS) return rs;
-
- if (image->width() > maxWidth || image->height() > maxHeight)
- return CL_INVALID_IMAGE_SIZE;
- }
- else if (a.kind == Arg::Image3D)
- {
- Image3D *image = (Image3D *)a.value.cl_mem_val;
- size_t maxWidth, maxHeight, maxDepth;
-
- rs = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t),
- &maxWidth, 0);
- rs |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t),
- &maxHeight, 0);
- rs |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t),
- &maxDepth, 0);
-
- if (rs != CL_SUCCESS) return rs;
-
- if (image->width() > maxWidth || image->height() > maxHeight ||
- image->depth() > maxDepth)
- return CL_INVALID_IMAGE_SIZE;
- }
- }
-
- return CL_SUCCESS;
+ return p_local_args;
}
DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const
@@ -486,3 +413,106 @@ cl_int Kernel::workGroupInfo(DeviceInterface *device,
return CL_SUCCESS;
}
+
+/*
+ * Kernel::Arg
+ */
+Kernel::Arg::Arg(unsigned short vec_dim, File file, Kind kind)
+: p_vec_dim(vec_dim), p_file(file), p_kind(kind), p_defined(false),
+ p_runtime_alloc(0), p_data(0)
+{
+
+}
+
+Kernel::Arg::~Arg()
+{
+ if (p_data)
+ std::free(p_data);
+}
+
+void Kernel::Arg::alloc()
+{
+ if (!p_data)
+ p_data = std::malloc(p_vec_dim * valueSize());
+}
+
+void Kernel::Arg::loadData(const void *data)
+{
+ std::memcpy(p_data, data, p_vec_dim * valueSize());
+ p_defined = true;
+}
+
+void Kernel::Arg::setAllocAtKernelRuntime(size_t size)
+{
+ p_runtime_alloc = size;
+ p_defined = true;
+}
+
+bool Kernel::Arg::operator!=(const Arg &b)
+{
+ bool same = (p_vec_dim == b.p_vec_dim) &&
+ (p_file == b.p_file) &&
+ (p_kind == b.p_kind);
+
+ return !same;
+}
+
+size_t Kernel::Arg::valueSize() const
+{
+ switch (p_kind)
+ {
+ case Invalid:
+ return 0;
+ case Int8:
+ return 1;
+ case Int16:
+ return 2;
+ case Int32:
+ return 4;
+ case Int64:
+ return 8;
+ case Float:
+ return sizeof(cl_float);
+ case Double:
+ return sizeof(double);
+ case Buffer:
+ case Image2D:
+ case Image3D:
+ return sizeof(cl_mem);
+ }
+}
+
+unsigned short Kernel::Arg::vecDim() const
+{
+ return p_vec_dim;
+}
+
+Kernel::Arg::File Kernel::Arg::file() const
+{
+ return p_file;
+}
+
+Kernel::Arg::Kind Kernel::Arg::kind() const
+{
+ return p_kind;
+}
+
+bool Kernel::Arg::defined() const
+{
+ return p_defined;
+}
+
+size_t Kernel::Arg::allocAtKernelRuntime() const
+{
+ return p_runtime_alloc;
+}
+
+const void *Kernel::Arg::value(unsigned short index) const
+{
+ const char *data = (const char *)p_data;
+ unsigned int offset = index * valueSize();
+
+ data += offset;
+
+ return (const void *)data;
+}
diff --git a/src/core/kernel.h b/src/core/kernel.h
index b10b77e..2c9ffde 100644
--- a/src/core/kernel.h
+++ b/src/core/kernel.h
@@ -25,6 +25,57 @@ class Kernel
Kernel(Program *program);
~Kernel();
+ class Arg
+ {
+ public:
+ enum File
+ {
+ Private = 0,
+ Global = 1,
+ Local = 2,
+ Constant = 3
+ };
+ enum Kind
+ {
+ Invalid,
+ Int8,
+ Int16,
+ Int32,
+ Int64,
+ Float,
+ Double,
+ Buffer,
+ Image2D,
+ Image3D
+ // TODO: Sampler
+ };
+
+ Arg(unsigned short vec_dim, File file, Kind kind);
+ ~Arg();
+
+ void alloc();
+ void loadData(const void *data);
+ void setAllocAtKernelRuntime(size_t size);
+
+ bool operator !=(const Arg &b);
+
+ size_t valueSize() const;
+ unsigned short vecDim() const;
+ File file() const;
+ Kind kind() const;
+ bool defined() const;
+ size_t allocAtKernelRuntime() const;
+ const void *value(unsigned short index) const;
+
+ private:
+ unsigned short p_vec_dim;
+ File p_file;
+ Kind p_kind;
+ void *p_data;
+ bool p_defined;
+ size_t p_runtime_alloc;
+ };
+
void reference();
bool dereference();
@@ -32,12 +83,14 @@ class Kernel
llvm::Module *module);
llvm::Function *function(DeviceInterface *device) const;
cl_int setArg(cl_uint index, size_t size, const void *value);
+ unsigned int numArgs() const;
+ const Arg &arg(unsigned int index) const;
Program *program() const;
DeviceKernel *deviceDependentKernel(DeviceInterface *device) const;
bool argsSpecified() const;
- cl_int checkArgsForDevice(DeviceInterface *device) const;
+ bool needsLocalAllocation() const; /*!< One or more arguments is __local */
cl_int info(cl_kernel_info param_name,
size_t param_value_size,
@@ -53,6 +106,7 @@ class Kernel
Program *p_program;
unsigned int p_references;
std::string p_name;
+ bool p_local_args;
struct DeviceDependent
{
@@ -62,55 +116,6 @@ class Kernel
llvm::Module *module;
};
- struct Arg
- {
- unsigned short vec_dim;
- bool set;
- size_t kernel_alloc_size; /*!< Size of the memory that must be allocated at kernel execution */
-
- enum File
- {
- Private = 0,
- Global = 1,
- Local = 2,
- Constant = 3
- } file;
-
- enum Kind
- {
- Invalid,
- Int8,
- Int16,
- Int32,
- Int64,
- Float,
- Double,
- Buffer,
- Image2D,
- Image3D
- // TODO: Sampler
- } kind;
-
- union
- {
- #define TYPE_VAL(type) type type##_val
- TYPE_VAL(uint8_t);
- TYPE_VAL(uint16_t);
- TYPE_VAL(uint32_t);
- TYPE_VAL(uint64_t);
- TYPE_VAL(cl_float);
- TYPE_VAL(double);
- TYPE_VAL(cl_mem);
- #undef TYPE_VAL
- } value;
-
- inline bool operator !=(const Arg &b)
- {
- return (kind != b.kind) || (vec_dim != b.vec_dim);
- }
- size_t valueSize() const;
- };
-
std::vector<DeviceDependent> p_device_dependent;
std::vector<Arg> p_args;
DeviceDependent null_dep;
diff --git a/src/core/program.cpp b/src/core/program.cpp
index dfae848..91c8d17 100644
--- a/src/core/program.cpp
+++ b/src/core/program.cpp
@@ -94,6 +94,24 @@ Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device)
}
}
+const Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) const
+{
+ for (int i=0; i<p_device_dependent.size(); ++i)
+ {
+ const DeviceDependent &rs = p_device_dependent[i];
+
+ if (rs.device == device || (!device && p_device_dependent.size() == 1))
+ return rs;
+ }
+}
+
+DeviceProgram *Program::deviceDependentProgram(DeviceInterface *device) const
+{
+ const DeviceDependent &dep = deviceDependent(device);
+
+ return dep.program;
+}
+
std::vector<llvm::Function *> Program::kernelFunctions(DeviceDependent &dep)
{
std::vector<llvm::Function *> rs;
diff --git a/src/core/program.h b/src/core/program.h
index 6036fe3..46ea88b 100644
--- a/src/core/program.h
+++ b/src/core/program.h
@@ -62,6 +62,7 @@ class Program
Kernel *createKernel(const std::string &name, cl_int *errcode_ret);
std::vector<Kernel *> createKernels(cl_int *errcode_ret);
+ DeviceProgram *deviceDependentProgram(DeviceInterface *device) const;
cl_int info(cl_program_info param_name,
size_t param_value_size,
@@ -93,6 +94,7 @@ class Program
void setDevices(cl_uint num_devices, DeviceInterface * const*devices);
DeviceDependent &deviceDependent(DeviceInterface *device);
+ const DeviceDependent &deviceDependent(DeviceInterface *device) const;
std::vector<llvm::Function *> kernelFunctions(DeviceDependent &dep);
};