diff options
author | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-14 12:20:25 +0200 |
---|---|---|
committer | Denis Steckelmacher <steckdenis@yahoo.fr> | 2011-07-14 12:20:25 +0200 |
commit | e5c5e502334fc899b9cd8f3ea6e082c8bfb36d4d (patch) | |
tree | 4824fec6fb58f8b92c93ac81a355f39d85a5afc6 | |
parent | a8bb88edc1d88ec35ca7fd47584badbee56dcf4b (diff) |
WIP: Implement kernel launching
Refactor Kernel::Arg to be cleaner, initialize the JIT in
src/core/cpu/program.cpp, and get the JIT kernel call function.
-rw-r--r-- | src/core/commandqueue.cpp | 7 | ||||
-rw-r--r-- | src/core/commandqueue.h | 2 | ||||
-rw-r--r-- | src/core/cpu/device.cpp | 24 | ||||
-rw-r--r-- | src/core/cpu/device.h | 3 | ||||
-rw-r--r-- | src/core/cpu/kernel.cpp | 206 | ||||
-rw-r--r-- | src/core/cpu/kernel.h | 14 | ||||
-rw-r--r-- | src/core/cpu/program.cpp | 30 | ||||
-rw-r--r-- | src/core/cpu/program.h | 14 | ||||
-rw-r--r-- | src/core/cpu/worker.cpp | 10 | ||||
-rw-r--r-- | src/core/deviceinterface.h | 6 | ||||
-rw-r--r-- | src/core/events.cpp | 103 | ||||
-rw-r--r-- | src/core/events.h | 6 | ||||
-rw-r--r-- | src/core/kernel.cpp | 272 | ||||
-rw-r--r-- | src/core/kernel.h | 105 | ||||
-rw-r--r-- | src/core/program.cpp | 18 | ||||
-rw-r--r-- | src/core/program.h | 2 |
16 files changed, 624 insertions, 198 deletions
diff --git a/src/core/commandqueue.cpp b/src/core/commandqueue.cpp index 85d51c4..d4aa99c 100644 --- a/src/core/commandqueue.cpp +++ b/src/core/commandqueue.cpp @@ -412,12 +412,9 @@ void Event::setReleaseParent(bool release) p_release_parent = release; } -bool Event::isSingleShot() const +bool Event::lastSlot() const { - // NDRangeKernel is a single event that can be executed on several execution - // units. The other (buffer copying) must be executed in one part. - - return (type() != NDRangeKernel); + return true; } bool Event::isDummy() const diff --git a/src/core/commandqueue.h b/src/core/commandqueue.h index 1b4f82e..c2a9a1e 100644 --- a/src/core/commandqueue.h +++ b/src/core/commandqueue.h @@ -116,7 +116,7 @@ class Event virtual ~Event(); virtual Type type() const = 0; - bool isSingleShot() const; /*!< Cannot be split on several execution units */ + virtual bool lastSlot() const; /*!< The last slot of a NDRange event will be executed */ bool isDummy() const; /*!< Doesn't do anything, it's just an event type */ void reference(); diff --git a/src/core/cpu/device.cpp b/src/core/cpu/device.cpp index 487b4b3..c9b90bc 100644 --- a/src/core/cpu/device.cpp +++ b/src/core/cpu/device.cpp @@ -9,6 +9,8 @@ #include "../commandqueue.h" #include "../events.h" #include "../memobject.h" +#include "../kernel.h" +#include "../program.h" #include <cstring> #include <cstdlib> @@ -68,9 +70,10 @@ DeviceProgram *CPUDevice::createDeviceProgram(Program *program) return (DeviceProgram *)new CPUProgram(this, program); } -DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel) +DeviceKernel *CPUDevice::createDeviceKernel(Kernel *kernel, + llvm::Function *function) { - return (DeviceKernel *)new CPUKernel(this, kernel); + return (DeviceKernel *)new CPUKernel(this, kernel, function); } cl_int CPUDevice::initEventDeviceData(Event *event) @@ -92,6 +95,19 @@ cl_int CPUDevice::initEventDeviceData(Event *event) // Nothing do to break; + case Event::NDRangeKernel: + case Event::TaskKernel: + { + // Instantiate the JIT for the CPU program + KernelEvent *e = (KernelEvent *)event; + CPUProgram *prog = + (CPUProgram *)e->kernel()->program()->deviceDependentProgram(this); + + if (!prog->initJIT()) + return CL_INVALID_PROGRAM_EXECUTABLE; + + break; + } default: break; } @@ -122,8 +138,8 @@ Event *CPUDevice::getEvent(bool &stop) Event *event = p_events.front(); - // If event is single-shot, remove it - if (event->isSingleShot()) + // If the run of this event will finish it, remove it from the list + if (event->lastSlot()) { p_num_events--; p_events.pop_front(); diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h index bd7e535..f19fcbc 100644 --- a/src/core/cpu/device.h +++ b/src/core/cpu/device.h @@ -27,7 +27,8 @@ class CPUDevice : public DeviceInterface DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs); DeviceProgram *createDeviceProgram(Program *program); - DeviceKernel *createDeviceKernel(Kernel *kernel); + DeviceKernel *createDeviceKernel(Kernel *kernel, + llvm::Function *function); cl_int initEventDeviceData(Event *event); diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp index 7e334e2..bb630d0 100644 --- a/src/core/cpu/kernel.cpp +++ b/src/core/cpu/kernel.cpp @@ -1,19 +1,35 @@ #include "kernel.h" #include "device.h" +#include "buffer.h" #include "../kernel.h" +#include "../memobject.h" + +#include <llvm/Function.h> +#include <llvm/Constants.h> +#include <llvm/ADT/APInt.h> +#include <llvm/ADT/APFloat.h> +#include <llvm/Support/Casting.h> +#include <llvm/Instructions.h> +#include <llvm/LLVMContext.h> +#include <llvm/Module.h> + +#include <cstdlib> +#include <iostream> using namespace Coal; -CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel) -: DeviceKernel(), p_device(device), p_kernel(kernel) +CPUKernel::CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function) +: DeviceKernel(), p_device(device), p_kernel(kernel), p_function(function), + p_call_function(0) { } CPUKernel::~CPUKernel() { - + if (p_call_function) + p_call_function->eraseFromParent(); } size_t CPUKernel::workGroupSize() const @@ -78,3 +94,187 @@ size_t CPUKernel::guessWorkGroupSize(cl_uint num_dims, cl_uint dim, // Return the size return global_work_size / divisor; } + +llvm::Function *CPUKernel::function() const +{ + return p_function; +} + +static llvm::Constant *getPointerConstant(llvm::LLVMContext &C, + const llvm::Type *type, + void *const *value) +{ + llvm::Constant *rs = 0; + + if (sizeof(void *) == 4) + rs = llvm::ConstantInt::get(llvm::Type::getInt32Ty(C), *(uint32_t *)value); + else + rs = llvm::ConstantInt::get(llvm::Type::getInt64Ty(C), *(uint64_t *)value); + + // Cast to kernel's pointer type + rs = llvm::ConstantExpr::getIntToPtr(rs, type); + + return rs; +} + +llvm::Function *CPUKernel::callFunction() +{ + // If we can reuse the same function between work groups, do it + if (!p_kernel->needsLocalAllocation() && p_call_function) + return p_call_function; + + // Create a LLVM function that calls the kernels with its arguments + // Code inspired from llvm/lib/ExecutionEngine/JIT/JIT.cpp + // Copyright The LLVM Compiler Infrastructure + const llvm::FunctionType *k_func_type = p_function->getFunctionType(); + llvm::FunctionType *f_type = + llvm::FunctionType::get(p_function->getReturnType(), false); + llvm::Function *stub = llvm::Function::Create(f_type, + llvm::Function::InternalLinkage, + "", p_function->getParent()); + + // Insert a basic block + llvm::BasicBlock *block = llvm::BasicBlock::Create(p_function->getContext(), + "", stub); + + llvm::SmallVector<llvm::Value *, 8> args; + + // Add each kernel arg to args + for (int i=0; i<p_kernel->numArgs(); ++i) + { + const Kernel::Arg &a = p_kernel->arg(i); + llvm::Constant *arg_constant = 0; + + // To handle vectors (float4, etc) + llvm::SmallVector<llvm::Constant *, 4> vec_elements; + + // Explore the vector elements + for (unsigned short k=0; k<a.vecDim(); ++k) + { + const void *value = a.value(k); + llvm::Constant *C = 0; + + switch (a.kind()) + { + case Kernel::Arg::Int8: + C = llvm::ConstantInt::get(stub->getContext(), + llvm::APInt(8, *(uint8_t *)value)); + break; + + case Kernel::Arg::Int16: + C = llvm::ConstantInt::get(stub->getContext(), + llvm::APInt(16, *(uint16_t *)value)); + break; + + case Kernel::Arg::Int32: + C = llvm::ConstantInt::get(stub->getContext(), + llvm::APInt(32, *(uint32_t *)value)); + break; + + case Kernel::Arg::Int64: + C = llvm::ConstantInt::get(stub->getContext(), + llvm::APInt(64, *(uint64_t *)value)); + break; + + case Kernel::Arg::Float: + C = llvm::ConstantFP::get(stub->getContext(), + llvm::APFloat(*(float *)value)); + break; + + case Kernel::Arg::Double: + C = llvm::ConstantFP::get(stub->getContext(), + llvm::APFloat(*(double *)value)); + break; + + case Kernel::Arg::Buffer: + { + MemObject *buffer = *(MemObject **)value; + + if (a.file() == Kernel::Arg::Local) + { + // Alloc a buffer and pass it to the kernel + // NOTE: Free this after use ! + void *local_buffer = std::malloc(a.allocAtKernelRuntime()); + C = getPointerConstant(stub->getContext(), + k_func_type->getParamType(i), + &local_buffer); + } + else + { + if (!buffer) + { + // We can do that, just send NULL + C = llvm::ConstantPointerNull::get( + llvm::cast<llvm::PointerType>( + k_func_type->getParamType(i))); + } + else + { + // Get the CPU buffer, allocate it and get its pointer + CPUBuffer *cpubuf = + (CPUBuffer *)buffer->deviceBuffer(p_device); + void *buf_ptr = 0; + + if (!cpubuf->allocated()) + cpubuf->allocate(); + + buf_ptr = cpubuf->data(); + + C = getPointerConstant(stub->getContext(), + k_func_type->getParamType(i), + &buf_ptr); + } + } + + break; + } + + case Kernel::Arg::Image2D: + case Kernel::Arg::Image3D: + // Assign a pointer to the image object, the instrinsic functions + // will handle them + C = getPointerConstant(stub->getContext(), + k_func_type->getParamType(i), + (void **)value); + break; + + default: + break; + } + + // Add the vector element + vec_elements.push_back(C); + } + + // If the arg was a vector, handle it + if (a.vecDim() == 1) + { + arg_constant = vec_elements.front(); + } + else + { + arg_constant = llvm::ConstantVector::get(vec_elements); + } + + // Append the arg + args.push_back(arg_constant); + } + + // Create the call instruction + llvm::CallInst *call_inst = llvm::CallInst::Create(p_function, args.begin(), + args.end(), "", block); + call_inst->setCallingConv(p_function->getCallingConv()); + call_inst->setTailCall(); + + // Create a return instruction to end the stub + llvm::ReturnInst::Create(stub->getContext(), block); + + // DEBUG + stub->getParent()->dump(); + + // Retain the function if it can be reused + if (!p_kernel->needsLocalAllocation()) + p_call_function = stub; + + return stub; +} diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index 6cc998a..d72f078 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -3,6 +3,14 @@ #include "../deviceinterface.h" +#include <llvm/ExecutionEngine/GenericValue.h> +#include <vector> + +namespace llvm +{ + class Function; +} + namespace Coal { @@ -12,7 +20,7 @@ class Kernel; class CPUKernel : public DeviceKernel { public: - CPUKernel(CPUDevice *device, Kernel *kernel); + CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function); ~CPUKernel(); size_t workGroupSize() const; @@ -22,9 +30,13 @@ class CPUKernel : public DeviceKernel size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim, size_t global_work_size) const; + llvm::Function *function() const; + llvm::Function *callFunction(); + private: CPUDevice *p_device; Kernel *p_kernel; + llvm::Function *p_function, *p_call_function; }; } diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp index 9d048c9..9a3a851 100644 --- a/src/core/cpu/program.cpp +++ b/src/core/cpu/program.cpp @@ -8,18 +8,20 @@ #include <llvm/Analysis/Verifier.h> #include <llvm/Transforms/Scalar.h> #include <llvm/Transforms/IPO.h> +#include <llvm/ExecutionEngine/ExecutionEngine.h> using namespace Coal; CPUProgram::CPUProgram(CPUDevice *device, Program *program) -: DeviceProgram(), p_device(device), p_program(program) +: DeviceProgram(), p_device(device), p_program(program), p_jit(0) { } CPUProgram::~CPUProgram() { - + if (p_jit) + delete p_jit; } bool CPUProgram::linkStdLib() const @@ -56,8 +58,30 @@ void CPUProgram::createOptimizationPasses(llvm::PassManager *manager, bool optim } } -bool CPUProgram::build(const llvm::Module *module) +bool CPUProgram::build(llvm::Module *module) { // Nothing to build + p_module = module; + + return true; +} + +bool CPUProgram::initJIT() +{ + if (p_jit) + return true; + + if (!p_module) + return false; + + // Create the JIT + p_jit = llvm::ExecutionEngine::create(p_module, false, 0, + llvm::CodeGenOpt::Default, false); + return true; } + +llvm::ExecutionEngine *CPUProgram::jit() const +{ + return p_jit; +} diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h index 8e61592..e0b7029 100644 --- a/src/core/cpu/program.h +++ b/src/core/cpu/program.h @@ -3,6 +3,12 @@ #include "../deviceinterface.h" +namespace llvm +{ + class ExecutionEngine; + class Module; +} + namespace Coal { @@ -17,11 +23,17 @@ class CPUProgram : public DeviceProgram bool linkStdLib() const; void createOptimizationPasses(llvm::PassManager *manager, bool optimize); - bool build(const llvm::Module *module); + bool build(llvm::Module *module); + + bool initJIT(); + llvm::ExecutionEngine *jit() const; private: CPUDevice *p_device; Program *p_program; + + llvm::ExecutionEngine *p_jit; + llvm::Module *p_module; }; } diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp index 87e10e2..9b18c47 100644 --- a/src/core/cpu/worker.cpp +++ b/src/core/cpu/worker.cpp @@ -1,10 +1,12 @@ #include "worker.h" #include "device.h" #include "buffer.h" +#include "kernel.h" #include "../commandqueue.h" #include "../events.h" #include "../memobject.h" +#include "../kernel.h" #include <cstring> @@ -13,7 +15,7 @@ using namespace Coal; void *worker(void *data) { CPUDevice *device = (CPUDevice *)data; - bool stop = false, success; + bool stop = false, success, last_slot; Event *event; while (true) @@ -29,6 +31,7 @@ void *worker(void *data) CommandQueue *queue = 0; cl_command_queue_properties queue_props = 0; success = true; + last_slot = event->lastSlot(); event->info(CL_EVENT_COMMAND_QUEUE, sizeof(CommandQueue *), &queue, 0); @@ -76,6 +79,9 @@ void *worker(void *data) case Event::TaskKernel: { KernelEvent *e = (KernelEvent *)event; + CPUKernel *k = (CPUKernel *)e->kernel()->deviceDependentKernel(device); + e->setLastSlot(true); + k->callFunction(); break; } @@ -84,7 +90,7 @@ void *worker(void *data) } // Cleanups - if (success) + if (success && last_slot) { event->setStatus(Event::Complete); diff --git a/src/core/deviceinterface.h b/src/core/deviceinterface.h index b35ec20..bca8f13 100644 --- a/src/core/deviceinterface.h +++ b/src/core/deviceinterface.h @@ -7,6 +7,7 @@ namespace llvm { class PassManager; class Module; + class Function; } namespace Coal @@ -34,7 +35,8 @@ class DeviceInterface virtual DeviceBuffer *createDeviceBuffer(MemObject *buffer, cl_int *rs) = 0; virtual DeviceProgram *createDeviceProgram(Program *program) = 0; - virtual DeviceKernel *createDeviceKernel(Kernel *kernel) = 0; + virtual DeviceKernel *createDeviceKernel(Kernel *kernel, + llvm::Function *function) = 0; virtual void pushEvent(Event *event) = 0; @@ -65,7 +67,7 @@ class DeviceProgram virtual bool linkStdLib() const = 0; virtual void createOptimizationPasses(llvm::PassManager *manager, bool optimize) = 0; - virtual bool build(const llvm::Module *module) = 0; + virtual bool build(llvm::Module *module) = 0; }; class DeviceKernel diff --git a/src/core/events.cpp b/src/core/events.cpp index 68ed00e..2332f4a 100644 --- a/src/core/events.cpp +++ b/src/core/events.cpp @@ -370,8 +370,14 @@ KernelEvent::KernelEvent(CommandQueue *parent, cl_int *errcode_ret) : Event(parent, Queued, num_events_in_wait_list, event_wait_list, errcode_ret), p_kernel(kernel), p_work_dim(work_dim), p_global_work_offset(0), - p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0) + p_global_work_size(0), p_local_work_size(0), p_max_work_item_sizes(0), + p_last_slot(false) { + *errcode_ret = CL_SUCCESS; + + // Locking machinery + pthread_mutex_init(&p_mutex, 0); + // Sanity checks if (!kernel) { @@ -495,17 +501,87 @@ KernelEvent::KernelEvent(CommandQueue *parent, if (work_group_size > max_work_group_size) { *errcode_ret = CL_INVALID_WORK_GROUP_SIZE; + return; } // Check arguments (buffer alignment, image size, ...) - *errcode_ret = kernel->checkArgsForDevice(device); + for (int i=0; i<kernel->numArgs(); ++i) + { + const Kernel::Arg &a = kernel->arg(i); - if (*errcode_ret != CL_SUCCESS) - return; + if (a.kind() == Kernel::Arg::Buffer) + { + const MemObject *buffer = *(const MemObject **)(a.value(0)); + + if (buffer->type() == MemObject::SubBuffer) + { + cl_uint align; + *errcode_ret = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), + &align, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + size_t mask = 0; + + for (int i=0; i<align; ++i) + mask = 1 | (mask << 1); + + if (((SubBuffer *)buffer)->offset() | mask) + { + *errcode_ret = CL_MISALIGNED_SUB_BUFFER_OFFSET; + return; + } + } + } + else if (a.kind() == Kernel::Arg::Image2D) + { + const Image2D *image = *(const Image2D **)(a.value(0)); + size_t maxWidth, maxHeight; + + *errcode_ret = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH, + sizeof(size_t), &maxWidth, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT, + sizeof(size_t), &maxHeight, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (image->width() > maxWidth || image->height() > maxHeight) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + } + else if (a.kind() == Kernel::Arg::Image3D) + { + const Image3D *image = *(const Image3D **)a.value(0); + size_t maxWidth, maxHeight, maxDepth; + + *errcode_ret = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH, + sizeof(size_t), &maxWidth, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT, + sizeof(size_t), &maxHeight, 0); + *errcode_ret |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH, + sizeof(size_t), &maxDepth, 0); + + if (*errcode_ret != CL_SUCCESS) + return; + + if (image->width() > maxWidth || image->height() > maxHeight || + image->depth() > maxDepth) + { + *errcode_ret = CL_INVALID_IMAGE_SIZE; + return; + } + } + } } KernelEvent::~KernelEvent() { + pthread_mutex_destroy(&p_mutex); + if (p_global_work_offset) std::free(p_global_work_offset); @@ -554,6 +630,25 @@ Event::Type KernelEvent::type() const return Event::NDRangeKernel; } +bool KernelEvent::lastSlot() const +{ + bool rs; + KernelEvent *hack = (KernelEvent *)this; + + pthread_mutex_lock(&hack->p_mutex); + rs = p_last_slot; + pthread_mutex_unlock(&hack->p_mutex); + + return rs; +} + +void KernelEvent::setLastSlot(bool last_slot) +{ + pthread_mutex_lock(&p_mutex); + p_last_slot = last_slot; + pthread_mutex_unlock(&p_mutex); +} + static size_t one = 1; TaskEvent::TaskEvent(CommandQueue *parent, diff --git a/src/core/events.h b/src/core/events.h index 46eb2c2..528bfcb 100644 --- a/src/core/events.h +++ b/src/core/events.h @@ -4,6 +4,7 @@ #include "commandqueue.h" #include <vector> +#include <pthread.h> namespace Coal { @@ -169,12 +170,17 @@ class KernelEvent : public Event virtual Type type() const; + bool lastSlot() const; + void setLastSlot(bool last_slot); + private: cl_uint p_work_dim; size_t *p_global_work_offset, *p_global_work_size, *p_local_work_size, *p_max_work_item_sizes; Kernel *p_kernel; DeviceKernel *p_dev_kernel; + bool p_last_slot; + pthread_mutex_t p_mutex; }; class TaskEvent : public KernelEvent diff --git a/src/core/kernel.cpp b/src/core/kernel.cpp index 5df77f4..96e1c8d 100644 --- a/src/core/kernel.cpp +++ b/src/core/kernel.cpp @@ -7,6 +7,7 @@ #include <string> #include <iostream> #include <cstring> +#include <cstdlib> #include <llvm/Support/Casting.h> #include <llvm/Module.h> @@ -15,7 +16,7 @@ using namespace Coal; Kernel::Kernel(Program *program) -: p_program(program), p_references(1) +: p_program(program), p_references(1), p_local_args(false) { clRetainProgram((cl_program)program); // TODO: Say a kernel is attached to the program (that becomes unalterable) @@ -99,33 +100,35 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, for (int i=0; i<f->getNumParams(); ++i) { const llvm::Type *arg_type = f->getParamType(i); - Arg a; - - a.kind = Arg::Invalid; - a.vec_dim = 1; - a.file = Arg::Private; - a.kernel_alloc_size = 0; - a.set = false; + Arg::Kind kind = Arg::Invalid; + Arg::File file = Arg::Private; + unsigned short vec_dim = 1; if (arg_type->isPointerTy()) { // It's a pointer, dereference it const llvm::PointerType *p_type = llvm::cast<llvm::PointerType>(arg_type); - a.file = (Arg::File)p_type->getAddressSpace(); + file = (Arg::File)p_type->getAddressSpace(); arg_type = p_type->getElementType(); + // If it's a __local argument, we'll have to allocate memory at run time + if (file == Arg::Local) + p_local_args = true; + // Get the name of the type to see if it's something like image2d, etc std::string name = module->getTypeName(arg_type); if (name == "image2d") { // TODO: Address space qualifiers for image types, and read_only - a.kind = Arg::Image2D; + kind = Arg::Image2D; + file = Arg::Global; } else if (name == "image3d") { - a.kind = Arg::Image3D; + kind = Arg::Image3D; + file = Arg::Global; } else if (name == "sampler") { @@ -133,7 +136,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, } else { - a.kind = Arg::Buffer; + kind = Arg::Buffer; } } else @@ -143,18 +146,18 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, // It's a vector, we need its element's type const llvm::VectorType *v_type = llvm::cast<llvm::VectorType>(arg_type); - a.vec_dim = v_type->getNumElements(); + vec_dim = v_type->getNumElements(); arg_type = v_type->getElementType(); } // Get type kind if (arg_type->isFloatTy()) { - a.kind = Arg::Float; + kind = Arg::Float; } else if (arg_type->isDoubleTy()) { - a.kind = Arg::Double; + kind = Arg::Double; } else if (arg_type->isIntegerTy()) { @@ -162,27 +165,30 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, if (i_type->getBitWidth() == 8) { - a.kind = Arg::Int8; + kind = Arg::Int8; } else if (i_type->getBitWidth() == 16) { - a.kind = Arg::Int16; + kind = Arg::Int16; } else if (i_type->getBitWidth() == 32) { - a.kind = Arg::Int32; + kind = Arg::Int32; } else if (i_type->getBitWidth() == 64) { - a.kind = Arg::Int64; + kind = Arg::Int64; } } } // Check if we recognized the type - if (a.kind == Arg::Invalid) + if (kind == Arg::Invalid) return CL_INVALID_KERNEL_DEFINITION; + // Create arg + Arg a(vec_dim, file, kind); + // If we also have a function registered, check for signature compliance if (!append && a != p_args[i]) return CL_INVALID_KERNEL_DEFINITION; @@ -192,7 +198,7 @@ cl_int Kernel::addFunction(DeviceInterface *device, llvm::Function *function, p_args.push_back(a); } - dep.kernel = device->createDeviceKernel(this); + dep.kernel = device->createDeviceKernel(this, dep.function); p_device_dependent.push_back(dep); return CL_SUCCESS; @@ -205,31 +211,6 @@ llvm::Function *Kernel::function(DeviceInterface *device) const return dep.function; } -size_t Kernel::Arg::valueSize() const -{ - switch (kind) - { - case Invalid: - return 0; - case Int8: - return 1; - case Int16: - return 2; - case Int32: - return 4; - case Int64: - return 8; - case Float: - return sizeof(cl_float); - case Double: - return sizeof(double); - case Buffer: - case Image2D: - case Image3D: - return sizeof(cl_mem); - } -} - cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) { if (index > p_args.size()) @@ -238,7 +219,7 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) Arg &arg = p_args[index]; // Special case for __local pointers - if (arg.file == Arg::Local) + if (arg.file() == Arg::Local) { if (size == 0) return CL_INVALID_ARG_SIZE; @@ -246,7 +227,7 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) if (value != 0) return CL_INVALID_ARG_VALUE; - arg.kernel_alloc_size = size; + arg.setAllocAtKernelRuntime(size); return CL_SUCCESS; } @@ -258,17 +239,17 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) return CL_INVALID_ARG_SIZE; // Check for null values + cl_mem null_mem = 0; + if (!value) { - switch (arg.kind) + switch (arg.kind()) { case Arg::Buffer: case Arg::Image2D: case Arg::Image3D: // Special case buffers : value can be 0 (or point to 0) - arg.value.cl_mem_val = 0; - arg.set = true; - return CL_SUCCESS; + value = &null_mem; // TODO samplers default: @@ -277,13 +258,22 @@ cl_int Kernel::setArg(cl_uint index, size_t size, const void *value) } // Copy the data - std::memcpy(&arg.value, value, arg_size); - - arg.set = true; + arg.alloc(); + arg.loadData(value); return CL_SUCCESS; } +unsigned int Kernel::numArgs() const +{ + return p_args.size(); +} + +const Kernel::Arg &Kernel::arg(unsigned int index) const +{ + return p_args.at(index); +} + Program *Kernel::program() const { return p_program; @@ -293,79 +283,16 @@ bool Kernel::argsSpecified() const { for (int i=0; i<p_args.size(); ++i) { - if (!p_args[i].set) + if (!p_args[i].defined()) return false; } return true; } -cl_int Kernel::checkArgsForDevice(DeviceInterface *device) const +bool Kernel::needsLocalAllocation() const { - const DeviceDependent &dep = deviceDependent(device); - cl_int rs; - - for (int i=0; i<p_args.size(); ++i) - { - const Arg &a = p_args[i]; - - if (a.kind == Arg::Buffer) - { - MemObject *buffer = (MemObject *)a.value.cl_mem_val; - - if (buffer->type() == MemObject::SubBuffer) - { - cl_uint align; - rs = device->info(CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), - &align, 0); - - if (rs != CL_SUCCESS) return rs; - - size_t mask = 0; - - for (int i=0; i<align; ++i) - mask = 1 | (mask << 1); - - if (((SubBuffer *)buffer)->offset() | mask) - return CL_MISALIGNED_SUB_BUFFER_OFFSET; - } - } - else if (a.kind == Arg::Image2D) - { - Image2D *image = (Image2D *)a.value.cl_mem_val; - size_t maxWidth, maxHeight; - - rs = device->info(CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), - &maxWidth, 0); - rs |= device->info(CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), - &maxHeight, 0); - - if (rs != CL_SUCCESS) return rs; - - if (image->width() > maxWidth || image->height() > maxHeight) - return CL_INVALID_IMAGE_SIZE; - } - else if (a.kind == Arg::Image3D) - { - Image3D *image = (Image3D *)a.value.cl_mem_val; - size_t maxWidth, maxHeight, maxDepth; - - rs = device->info(CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), - &maxWidth, 0); - rs |= device->info(CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), - &maxHeight, 0); - rs |= device->info(CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), - &maxDepth, 0); - - if (rs != CL_SUCCESS) return rs; - - if (image->width() > maxWidth || image->height() > maxHeight || - image->depth() > maxDepth) - return CL_INVALID_IMAGE_SIZE; - } - } - - return CL_SUCCESS; + return p_local_args; } DeviceKernel *Kernel::deviceDependentKernel(DeviceInterface *device) const @@ -486,3 +413,106 @@ cl_int Kernel::workGroupInfo(DeviceInterface *device, return CL_SUCCESS; } + +/* + * Kernel::Arg + */ +Kernel::Arg::Arg(unsigned short vec_dim, File file, Kind kind) +: p_vec_dim(vec_dim), p_file(file), p_kind(kind), p_defined(false), + p_runtime_alloc(0), p_data(0) +{ + +} + +Kernel::Arg::~Arg() +{ + if (p_data) + std::free(p_data); +} + +void Kernel::Arg::alloc() +{ + if (!p_data) + p_data = std::malloc(p_vec_dim * valueSize()); +} + +void Kernel::Arg::loadData(const void *data) +{ + std::memcpy(p_data, data, p_vec_dim * valueSize()); + p_defined = true; +} + +void Kernel::Arg::setAllocAtKernelRuntime(size_t size) +{ + p_runtime_alloc = size; + p_defined = true; +} + +bool Kernel::Arg::operator!=(const Arg &b) +{ + bool same = (p_vec_dim == b.p_vec_dim) && + (p_file == b.p_file) && + (p_kind == b.p_kind); + + return !same; +} + +size_t Kernel::Arg::valueSize() const +{ + switch (p_kind) + { + case Invalid: + return 0; + case Int8: + return 1; + case Int16: + return 2; + case Int32: + return 4; + case Int64: + return 8; + case Float: + return sizeof(cl_float); + case Double: + return sizeof(double); + case Buffer: + case Image2D: + case Image3D: + return sizeof(cl_mem); + } +} + +unsigned short Kernel::Arg::vecDim() const +{ + return p_vec_dim; +} + +Kernel::Arg::File Kernel::Arg::file() const +{ + return p_file; +} + +Kernel::Arg::Kind Kernel::Arg::kind() const +{ + return p_kind; +} + +bool Kernel::Arg::defined() const +{ + return p_defined; +} + +size_t Kernel::Arg::allocAtKernelRuntime() const +{ + return p_runtime_alloc; +} + +const void *Kernel::Arg::value(unsigned short index) const +{ + const char *data = (const char *)p_data; + unsigned int offset = index * valueSize(); + + data += offset; + + return (const void *)data; +} diff --git a/src/core/kernel.h b/src/core/kernel.h index b10b77e..2c9ffde 100644 --- a/src/core/kernel.h +++ b/src/core/kernel.h @@ -25,6 +25,57 @@ class Kernel Kernel(Program *program); ~Kernel(); + class Arg + { + public: + enum File + { + Private = 0, + Global = 1, + Local = 2, + Constant = 3 + }; + enum Kind + { + Invalid, + Int8, + Int16, + Int32, + Int64, + Float, + Double, + Buffer, + Image2D, + Image3D + // TODO: Sampler + }; + + Arg(unsigned short vec_dim, File file, Kind kind); + ~Arg(); + + void alloc(); + void loadData(const void *data); + void setAllocAtKernelRuntime(size_t size); + + bool operator !=(const Arg &b); + + size_t valueSize() const; + unsigned short vecDim() const; + File file() const; + Kind kind() const; + bool defined() const; + size_t allocAtKernelRuntime() const; + const void *value(unsigned short index) const; + + private: + unsigned short p_vec_dim; + File p_file; + Kind p_kind; + void *p_data; + bool p_defined; + size_t p_runtime_alloc; + }; + void reference(); bool dereference(); @@ -32,12 +83,14 @@ class Kernel llvm::Module *module); llvm::Function *function(DeviceInterface *device) const; cl_int setArg(cl_uint index, size_t size, const void *value); + unsigned int numArgs() const; + const Arg &arg(unsigned int index) const; Program *program() const; DeviceKernel *deviceDependentKernel(DeviceInterface *device) const; bool argsSpecified() const; - cl_int checkArgsForDevice(DeviceInterface *device) const; + bool needsLocalAllocation() const; /*!< One or more arguments is __local */ cl_int info(cl_kernel_info param_name, size_t param_value_size, @@ -53,6 +106,7 @@ class Kernel Program *p_program; unsigned int p_references; std::string p_name; + bool p_local_args; struct DeviceDependent { @@ -62,55 +116,6 @@ class Kernel llvm::Module *module; }; - struct Arg - { - unsigned short vec_dim; - bool set; - size_t kernel_alloc_size; /*!< Size of the memory that must be allocated at kernel execution */ - - enum File - { - Private = 0, - Global = 1, - Local = 2, - Constant = 3 - } file; - - enum Kind - { - Invalid, - Int8, - Int16, - Int32, - Int64, - Float, - Double, - Buffer, - Image2D, - Image3D - // TODO: Sampler - } kind; - - union - { - #define TYPE_VAL(type) type type##_val - TYPE_VAL(uint8_t); - TYPE_VAL(uint16_t); - TYPE_VAL(uint32_t); - TYPE_VAL(uint64_t); - TYPE_VAL(cl_float); - TYPE_VAL(double); - TYPE_VAL(cl_mem); - #undef TYPE_VAL - } value; - - inline bool operator !=(const Arg &b) - { - return (kind != b.kind) || (vec_dim != b.vec_dim); - } - size_t valueSize() const; - }; - std::vector<DeviceDependent> p_device_dependent; std::vector<Arg> p_args; DeviceDependent null_dep; diff --git a/src/core/program.cpp b/src/core/program.cpp index dfae848..91c8d17 100644 --- a/src/core/program.cpp +++ b/src/core/program.cpp @@ -94,6 +94,24 @@ Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) } } +const Program::DeviceDependent &Program::deviceDependent(DeviceInterface *device) const +{ + for (int i=0; i<p_device_dependent.size(); ++i) + { + const DeviceDependent &rs = p_device_dependent[i]; + + if (rs.device == device || (!device && p_device_dependent.size() == 1)) + return rs; + } +} + +DeviceProgram *Program::deviceDependentProgram(DeviceInterface *device) const +{ + const DeviceDependent &dep = deviceDependent(device); + + return dep.program; +} + std::vector<llvm::Function *> Program::kernelFunctions(DeviceDependent &dep) { std::vector<llvm::Function *> rs; diff --git a/src/core/program.h b/src/core/program.h index 6036fe3..46ea88b 100644 --- a/src/core/program.h +++ b/src/core/program.h @@ -62,6 +62,7 @@ class Program Kernel *createKernel(const std::string &name, cl_int *errcode_ret); std::vector<Kernel *> createKernels(cl_int *errcode_ret); + DeviceProgram *deviceDependentProgram(DeviceInterface *device) const; cl_int info(cl_program_info param_name, size_t param_value_size, @@ -93,6 +94,7 @@ class Program void setDevices(cl_uint num_devices, DeviceInterface * const*devices); DeviceDependent &deviceDependent(DeviceInterface *device); + const DeviceDependent &deviceDependent(DeviceInterface *device) const; std::vector<llvm::Function *> kernelFunctions(DeviceDependent &dep); }; |