From f83c60e01ec4234d35277d221e50957f86ec814c Mon Sep 17 00:00:00 2001 From: Denis Steckelmacher Date: Tue, 9 Aug 2011 16:51:12 +0200 Subject: Implement barrier() The most exciting built-in, needing to use obscure things like makecontext() and swapcontext(). I'll properly document all what I've just implemented during the last week of the Google Summer of Code project. --- src/core/cpu/builtins.cpp | 211 ++++++++++++++++++++++++++++++++++++++++++++-- src/core/cpu/builtins.h | 39 +++++++++ src/core/cpu/kernel.cpp | 143 ++++++++++--------------------- src/core/cpu/kernel.h | 30 +++++-- src/core/cpu/program.cpp | 1 + src/core/cpu/worker.cpp | 13 +++ src/runtime/stdlib.h | 8 ++ tests/test_builtins.cpp | 45 ++++++++-- 8 files changed, 376 insertions(+), 114 deletions(-) create mode 100644 src/core/cpu/builtins.h diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp index 6fdab30..64eb945 100644 --- a/src/core/cpu/builtins.cpp +++ b/src/core/cpu/builtins.cpp @@ -1,6 +1,205 @@ +#include "builtins.h" #include "kernel.h" +#include "../events.h" + +#include +#include + +#include +#include +#include + +using namespace Coal; + +/* + * TLS-related functions + */ __thread Coal::CPUKernelWorkGroup *g_work_group; +__thread void *work_items_data; +__thread size_t work_items_size; + +void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current) +{ + g_work_group = current; +} + +void *getWorkItemsData(size_t &size) +{ + size = work_items_size; + return work_items_data; +} + +void setWorkItemsData(void *ptr, size_t size) +{ + work_items_data = ptr; + work_items_size = size; +} + +/* + * Actual built-ins implementations + */ +cl_uint CPUKernelWorkGroup::getWorkDim() const +{ + return p_work_dim; +} + +size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx]; +} + +size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const +{ + if (dimindx >p_work_dim) + return 1; + + return p_event->global_work_size(dimindx); +} + +size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 1; + + return p_event->local_work_size(dimindx); +} + +size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_current_context->local_id[dimindx]; +} + +size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 1; + + return (p_event->global_work_size(dimindx) / + p_event->local_work_size(dimindx)); +} + +size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_index[dimindx]; +} + +size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const +{ + if (dimindx > p_work_dim) + return 0; + + return p_event->global_work_offset(dimindx); +} + +void CPUKernelWorkGroup::barrier(unsigned int flags) +{ + p_had_barrier = true; + + // Allocate or reuse TLS memory for the stacks (it isn't freed between + // the work groups, and even the kernels, so if we need less space than + // allocated, it's good) + if (!p_contexts) + { + if (p_current_work_item != 0) + { + // Completely abnormal, it means that not every work-items + // encounter the barrier + std::cerr << "*** Not every work-items of " + << p_kernel->function()->getNameStr() + << " calls barrier(); !" << std::endl; + return; + } + + // Allocate or reuse the stacks + size_t contexts_size; + p_contexts = getWorkItemsData(contexts_size); + size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context)); + + if (!p_contexts || contexts_size < needed_size) + { + // We must allocate a new space + if (p_contexts) + munmap(p_contexts, contexts_size); + + p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */ + MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + + setWorkItemsData(p_contexts, contexts_size); + } + + // Now that we have a real main context, initialize it + p_current_context = getContextAddr(0); + p_current_context->initialized = 1; + std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t)); + + getcontext(&p_current_context->context); + } + + // Take the next context + p_current_work_item++; + if (p_current_work_item == p_num_work_items) p_current_work_item = 0; + + Context *next = getContextAddr(p_current_work_item); + Context *main = getContextAddr(0); // The context not created with makecontext + + // If the next context isn't initialized, initialize it. + // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized + if (next->initialized == 0) + { + next->initialized = 1; + + // local-id of next is the one of the current context, but incVec'ed + std::memcpy(next->local_id, p_current_context->local_id, + MAX_WORK_DIMS * sizeof(size_t)); + + incVec(p_work_dim, next->local_id, p_max_local_id); + + // Initialize the next context + if (getcontext(&next->context) != 0) + return; + + // Get its stack. It is located a next + sizeof(Context) + char *stack = (char *)next; + stack += sizeof(Context); + + next->context.uc_link = &main->context; + next->context.uc_stack.ss_sp = stack; + next->context.uc_stack.ss_size = p_stack_size; + + // Tell it to run the kernel function + makecontext(&next->context, p_kernel_func_addr, 0); + } + + // Switch to the next context + ucontext_t *cur = &p_current_context->context; + p_current_context = next; + + swapcontext(cur, &next->context); + + // When we return here, it means that all the other work items encountered + // a barrier and that we returned to this one. We can continue. +} + +void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const +{ + std::cout << "OpenCL: Non-existant builtin function " << name + << " found in kernel " << p_kernel->function()->getNameStr() + << '.' << std::endl; +} + +/* + * Built-in functions + */ static size_t get_global_id(cl_uint dimindx) { @@ -42,14 +241,14 @@ static size_t get_global_offset(uint dimindx) return g_work_group->getGlobalOffset(dimindx); } -/* - * Utility functions - */ -void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current) +static void barrier(unsigned int flags) { - g_work_group = current; + g_work_group->barrier(flags); } +/* + * Bridge between LLVM and us + */ static void unimplemented_stub() { } @@ -72,6 +271,8 @@ void *getBuiltin(const std::string &name) return (void *)&get_group_id; else if (name == "get_global_offset") return (void *)&get_global_offset; + else if (name == "barrier") + return (void *)&barrier; // Function not found g_work_group->builtinNotFound(name); diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h new file mode 100644 index 0000000..1a703fb --- /dev/null +++ b/src/core/cpu/builtins.h @@ -0,0 +1,39 @@ +#ifndef __BUILTINS_H__ +#define __BUILTINS_H__ + +#include + +namespace Coal { + class CPUKernelWorkGroup; +} + +void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current); +void *getBuiltin(const std::string &name); +void *getWorkItemsData(size_t &size); +void setWorkItemsData(void *ptr, size_t size); + +template +bool incVec(unsigned long dims, T *vec, T *maxs) +{ + bool overflow = false; + + for (unsigned int i=0; i maxs[i]) + { + vec[i] = 0; + overflow = true; + } + else + { + overflow = false; + break; + } + } + + return overflow; +} + +#endif \ No newline at end of file diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp index 61ef87f..20cb0d6 100644 --- a/src/core/cpu/kernel.cpp +++ b/src/core/cpu/kernel.cpp @@ -2,6 +2,7 @@ #include "device.h" #include "buffer.h" #include "program.h" +#include "builtins.h" #include "../kernel.h" #include "../memobject.h" @@ -21,33 +22,10 @@ #include #include #include +#include using namespace Coal; -template -bool incVec(cl_ulong dims, T *vec, T *maxs) -{ - bool overflow = false; - - for (cl_ulong i=0; i maxs[i]) - { - vec[i] = 0; - overflow = true; - } - else - { - overflow = false; - break; - } - } - - return overflow; -} - static llvm::Constant *getPointerConstant(llvm::LLVMContext &C, llvm::Type *type, void *const *value) @@ -411,19 +389,23 @@ CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event, CPUKernelEvent *cpu_event, const size_t *work_group_index) : p_kernel(kernel), p_cpu_event(cpu_event), p_event(event), - p_work_dim(event->work_dim()) + p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */), + p_had_barrier(false) { // Set index std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t)); // Set maxs and global id + p_num_work_items = 1; + for (unsigned int i=0; ilocal_work_size(i) - 1; // 0..n-1, not 1..n + p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n + p_num_work_items *= event->local_work_size(i); // Set global id - p_global_id[i] = (p_index[i] * event->local_work_size(i)) + p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i)) + event->global_work_offset(i); } } @@ -435,9 +417,6 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup() bool CPUKernelWorkGroup::run() { - // Set current pos to 0 - std::memset(p_current, 0, p_work_dim * sizeof(size_t)); - // Get the kernel function to call bool free_after = p_kernel->kernel()->needsLocalAllocation(); std::vector local_to_free; @@ -449,16 +428,41 @@ bool CPUKernelWorkGroup::run() Program *p = (Program *)p_kernel->kernel()->parent(); CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device())); - void (*kernel_func_addr)() = (void(*)())prog->jit()->getPointerToFunction(kernel_func); + p_kernel_func_addr = (void(*)())prog->jit()->getPointerToFunction(kernel_func); - // Tell the builtins this thread will run a kernel + // Tell the builtins this thread will run a kernel work group setThreadLocalWorkGroup(this); + // Initialize the dummy context used by the builtins before a call to barrier() + p_current_work_item = 0; + p_current_context = &p_dummy_context; + + std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t)); + do { // Simply call the "call function", it and the builtins will do the rest - kernel_func_addr(); - } while (!incVec(p_work_dim, p_current, p_maxs)); + p_kernel_func_addr(); + } while (!p_had_barrier && + !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id)); + + // If no barrier() call was made, all is fine. If not, only the first + // work-item has currently finished. We must let the others run. + if (p_had_barrier) + { + Context *main_context = p_current_context; // After the first swapcontext, + // we will not be able to trust + // p_current_context anymore. + + // We'll call swapcontext for each remaining work-item. They will + // finish, and when they'll do so, this main context will be resumed, so + // it's easy (i starts from 1 because the main context already finished) + for (unsigned int i=1; icontext, &ctx->context); + } + } // We may have some cleanup to do if (free_after) @@ -475,71 +479,14 @@ bool CPUKernelWorkGroup::run() return true; } -cl_uint CPUKernelWorkGroup::getWorkDim() const +CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index) { - return p_work_dim; -} + size_t size; + char *data = (char *)p_contexts; -size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 0; + // Each Context in data is an element of size p_stack_size + sizeof(Context) + size = p_stack_size + sizeof(Context); + size *= index; // To get an offset - return p_global_id[dimindx] + p_current[dimindx]; -} - -size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const -{ - if (dimindx >p_work_dim) - return 1; - - return p_event->global_work_size(dimindx); -} - -size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 1; - - return p_event->local_work_size(dimindx); -} - -size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 0; - - return p_current[dimindx]; -} - -size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 1; - - return (p_event->global_work_size(dimindx) / - p_event->local_work_size(dimindx)); -} - -size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 0; - - return p_index[dimindx]; -} - -size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const -{ - if (dimindx > p_work_dim) - return 0; - - return p_event->global_work_offset(dimindx); -} - -void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const -{ - std::cout << "OpenCL: Non-existant builtin function " << name - << " found in kernel " << p_kernel->function()->getNameStr() - << '.' << std::endl; + return (Context *)(data + size); // Pointer to the context } diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h index 91d1dfd..4abc38d 100644 --- a/src/core/cpu/kernel.h +++ b/src/core/cpu/kernel.h @@ -7,6 +7,8 @@ #include #include #include + +#include #include namespace llvm @@ -68,6 +70,7 @@ class CPUKernelWorkGroup size_t getNumGroups(cl_uint dimindx) const; size_t getGroupID(cl_uint dimindx) const; size_t getGlobalOffset(cl_uint dimindx) const; + void barrier(unsigned int flags); void builtinNotFound(const std::string &name) const; @@ -77,9 +80,27 @@ class CPUKernelWorkGroup KernelEvent *p_event; cl_uint p_work_dim; size_t p_index[MAX_WORK_DIMS], - p_current[MAX_WORK_DIMS], - p_maxs[MAX_WORK_DIMS], - p_global_id[MAX_WORK_DIMS]; + p_max_local_id[MAX_WORK_DIMS], + p_global_id_start_offset[MAX_WORK_DIMS]; + + void (*p_kernel_func_addr)(); + + // Machinery to have barrier() working + struct Context + { + size_t local_id[MAX_WORK_DIMS]; + ucontext_t context; + unsigned int initialized; + }; + + Context *getContextAddr(unsigned int index); + + Context *p_current_context; + Context p_dummy_context; + void *p_contexts; + size_t p_stack_size; + unsigned int p_num_work_items, p_current_work_item; + bool p_had_barrier; }; class CPUKernelEvent @@ -105,7 +126,4 @@ class CPUKernelEvent } -void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current); -void *getBuiltin(const std::string &name); - #endif diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp index 653fb1e..1bc31de 100644 --- a/src/core/cpu/program.cpp +++ b/src/core/cpu/program.cpp @@ -1,6 +1,7 @@ #include "program.h" #include "device.h" #include "kernel.h" +#include "builtins.h" #include "../program.h" diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp index 62fc2f0..a456714 100644 --- a/src/core/cpu/worker.cpp +++ b/src/core/cpu/worker.cpp @@ -2,12 +2,15 @@ #include "device.h" #include "buffer.h" #include "kernel.h" +#include "builtins.h" #include "../commandqueue.h" #include "../events.h" #include "../memobject.h" #include "../kernel.h" +#include + #include #include @@ -20,6 +23,9 @@ void *worker(void *data) cl_int errcode; Event *event; + // Initialize TLS + setWorkItemsData(0, 0); + while (true) { event = device->getEvent(stop); @@ -229,5 +235,12 @@ void *worker(void *data) } } + // Free mmapped() data if needed + size_t mapped_size; + void *mapped_data = getWorkItemsData(mapped_size); + + if (mapped_data) + munmap(mapped_data, mapped_size); + return 0; } diff --git a/src/runtime/stdlib.h b/src/runtime/stdlib.h index dae9043..527d5ac 100644 --- a/src/runtime/stdlib.h +++ b/src/runtime/stdlib.h @@ -66,6 +66,12 @@ COAL_VECTOR_SET(float); #define CLK_FILTER_NEAREST 0x00000000 #define CLK_FILTER_LINEAR 0x00000100 +#define CLK_LOCAL_MEM_FENCE 0x00000001 +#define CLK_GLOBAL_MEM_FENCE 0x00000002 + +/* Typedefs */ +typedef unsigned int cl_mem_fence_flags; + /* Management functions */ uint get_work_dim(); size_t get_global_size(uint dimindx); @@ -75,3 +81,5 @@ size_t get_local_id(uint dimindx); size_t get_num_groups(uint dimindx); size_t get_group_id(uint dimindx); size_t get_global_offset(uint dimindx); + +void barrier(cl_mem_fence_flags flags); diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp index 1513428..cb8c7ab 100644 --- a/tests/test_builtins.cpp +++ b/tests/test_builtins.cpp @@ -12,13 +12,21 @@ const char sampler_source[] = " CLK_ADDRESS_MIRRORED_REPEAT |\n" " CLK_FILTER_NEAREST;\n" "\n" - " if (sampler != good_sampler) *rs = 1;" + " if (sampler != good_sampler) *rs = 1;\n" + "}\n"; + +const char barrier_source[] = + "__kernel void test_case(__global uint *rs) {\n" + " *rs = 0;\n" + " barrier(0);\n" + " *rs += 1;\n" "}\n"; enum TestCaseKind { NormalKind, - SamplerKind + SamplerKind, + BarrierKind }; /* @@ -96,10 +104,25 @@ static uint32_t run_kernel(const char *source, TestCaseKind kind) result = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler); if (result != CL_SUCCESS) return 65547; break; + + default: + break; } - result = clEnqueueTask(queue, kernel, 0, 0, &event); - if (result != CL_SUCCESS) return 65544; + if (kind == BarrierKind) + { + size_t local_size = 64; + size_t global_size = 64; + + result = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size, + &local_size, 0, 0, &event); + if (result != CL_SUCCESS) return 65544; + } + else + { + result = clEnqueueTask(queue, kernel, 0, 0, &event); + if (result != CL_SUCCESS) return 65544; + } result = clWaitForEvents(1, &event); if (result != CL_SUCCESS) return 65545; @@ -138,7 +161,7 @@ static const char *default_error(uint32_t errcode) case 65543: return "Cannot set kernel argument"; case 65544: - return "Cannot enqueue a task kernel"; + return "Cannot enqueue the kernel"; case 65545: return "Cannot wait for the event"; case 65546: @@ -172,10 +195,22 @@ START_TEST (test_sampler) } END_TEST +START_TEST (test_barrier) +{ + uint32_t rs = run_kernel(barrier_source, BarrierKind); + + fail_if( + rs != 0x40, + default_error(rs) + ); +} +END_TEST + TCase *cl_builtins_tcase_create(void) { TCase *tc = NULL; tc = tcase_create("builtins"); tcase_add_test(tc, test_sampler); + tcase_add_test(tc, test_barrier); return tc; } -- cgit v1.2.3