summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDenis Steckelmacher <steckdenis@yahoo.fr>2011-08-09 16:51:12 +0200
committerDenis Steckelmacher <steckdenis@yahoo.fr>2011-08-09 16:51:12 +0200
commitf83c60e01ec4234d35277d221e50957f86ec814c (patch)
treef2ad868c4690f9e638e5b4ab7f967910fa42276f
parentfddcc53e44aecd16d284b9c3b8a4eb5a37e752e7 (diff)
Implement barrier()
The most exciting built-in, needing to use obscure things like makecontext() and swapcontext(). I'll properly document all what I've just implemented during the last week of the Google Summer of Code project.
-rw-r--r--src/core/cpu/builtins.cpp211
-rw-r--r--src/core/cpu/builtins.h39
-rw-r--r--src/core/cpu/kernel.cpp143
-rw-r--r--src/core/cpu/kernel.h30
-rw-r--r--src/core/cpu/program.cpp1
-rw-r--r--src/core/cpu/worker.cpp13
-rw-r--r--src/runtime/stdlib.h8
-rw-r--r--tests/test_builtins.cpp45
8 files changed, 376 insertions, 114 deletions
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 6fdab30..64eb945 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -1,6 +1,205 @@
+#include "builtins.h"
#include "kernel.h"
+#include "../events.h"
+
+#include <sys/mman.h>
+#include <signal.h>
+
+#include <llvm/Function.h>
+#include <iostream>
+#include <cstring>
+
+using namespace Coal;
+
+/*
+ * TLS-related functions
+ */
__thread Coal::CPUKernelWorkGroup *g_work_group;
+__thread void *work_items_data;
+__thread size_t work_items_size;
+
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+{
+ g_work_group = current;
+}
+
+void *getWorkItemsData(size_t &size)
+{
+ size = work_items_size;
+ return work_items_data;
+}
+
+void setWorkItemsData(void *ptr, size_t size)
+{
+ work_items_data = ptr;
+ work_items_size = size;
+}
+
+/*
+ * Actual built-ins implementations
+ */
+cl_uint CPUKernelWorkGroup::getWorkDim() const
+{
+ return p_work_dim;
+}
+
+size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 0;
+
+ return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
+{
+ if (dimindx >p_work_dim)
+ return 1;
+
+ return p_event->global_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 1;
+
+ return p_event->local_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 0;
+
+ return p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 1;
+
+ return (p_event->global_work_size(dimindx) /
+ p_event->local_work_size(dimindx));
+}
+
+size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 0;
+
+ return p_index[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
+{
+ if (dimindx > p_work_dim)
+ return 0;
+
+ return p_event->global_work_offset(dimindx);
+}
+
+void CPUKernelWorkGroup::barrier(unsigned int flags)
+{
+ p_had_barrier = true;
+
+ // Allocate or reuse TLS memory for the stacks (it isn't freed between
+ // the work groups, and even the kernels, so if we need less space than
+ // allocated, it's good)
+ if (!p_contexts)
+ {
+ if (p_current_work_item != 0)
+ {
+ // Completely abnormal, it means that not every work-items
+ // encounter the barrier
+ std::cerr << "*** Not every work-items of "
+ << p_kernel->function()->getNameStr()
+ << " calls barrier(); !" << std::endl;
+ return;
+ }
+
+ // Allocate or reuse the stacks
+ size_t contexts_size;
+ p_contexts = getWorkItemsData(contexts_size);
+ size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+ if (!p_contexts || contexts_size < needed_size)
+ {
+ // We must allocate a new space
+ if (p_contexts)
+ munmap(p_contexts, contexts_size);
+
+ p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+
+ setWorkItemsData(p_contexts, contexts_size);
+ }
+
+ // Now that we have a real main context, initialize it
+ p_current_context = getContextAddr(0);
+ p_current_context->initialized = 1;
+ std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+ getcontext(&p_current_context->context);
+ }
+
+ // Take the next context
+ p_current_work_item++;
+ if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+ Context *next = getContextAddr(p_current_work_item);
+ Context *main = getContextAddr(0); // The context not created with makecontext
+
+ // If the next context isn't initialized, initialize it.
+ // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
+ if (next->initialized == 0)
+ {
+ next->initialized = 1;
+
+ // local-id of next is the one of the current context, but incVec'ed
+ std::memcpy(next->local_id, p_current_context->local_id,
+ MAX_WORK_DIMS * sizeof(size_t));
+
+ incVec(p_work_dim, next->local_id, p_max_local_id);
+
+ // Initialize the next context
+ if (getcontext(&next->context) != 0)
+ return;
+
+ // Get its stack. It is located a next + sizeof(Context)
+ char *stack = (char *)next;
+ stack += sizeof(Context);
+
+ next->context.uc_link = &main->context;
+ next->context.uc_stack.ss_sp = stack;
+ next->context.uc_stack.ss_size = p_stack_size;
+
+ // Tell it to run the kernel function
+ makecontext(&next->context, p_kernel_func_addr, 0);
+ }
+
+ // Switch to the next context
+ ucontext_t *cur = &p_current_context->context;
+ p_current_context = next;
+
+ swapcontext(cur, &next->context);
+
+ // When we return here, it means that all the other work items encountered
+ // a barrier and that we returned to this one. We can continue.
+}
+
+void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
+{
+ std::cout << "OpenCL: Non-existant builtin function " << name
+ << " found in kernel " << p_kernel->function()->getNameStr()
+ << '.' << std::endl;
+}
+
+/*
+ * Built-in functions
+ */
static size_t get_global_id(cl_uint dimindx)
{
@@ -42,14 +241,14 @@ static size_t get_global_offset(uint dimindx)
return g_work_group->getGlobalOffset(dimindx);
}
-/*
- * Utility functions
- */
-void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+static void barrier(unsigned int flags)
{
- g_work_group = current;
+ g_work_group->barrier(flags);
}
+/*
+ * Bridge between LLVM and us
+ */
static void unimplemented_stub()
{
}
@@ -72,6 +271,8 @@ void *getBuiltin(const std::string &name)
return (void *)&get_group_id;
else if (name == "get_global_offset")
return (void *)&get_global_offset;
+ else if (name == "barrier")
+ return (void *)&barrier;
// Function not found
g_work_group->builtinNotFound(name);
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
new file mode 100644
index 0000000..1a703fb
--- /dev/null
+++ b/src/core/cpu/builtins.h
@@ -0,0 +1,39 @@
+#ifndef __BUILTINS_H__
+#define __BUILTINS_H__
+
+#include <string>
+
+namespace Coal {
+ class CPUKernelWorkGroup;
+}
+
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+void *getBuiltin(const std::string &name);
+void *getWorkItemsData(size_t &size);
+void setWorkItemsData(void *ptr, size_t size);
+
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+ bool overflow = false;
+
+ for (unsigned int i=0; i<dims; ++i)
+ {
+ vec[i] += 1;
+
+ if (vec[i] > maxs[i])
+ {
+ vec[i] = 0;
+ overflow = true;
+ }
+ else
+ {
+ overflow = false;
+ break;
+ }
+ }
+
+ return overflow;
+}
+
+#endif \ No newline at end of file
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 61ef87f..20cb0d6 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -2,6 +2,7 @@
#include "device.h"
#include "buffer.h"
#include "program.h"
+#include "builtins.h"
#include "../kernel.h"
#include "../memobject.h"
@@ -21,33 +22,10 @@
#include <cstdlib>
#include <cstring>
#include <iostream>
+#include <sys/mman.h>
using namespace Coal;
-template<typename T>
-bool incVec(cl_ulong dims, T *vec, T *maxs)
-{
- bool overflow = false;
-
- for (cl_ulong i=0; i<dims; ++i)
- {
- vec[i] += 1;
-
- if (vec[i] > maxs[i])
- {
- vec[i] = 0;
- overflow = true;
- }
- else
- {
- overflow = false;
- break;
- }
- }
-
- return overflow;
-}
-
static llvm::Constant *getPointerConstant(llvm::LLVMContext &C,
llvm::Type *type,
void *const *value)
@@ -411,19 +389,23 @@ CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
CPUKernelEvent *cpu_event,
const size_t *work_group_index)
: p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
- p_work_dim(event->work_dim())
+ p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
+ p_had_barrier(false)
{
// Set index
std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));
// Set maxs and global id
+ p_num_work_items = 1;
+
for (unsigned int i=0; i<p_work_dim; ++i)
{
- p_maxs[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
+ p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
+ p_num_work_items *= event->local_work_size(i);
// Set global id
- p_global_id[i] = (p_index[i] * event->local_work_size(i))
+ p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i))
+ event->global_work_offset(i);
}
}
@@ -435,9 +417,6 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup()
bool CPUKernelWorkGroup::run()
{
- // Set current pos to 0
- std::memset(p_current, 0, p_work_dim * sizeof(size_t));
-
// Get the kernel function to call
bool free_after = p_kernel->kernel()->needsLocalAllocation();
std::vector<void *> local_to_free;
@@ -449,16 +428,41 @@ bool CPUKernelWorkGroup::run()
Program *p = (Program *)p_kernel->kernel()->parent();
CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));
- void (*kernel_func_addr)() = (void(*)())prog->jit()->getPointerToFunction(kernel_func);
+ p_kernel_func_addr = (void(*)())prog->jit()->getPointerToFunction(kernel_func);
- // Tell the builtins this thread will run a kernel
+ // Tell the builtins this thread will run a kernel work group
setThreadLocalWorkGroup(this);
+ // Initialize the dummy context used by the builtins before a call to barrier()
+ p_current_work_item = 0;
+ p_current_context = &p_dummy_context;
+
+ std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));
+
do
{
// Simply call the "call function", it and the builtins will do the rest
- kernel_func_addr();
- } while (!incVec(p_work_dim, p_current, p_maxs));
+ p_kernel_func_addr();
+ } while (!p_had_barrier &&
+ !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));
+
+ // If no barrier() call was made, all is fine. If not, only the first
+ // work-item has currently finished. We must let the others run.
+ if (p_had_barrier)
+ {
+ Context *main_context = p_current_context; // After the first swapcontext,
+ // we will not be able to trust
+ // p_current_context anymore.
+
+ // We'll call swapcontext for each remaining work-item. They will
+ // finish, and when they'll do so, this main context will be resumed, so
+ // it's easy (i starts from 1 because the main context already finished)
+ for (unsigned int i=1; i<p_num_work_items; ++i)
+ {
+ Context *ctx = getContextAddr(i);
+ swapcontext(&main_context->context, &ctx->context);
+ }
+ }
// We may have some cleanup to do
if (free_after)
@@ -475,71 +479,14 @@ bool CPUKernelWorkGroup::run()
return true;
}
-cl_uint CPUKernelWorkGroup::getWorkDim() const
+CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
{
- return p_work_dim;
-}
+ size_t size;
+ char *data = (char *)p_contexts;
-size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 0;
+ // Each Context in data is an element of size p_stack_size + sizeof(Context)
+ size = p_stack_size + sizeof(Context);
+ size *= index; // To get an offset
- return p_global_id[dimindx] + p_current[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
-{
- if (dimindx >p_work_dim)
- return 1;
-
- return p_event->global_work_size(dimindx);
-}
-
-size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 1;
-
- return p_event->local_work_size(dimindx);
-}
-
-size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 0;
-
- return p_current[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 1;
-
- return (p_event->global_work_size(dimindx) /
- p_event->local_work_size(dimindx));
-}
-
-size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 0;
-
- return p_index[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
-{
- if (dimindx > p_work_dim)
- return 0;
-
- return p_event->global_work_offset(dimindx);
-}
-
-void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
-{
- std::cout << "OpenCL: Non-existant builtin function " << name
- << " found in kernel " << p_kernel->function()->getNameStr()
- << '.' << std::endl;
+ return (Context *)(data + size); // Pointer to the context
}
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 91d1dfd..4abc38d 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -7,6 +7,8 @@
#include <llvm/ExecutionEngine/GenericValue.h>
#include <vector>
#include <string>
+
+#include <ucontext.h>
#include <pthread.h>
namespace llvm
@@ -68,6 +70,7 @@ class CPUKernelWorkGroup
size_t getNumGroups(cl_uint dimindx) const;
size_t getGroupID(cl_uint dimindx) const;
size_t getGlobalOffset(cl_uint dimindx) const;
+ void barrier(unsigned int flags);
void builtinNotFound(const std::string &name) const;
@@ -77,9 +80,27 @@ class CPUKernelWorkGroup
KernelEvent *p_event;
cl_uint p_work_dim;
size_t p_index[MAX_WORK_DIMS],
- p_current[MAX_WORK_DIMS],
- p_maxs[MAX_WORK_DIMS],
- p_global_id[MAX_WORK_DIMS];
+ p_max_local_id[MAX_WORK_DIMS],
+ p_global_id_start_offset[MAX_WORK_DIMS];
+
+ void (*p_kernel_func_addr)();
+
+ // Machinery to have barrier() working
+ struct Context
+ {
+ size_t local_id[MAX_WORK_DIMS];
+ ucontext_t context;
+ unsigned int initialized;
+ };
+
+ Context *getContextAddr(unsigned int index);
+
+ Context *p_current_context;
+ Context p_dummy_context;
+ void *p_contexts;
+ size_t p_stack_size;
+ unsigned int p_num_work_items, p_current_work_item;
+ bool p_had_barrier;
};
class CPUKernelEvent
@@ -105,7 +126,4 @@ class CPUKernelEvent
}
-void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
-void *getBuiltin(const std::string &name);
-
#endif
diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp
index 653fb1e..1bc31de 100644
--- a/src/core/cpu/program.cpp
+++ b/src/core/cpu/program.cpp
@@ -1,6 +1,7 @@
#include "program.h"
#include "device.h"
#include "kernel.h"
+#include "builtins.h"
#include "../program.h"
diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp
index 62fc2f0..a456714 100644
--- a/src/core/cpu/worker.cpp
+++ b/src/core/cpu/worker.cpp
@@ -2,12 +2,15 @@
#include "device.h"
#include "buffer.h"
#include "kernel.h"
+#include "builtins.h"
#include "../commandqueue.h"
#include "../events.h"
#include "../memobject.h"
#include "../kernel.h"
+#include <sys/mman.h>
+
#include <cstring>
#include <iostream>
@@ -20,6 +23,9 @@ void *worker(void *data)
cl_int errcode;
Event *event;
+ // Initialize TLS
+ setWorkItemsData(0, 0);
+
while (true)
{
event = device->getEvent(stop);
@@ -229,5 +235,12 @@ void *worker(void *data)
}
}
+ // Free mmapped() data if needed
+ size_t mapped_size;
+ void *mapped_data = getWorkItemsData(mapped_size);
+
+ if (mapped_data)
+ munmap(mapped_data, mapped_size);
+
return 0;
}
diff --git a/src/runtime/stdlib.h b/src/runtime/stdlib.h
index dae9043..527d5ac 100644
--- a/src/runtime/stdlib.h
+++ b/src/runtime/stdlib.h
@@ -66,6 +66,12 @@ COAL_VECTOR_SET(float);
#define CLK_FILTER_NEAREST 0x00000000
#define CLK_FILTER_LINEAR 0x00000100
+#define CLK_LOCAL_MEM_FENCE 0x00000001
+#define CLK_GLOBAL_MEM_FENCE 0x00000002
+
+/* Typedefs */
+typedef unsigned int cl_mem_fence_flags;
+
/* Management functions */
uint get_work_dim();
size_t get_global_size(uint dimindx);
@@ -75,3 +81,5 @@ size_t get_local_id(uint dimindx);
size_t get_num_groups(uint dimindx);
size_t get_group_id(uint dimindx);
size_t get_global_offset(uint dimindx);
+
+void barrier(cl_mem_fence_flags flags);
diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp
index 1513428..cb8c7ab 100644
--- a/tests/test_builtins.cpp
+++ b/tests/test_builtins.cpp
@@ -12,13 +12,21 @@ const char sampler_source[] =
" CLK_ADDRESS_MIRRORED_REPEAT |\n"
" CLK_FILTER_NEAREST;\n"
"\n"
- " if (sampler != good_sampler) *rs = 1;"
+ " if (sampler != good_sampler) *rs = 1;\n"
+ "}\n";
+
+const char barrier_source[] =
+ "__kernel void test_case(__global uint *rs) {\n"
+ " *rs = 0;\n"
+ " barrier(0);\n"
+ " *rs += 1;\n"
"}\n";
enum TestCaseKind
{
NormalKind,
- SamplerKind
+ SamplerKind,
+ BarrierKind
};
/*
@@ -96,10 +104,25 @@ static uint32_t run_kernel(const char *source, TestCaseKind kind)
result = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
if (result != CL_SUCCESS) return 65547;
break;
+
+ default:
+ break;
}
- result = clEnqueueTask(queue, kernel, 0, 0, &event);
- if (result != CL_SUCCESS) return 65544;
+ if (kind == BarrierKind)
+ {
+ size_t local_size = 64;
+ size_t global_size = 64;
+
+ result = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size,
+ &local_size, 0, 0, &event);
+ if (result != CL_SUCCESS) return 65544;
+ }
+ else
+ {
+ result = clEnqueueTask(queue, kernel, 0, 0, &event);
+ if (result != CL_SUCCESS) return 65544;
+ }
result = clWaitForEvents(1, &event);
if (result != CL_SUCCESS) return 65545;
@@ -138,7 +161,7 @@ static const char *default_error(uint32_t errcode)
case 65543:
return "Cannot set kernel argument";
case 65544:
- return "Cannot enqueue a task kernel";
+ return "Cannot enqueue the kernel";
case 65545:
return "Cannot wait for the event";
case 65546:
@@ -172,10 +195,22 @@ START_TEST (test_sampler)
}
END_TEST
+START_TEST (test_barrier)
+{
+ uint32_t rs = run_kernel(barrier_source, BarrierKind);
+
+ fail_if(
+ rs != 0x40,
+ default_error(rs)
+ );
+}
+END_TEST
+
TCase *cl_builtins_tcase_create(void)
{
TCase *tc = NULL;
tc = tcase_create("builtins");
tcase_add_test(tc, test_sampler);
+ tcase_add_test(tc, test_barrier);
return tc;
}