From f83c60e01ec4234d35277d221e50957f86ec814c Mon Sep 17 00:00:00 2001
From: Denis Steckelmacher <steckdenis@yahoo.fr>
Date: Tue, 9 Aug 2011 16:51:12 +0200
Subject: Implement barrier()

The most exciting built-in, needing to use obscure things like
makecontext() and swapcontext(). I'll properly document all what I've
just implemented during the last week of the Google Summer of Code
project.
---
 src/core/cpu/builtins.cpp | 211 ++++++++++++++++++++++++++++++++++++++++++++--
 src/core/cpu/builtins.h   |  39 +++++++++
 src/core/cpu/kernel.cpp   | 143 ++++++++++---------------------
 src/core/cpu/kernel.h     |  30 +++++--
 src/core/cpu/program.cpp  |   1 +
 src/core/cpu/worker.cpp   |  13 +++
 src/runtime/stdlib.h      |   8 ++
 tests/test_builtins.cpp   |  45 ++++++++--
 8 files changed, 376 insertions(+), 114 deletions(-)
 create mode 100644 src/core/cpu/builtins.h
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 6fdab30..64eb945 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -1,6 +1,205 @@
+#include "builtins.h"
 #include "kernel.h"
 
+#include "../events.h"
+
+#include <sys/mman.h>
+#include <signal.h>
+
+#include <llvm/Function.h>
+#include <iostream>
+#include <cstring>
+
+using namespace Coal;
+
+/*
+ * TLS-related functions
+ */
 __thread Coal::CPUKernelWorkGroup *g_work_group;
+__thread void *work_items_data;
+__thread size_t work_items_size;
+
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+{
+    g_work_group = current;
+}
+
+void *getWorkItemsData(size_t &size)
+{
+    size = work_items_size;
+    return work_items_data;
+}
+
+void setWorkItemsData(void *ptr, size_t size)
+{
+    work_items_data = ptr;
+    work_items_size = size;
+}
+
+/*
+ * Actual built-ins implementations
+ */
+cl_uint CPUKernelWorkGroup::getWorkDim() const
+{
+    return p_work_dim;
+}
+
+size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
+{
+    if (dimindx >p_work_dim)
+        return 1;
+
+    return p_event->global_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 1;
+
+    return p_event->local_work_size(dimindx);
+}
+
+size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_current_context->local_id[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 1;
+
+    return (p_event->global_work_size(dimindx) /
+            p_event->local_work_size(dimindx));
+}
+
+size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_index[dimindx];
+}
+
+size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
+{
+    if (dimindx > p_work_dim)
+        return 0;
+
+    return p_event->global_work_offset(dimindx);
+}
+
+void CPUKernelWorkGroup::barrier(unsigned int flags)
+{
+    p_had_barrier = true;
+
+    // Allocate or reuse TLS memory for the stacks (it isn't freed between
+    // the work groups, and even the kernels, so if we need less space than
+    // allocated, it's good)
+    if (!p_contexts)
+    {
+        if (p_current_work_item != 0)
+        {
+            // Completely abnormal, it means that not every work-items
+            // encounter the barrier
+            std::cerr << "*** Not every work-items of "
+                      << p_kernel->function()->getNameStr()
+                      << " calls barrier(); !" << std::endl;
+            return;
+        }
+
+        // Allocate or reuse the stacks
+        size_t contexts_size;
+        p_contexts = getWorkItemsData(contexts_size);
+        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));
+
+        if (!p_contexts || contexts_size < needed_size)
+        {
+            // We must allocate a new space
+            if (p_contexts)
+                munmap(p_contexts, contexts_size);
+
+            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
+                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+
+            setWorkItemsData(p_contexts, contexts_size);
+        }
+
+        // Now that we have a real main context, initialize it
+        p_current_context = getContextAddr(0);
+        p_current_context->initialized = 1;
+        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));
+
+        getcontext(&p_current_context->context);
+    }
+
+    // Take the next context
+    p_current_work_item++;
+    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;
+
+    Context *next = getContextAddr(p_current_work_item);
+    Context *main = getContextAddr(0);  // The context not created with makecontext
+
+    // If the next context isn't initialized, initialize it.
+    // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
+    if (next->initialized == 0)
+    {
+        next->initialized = 1;
+
+        // local-id of next is the one of the current context, but incVec'ed
+        std::memcpy(next->local_id, p_current_context->local_id,
+                    MAX_WORK_DIMS * sizeof(size_t));
+
+        incVec(p_work_dim, next->local_id, p_max_local_id);
+
+        // Initialize the next context
+        if (getcontext(&next->context) != 0)
+            return;
+
+        // Get its stack. It is located a next + sizeof(Context)
+        char *stack = (char *)next;
+        stack += sizeof(Context);
+
+        next->context.uc_link = &main->context;
+        next->context.uc_stack.ss_sp = stack;
+        next->context.uc_stack.ss_size = p_stack_size;
+
+        // Tell it to run the kernel function
+        makecontext(&next->context, p_kernel_func_addr, 0);
+    }
+
+    // Switch to the next context
+    ucontext_t *cur = &p_current_context->context;
+    p_current_context = next;
+
+    swapcontext(cur, &next->context);
+
+    // When we return here, it means that all the other work items encountered
+    // a barrier and that we returned to this one. We can continue.
+}
+
+void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
+{
+    std::cout << "OpenCL: Non-existant builtin function " << name
+              << " found in kernel " << p_kernel->function()->getNameStr()
+              << '.' << std::endl;
+}
+
+/*
+ * Built-in functions
+ */
 
 static size_t get_global_id(cl_uint dimindx)
 {
@@ -42,14 +241,14 @@ static size_t get_global_offset(uint dimindx)
     return g_work_group->getGlobalOffset(dimindx);
 }
 
-/*
- * Utility functions
- */
-void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
+static void barrier(unsigned int flags)
 {
-    g_work_group = current;
+    g_work_group->barrier(flags);
 }
 
+/*
+ * Bridge between LLVM and us
+ */
 static void unimplemented_stub()
 {
 }
@@ -72,6 +271,8 @@ void *getBuiltin(const std::string &name)
         return (void *)&get_group_id;
     else if (name == "get_global_offset")
         return (void *)&get_global_offset;
+    else if (name == "barrier")
+        return (void *)&barrier;
 
     // Function not found
     g_work_group->builtinNotFound(name);
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
new file mode 100644
index 0000000..1a703fb
--- /dev/null
+++ b/src/core/cpu/builtins.h
@@ -0,0 +1,39 @@
+#ifndef __BUILTINS_H__
+#define __BUILTINS_H__
+
+#include <string>
+
+namespace Coal {
+    class CPUKernelWorkGroup;
+}
+
+void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+void *getBuiltin(const std::string &name);
+void *getWorkItemsData(size_t &size);
+void setWorkItemsData(void *ptr, size_t size);
+
+template<typename T>
+bool incVec(unsigned long dims, T *vec, T *maxs)
+{
+    bool overflow = false;
+
+    for (unsigned int i=0; i<dims; ++i)
+    {
+        vec[i] += 1;
+
+        if (vec[i] > maxs[i])
+        {
+            vec[i] = 0;
+            overflow = true;
+        }
+        else
+        {
+            overflow = false;
+            break;
+        }
+    }
+
+    return overflow;
+}
+
+#endif
\ No newline at end of file
diff --git a/src/core/cpu/kernel.cpp b/src/core/cpu/kernel.cpp
index 61ef87f..20cb0d6 100644
--- a/src/core/cpu/kernel.cpp
+++ b/src/core/cpu/kernel.cpp
@@ -2,6 +2,7 @@
 #include "device.h"
 #include "buffer.h"
 #include "program.h"
+#include "builtins.h"
 
 #include "../kernel.h"
 #include "../memobject.h"
@@ -21,33 +22,10 @@
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
+#include <sys/mman.h>
 
 using namespace Coal;
 
-template<typename T>
-bool incVec(cl_ulong dims, T *vec, T *maxs)
-{
-    bool overflow = false;
-
-    for (cl_ulong i=0; i<dims; ++i)
-    {
-        vec[i] += 1;
-
-        if (vec[i] > maxs[i])
-        {
-            vec[i] = 0;
-            overflow = true;
-        }
-        else
-        {
-            overflow = false;
-            break;
-        }
-    }
-
-    return overflow;
-}
-
 static llvm::Constant *getPointerConstant(llvm::LLVMContext &C,
                                           llvm::Type *type,
                                           void *const *value)
@@ -411,19 +389,23 @@ CPUKernelWorkGroup::CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
                                        CPUKernelEvent *cpu_event,
                                        const size_t *work_group_index)
 : p_kernel(kernel), p_cpu_event(cpu_event), p_event(event),
-  p_work_dim(event->work_dim())
+  p_work_dim(event->work_dim()), p_contexts(0), p_stack_size(8192 /* TODO */),
+  p_had_barrier(false)
 {
 
     // Set index
     std::memcpy(p_index, work_group_index, p_work_dim * sizeof(size_t));
 
     // Set maxs and global id
+    p_num_work_items = 1;
+
     for (unsigned int i=0; i<p_work_dim; ++i)
     {
-        p_maxs[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
+        p_max_local_id[i] = event->local_work_size(i) - 1; // 0..n-1, not 1..n
+        p_num_work_items *= event->local_work_size(i);
 
         // Set global id
-        p_global_id[i] = (p_index[i] * event->local_work_size(i))
+        p_global_id_start_offset[i] = (p_index[i] * event->local_work_size(i))
                          + event->global_work_offset(i);
     }
 }
@@ -435,9 +417,6 @@ CPUKernelWorkGroup::~CPUKernelWorkGroup()
 
 bool CPUKernelWorkGroup::run()
 {
-    // Set current pos to 0
-    std::memset(p_current, 0, p_work_dim * sizeof(size_t));
-
     // Get the kernel function to call
     bool free_after = p_kernel->kernel()->needsLocalAllocation();
     std::vector<void *> local_to_free;
@@ -449,16 +428,41 @@ bool CPUKernelWorkGroup::run()
     Program *p = (Program *)p_kernel->kernel()->parent();
     CPUProgram *prog = (CPUProgram *)(p->deviceDependentProgram(p_kernel->device()));
 
-    void (*kernel_func_addr)() = (void(*)())prog->jit()->getPointerToFunction(kernel_func);
+    p_kernel_func_addr = (void(*)())prog->jit()->getPointerToFunction(kernel_func);
 
-    // Tell the builtins this thread will run a kernel
+    // Tell the builtins this thread will run a kernel work group
     setThreadLocalWorkGroup(this);
 
+    // Initialize the dummy context used by the builtins before a call to barrier()
+    p_current_work_item = 0;
+    p_current_context = &p_dummy_context;
+
+    std::memset(p_dummy_context.local_id, 0, p_work_dim * sizeof(size_t));
+
     do
     {
         // Simply call the "call function", it and the builtins will do the rest
-        kernel_func_addr();
-    } while (!incVec(p_work_dim, p_current, p_maxs));
+        p_kernel_func_addr();
+    } while (!p_had_barrier &&
+             !incVec(p_work_dim, p_dummy_context.local_id, p_max_local_id));
+
+    // If no barrier() call was made, all is fine. If not, only the first
+    // work-item has currently finished. We must let the others run.
+    if (p_had_barrier)
+    {
+        Context *main_context = p_current_context; // After the first swapcontext,
+                                                   // we will not be able to trust
+                                                   // p_current_context anymore.
+
+        // We'll call swapcontext for each remaining work-item. They will
+        // finish, and when they'll do so, this main context will be resumed, so
+        // it's easy (i starts from 1 because the main context already finished)
+        for (unsigned int i=1; i<p_num_work_items; ++i)
+        {
+            Context *ctx = getContextAddr(i);
+            swapcontext(&main_context->context, &ctx->context);
+        }
+    }
 
     // We may have some cleanup to do
     if (free_after)
@@ -475,71 +479,14 @@ bool CPUKernelWorkGroup::run()
     return true;
 }
 
-cl_uint CPUKernelWorkGroup::getWorkDim() const
+CPUKernelWorkGroup::Context *CPUKernelWorkGroup::getContextAddr(unsigned int index)
 {
-    return p_work_dim;
-}
+    size_t size;
+    char *data = (char *)p_contexts;
 
-size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 0;
+    // Each Context in data is an element of size p_stack_size + sizeof(Context)
+    size = p_stack_size + sizeof(Context);
+    size *= index;  // To get an offset
 
-    return p_global_id[dimindx] + p_current[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
-{
-    if (dimindx >p_work_dim)
-        return 1;
-
-    return p_event->global_work_size(dimindx);
-}
-
-size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 1;
-
-    return p_event->local_work_size(dimindx);
-}
-
-size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 0;
-
-    return p_current[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 1;
-
-    return (p_event->global_work_size(dimindx) /
-            p_event->local_work_size(dimindx));
-}
-
-size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 0;
-
-    return p_index[dimindx];
-}
-
-size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
-{
-    if (dimindx > p_work_dim)
-        return 0;
-
-    return p_event->global_work_offset(dimindx);
-}
-
-void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
-{
-    std::cout << "OpenCL: Non-existant builtin function " << name
-              << " found in kernel " << p_kernel->function()->getNameStr()
-              << '.' << std::endl;
+    return (Context *)(data + size); // Pointer to the context
 }
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 91d1dfd..4abc38d 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -7,6 +7,8 @@
 #include <llvm/ExecutionEngine/GenericValue.h>
 #include <vector>
 #include <string>
+
+#include <ucontext.h>
 #include <pthread.h>
 
 namespace llvm
@@ -68,6 +70,7 @@ class CPUKernelWorkGroup
         size_t getNumGroups(cl_uint dimindx) const;
         size_t getGroupID(cl_uint dimindx) const;
         size_t getGlobalOffset(cl_uint dimindx) const;
+        void barrier(unsigned int flags);
 
         void builtinNotFound(const std::string &name) const;
 
@@ -77,9 +80,27 @@ class CPUKernelWorkGroup
         KernelEvent *p_event;
         cl_uint p_work_dim;
         size_t p_index[MAX_WORK_DIMS],
-               p_current[MAX_WORK_DIMS],
-               p_maxs[MAX_WORK_DIMS],
-               p_global_id[MAX_WORK_DIMS];
+               p_max_local_id[MAX_WORK_DIMS],
+               p_global_id_start_offset[MAX_WORK_DIMS];
+
+        void (*p_kernel_func_addr)();
+
+        // Machinery to have barrier() working
+        struct Context
+        {
+            size_t local_id[MAX_WORK_DIMS];
+            ucontext_t context;
+            unsigned int initialized;
+        };
+
+        Context *getContextAddr(unsigned int index);
+
+        Context *p_current_context;
+        Context p_dummy_context;
+        void *p_contexts;
+        size_t p_stack_size;
+        unsigned int p_num_work_items, p_current_work_item;
+        bool p_had_barrier;
 };
 
 class CPUKernelEvent
@@ -105,7 +126,4 @@ class CPUKernelEvent
 
 }
 
-void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
-void *getBuiltin(const std::string &name);
-
 #endif
diff --git a/src/core/cpu/program.cpp b/src/core/cpu/program.cpp
index 653fb1e..1bc31de 100644
--- a/src/core/cpu/program.cpp
+++ b/src/core/cpu/program.cpp
@@ -1,6 +1,7 @@
 #include "program.h"
 #include "device.h"
 #include "kernel.h"
+#include "builtins.h"
 
 #include "../program.h"
 
diff --git a/src/core/cpu/worker.cpp b/src/core/cpu/worker.cpp
index 62fc2f0..a456714 100644
--- a/src/core/cpu/worker.cpp
+++ b/src/core/cpu/worker.cpp
@@ -2,12 +2,15 @@
 #include "device.h"
 #include "buffer.h"
 #include "kernel.h"
+#include "builtins.h"
 
 #include "../commandqueue.h"
 #include "../events.h"
 #include "../memobject.h"
 #include "../kernel.h"
 
+#include <sys/mman.h>
+
 #include <cstring>
 #include <iostream>
 
@@ -20,6 +23,9 @@ void *worker(void *data)
     cl_int errcode;
     Event *event;
 
+    // Initialize TLS
+    setWorkItemsData(0, 0);
+
     while (true)
     {
         event = device->getEvent(stop);
@@ -229,5 +235,12 @@ void *worker(void *data)
         }
     }
 
+    // Free mmapped() data if needed
+    size_t mapped_size;
+    void *mapped_data = getWorkItemsData(mapped_size);
+
+    if (mapped_data)
+        munmap(mapped_data, mapped_size);
+
     return 0;
 }
diff --git a/src/runtime/stdlib.h b/src/runtime/stdlib.h
index dae9043..527d5ac 100644
--- a/src/runtime/stdlib.h
+++ b/src/runtime/stdlib.h
@@ -66,6 +66,12 @@ COAL_VECTOR_SET(float);
 #define CLK_FILTER_NEAREST          0x00000000
 #define CLK_FILTER_LINEAR           0x00000100
 
+#define CLK_LOCAL_MEM_FENCE         0x00000001
+#define CLK_GLOBAL_MEM_FENCE        0x00000002
+
+/* Typedefs */
+typedef unsigned int cl_mem_fence_flags;
+
 /* Management functions */
 uint get_work_dim();
 size_t get_global_size(uint dimindx);
@@ -75,3 +81,5 @@ size_t get_local_id(uint dimindx);
 size_t get_num_groups(uint dimindx);
 size_t get_group_id(uint dimindx);
 size_t get_global_offset(uint dimindx);
+
+void barrier(cl_mem_fence_flags flags);
diff --git a/tests/test_builtins.cpp b/tests/test_builtins.cpp
index 1513428..cb8c7ab 100644
--- a/tests/test_builtins.cpp
+++ b/tests/test_builtins.cpp
@@ -12,13 +12,21 @@ const char sampler_source[] =
     "                            CLK_ADDRESS_MIRRORED_REPEAT |\n"
     "                            CLK_FILTER_NEAREST;\n"
     "\n"
-    "   if (sampler != good_sampler) *rs = 1;"
+    "   if (sampler != good_sampler) *rs = 1;\n"
+    "}\n";
+
+const char barrier_source[] =
+    "__kernel void test_case(__global uint *rs) {\n"
+    "   *rs = 0;\n"
+    "   barrier(0);\n"
+    "   *rs += 1;\n"
     "}\n";
 
 enum TestCaseKind
 {
     NormalKind,
-    SamplerKind
+    SamplerKind,
+    BarrierKind
 };
 
 /*
@@ -96,10 +104,25 @@ static uint32_t run_kernel(const char *source, TestCaseKind kind)
             result = clSetKernelArg(kernel, 1, sizeof(cl_sampler), &sampler);
             if (result != CL_SUCCESS) return 65547;
             break;
+
+        default:
+            break;
     }
 
-    result = clEnqueueTask(queue, kernel, 0, 0, &event);
-    if (result != CL_SUCCESS) return 65544;
+    if (kind == BarrierKind)
+    {
+        size_t local_size = 64;
+        size_t global_size = 64;
+
+        result = clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size,
+                                        &local_size, 0, 0, &event);
+        if (result != CL_SUCCESS) return 65544;
+    }
+    else
+    {
+        result = clEnqueueTask(queue, kernel, 0, 0, &event);
+        if (result != CL_SUCCESS) return 65544;
+    }
 
     result = clWaitForEvents(1, &event);
     if (result != CL_SUCCESS) return 65545;
@@ -138,7 +161,7 @@ static const char *default_error(uint32_t errcode)
         case 65543:
             return "Cannot set kernel argument";
         case 65544:
-            return "Cannot enqueue a task kernel";
+            return "Cannot enqueue the kernel";
         case 65545:
             return "Cannot wait for the event";
         case 65546:
@@ -172,10 +195,22 @@ START_TEST (test_sampler)
 }
 END_TEST
 
+START_TEST (test_barrier)
+{
+    uint32_t rs = run_kernel(barrier_source, BarrierKind);
+
+    fail_if(
+        rs != 0x40,
+        default_error(rs)
+    );
+}
+END_TEST
+
 TCase *cl_builtins_tcase_create(void)
 {
     TCase *tc = NULL;
     tc = tcase_create("builtins");
     tcase_add_test(tc, test_sampler);
+    tcase_add_test(tc, test_barrier);
     return tc;
 }
-- 
cgit v1.2.3