summaryrefslogtreecommitdiff
path: root/src/core/cpu/builtins.cpp
blob: 64eb9450cf08b30be1ae16525065922ae596be87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#include "builtins.h"
#include "kernel.h"

#include "../events.h"

#include <sys/mman.h>
#include <signal.h>

#include <llvm/Function.h>
#include <iostream>
#include <cstring>

using namespace Coal;

/*
 * TLS-related functions
 */
__thread Coal::CPUKernelWorkGroup *g_work_group;
__thread void *work_items_data;
__thread size_t work_items_size;

void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
{
    g_work_group = current;
}

void *getWorkItemsData(size_t &size)
{
    size = work_items_size;
    return work_items_data;
}

void setWorkItemsData(void *ptr, size_t size)
{
    work_items_data = ptr;
    work_items_size = size;
}

/*
 * Actual built-ins implementations
 */
cl_uint CPUKernelWorkGroup::getWorkDim() const
{
    return p_work_dim;
}

size_t CPUKernelWorkGroup::getGlobalId(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 0;

    return p_global_id_start_offset[dimindx] + p_current_context->local_id[dimindx];
}

size_t CPUKernelWorkGroup::getGlobalSize(cl_uint dimindx) const
{
    if (dimindx >p_work_dim)
        return 1;

    return p_event->global_work_size(dimindx);
}

size_t CPUKernelWorkGroup::getLocalSize(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 1;

    return p_event->local_work_size(dimindx);
}

size_t CPUKernelWorkGroup::getLocalID(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 0;

    return p_current_context->local_id[dimindx];
}

size_t CPUKernelWorkGroup::getNumGroups(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 1;

    return (p_event->global_work_size(dimindx) /
            p_event->local_work_size(dimindx));
}

size_t CPUKernelWorkGroup::getGroupID(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 0;

    return p_index[dimindx];
}

size_t CPUKernelWorkGroup::getGlobalOffset(cl_uint dimindx) const
{
    if (dimindx > p_work_dim)
        return 0;

    return p_event->global_work_offset(dimindx);
}

void CPUKernelWorkGroup::barrier(unsigned int flags)
{
    p_had_barrier = true;

    // Allocate or reuse TLS memory for the stacks (it isn't freed between
    // the work groups, and even the kernels, so if we need less space than
    // allocated, it's good)
    if (!p_contexts)
    {
        if (p_current_work_item != 0)
        {
            // Completely abnormal, it means that not every work-items
            // encounter the barrier
            std::cerr << "*** Not every work-items of "
                      << p_kernel->function()->getNameStr()
                      << " calls barrier(); !" << std::endl;
            return;
        }

        // Allocate or reuse the stacks
        size_t contexts_size;
        p_contexts = getWorkItemsData(contexts_size);
        size_t needed_size = p_num_work_items * (p_stack_size + sizeof(Context));

        if (!p_contexts || contexts_size < needed_size)
        {
            // We must allocate a new space
            if (p_contexts)
                munmap(p_contexts, contexts_size);

            p_contexts = mmap(0, needed_size, PROT_EXEC | PROT_READ | PROT_WRITE, /* People say a stack must be executable */
                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

            setWorkItemsData(p_contexts, contexts_size);
        }

        // Now that we have a real main context, initialize it
        p_current_context = getContextAddr(0);
        p_current_context->initialized = 1;
        std::memset(p_current_context->local_id, 0, p_work_dim * sizeof(size_t));

        getcontext(&p_current_context->context);
    }

    // Take the next context
    p_current_work_item++;
    if (p_current_work_item == p_num_work_items) p_current_work_item = 0;

    Context *next = getContextAddr(p_current_work_item);
    Context *main = getContextAddr(0);  // The context not created with makecontext

    // If the next context isn't initialized, initialize it.
    // Note: mmap zeroes the memory, so next->initialized == 0 if it isn't initialized
    if (next->initialized == 0)
    {
        next->initialized = 1;

        // local-id of next is the one of the current context, but incVec'ed
        std::memcpy(next->local_id, p_current_context->local_id,
                    MAX_WORK_DIMS * sizeof(size_t));

        incVec(p_work_dim, next->local_id, p_max_local_id);

        // Initialize the next context
        if (getcontext(&next->context) != 0)
            return;

        // Get its stack. It is located a next + sizeof(Context)
        char *stack = (char *)next;
        stack += sizeof(Context);

        next->context.uc_link = &main->context;
        next->context.uc_stack.ss_sp = stack;
        next->context.uc_stack.ss_size = p_stack_size;

        // Tell it to run the kernel function
        makecontext(&next->context, p_kernel_func_addr, 0);
    }

    // Switch to the next context
    ucontext_t *cur = &p_current_context->context;
    p_current_context = next;

    swapcontext(cur, &next->context);

    // When we return here, it means that all the other work items encountered
    // a barrier and that we returned to this one. We can continue.
}

void CPUKernelWorkGroup::builtinNotFound(const std::string &name) const
{
    std::cout << "OpenCL: Non-existant builtin function " << name
              << " found in kernel " << p_kernel->function()->getNameStr()
              << '.' << std::endl;
}

/*
 * Built-in functions
 */

static size_t get_global_id(cl_uint dimindx)
{
    return g_work_group->getGlobalId(dimindx);
}

static cl_uint get_work_dim()
{
    return g_work_group->getWorkDim();
}

static size_t get_global_size(uint dimindx)
{
    return g_work_group->getGlobalSize(dimindx);
}

static size_t get_local_size(uint dimindx)
{
    return g_work_group->getLocalSize(dimindx);
}

static size_t get_local_id(uint dimindx)
{
    return g_work_group->getLocalID(dimindx);
}

static size_t get_num_groups(uint dimindx)
{
    return g_work_group->getNumGroups(dimindx);
}

static size_t get_group_id(uint dimindx)
{
    return g_work_group->getGroupID(dimindx);
}

static size_t get_global_offset(uint dimindx)
{
    return g_work_group->getGlobalOffset(dimindx);
}

static void barrier(unsigned int flags)
{
    g_work_group->barrier(flags);
}

/*
 * Bridge between LLVM and us
 */
static void unimplemented_stub()
{
}

void *getBuiltin(const std::string &name)
{
    if (name == "get_global_id")
        return (void *)&get_global_id;
    else if (name == "get_work_dim")
        return (void *)&get_work_dim;
    else if (name == "get_global_size")
        return (void *)&get_global_size;
    else if (name == "get_local_size")
        return (void *)&get_local_size;
    else if (name == "get_local_id")
        return (void *)&get_local_id;
    else if (name == "get_num_groups")
        return (void *)&get_num_groups;
    else if (name == "get_group_id")
        return (void *)&get_group_id;
    else if (name == "get_global_offset")
        return (void *)&get_global_offset;
    else if (name == "barrier")
        return (void *)&barrier;

    // Function not found
    g_work_group->builtinNotFound(name);

    return (void *)&unimplemented_stub;
}