summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/core/cpu/buffer.h19
-rw-r--r--src/core/cpu/builtins.cpp12
-rw-r--r--src/core/cpu/builtins.h73
-rw-r--r--src/core/cpu/device.h28
-rw-r--r--src/core/cpu/kernel.h145
-rw-r--r--src/core/cpu/program.h35
-rw-r--r--src/core/cpu/worker.h12
7 files changed, 305 insertions, 19 deletions
diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h
index 5da39a3..5ca901a 100644
--- a/src/core/cpu/buffer.h
+++ b/src/core/cpu/buffer.h
@@ -25,6 +25,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file buffer.h
+ * \brief CPU buffer
+ */
+
#ifndef __CPU_BUFFER_H__
#define __CPU_BUFFER_H__
@@ -36,15 +41,27 @@ namespace Coal
class CPUDevice;
class MemObject;
+/**
+ * \brief CPU implementation of \c Coal::MemObject
+ *
+ * This class is responsible for the actual allocation of buffer objects, using
+ * \c malloc() or by reusing a given \c host_ptr.
+ */
class CPUBuffer : public DeviceBuffer
{
public:
+ /**
+ * \brief Constructor
+ * \param device Device for which the buffer is allocated
+ * \param buffer \c Coal::MemObject holding information about the buffer
+ * \param rs return code (\c CL_SUCCESS if all is good)
+ */
CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs);
~CPUBuffer();
bool allocate();
DeviceInterface *device() const;
- void *data() const;
+ void *data() const; /*!< \brief Pointer to the buffer's data */
void *nativeGlobalPointer() const;
bool allocated() const;
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 217e55d..98a6e65 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -28,6 +28,12 @@
/**
* \file cpu/builtins.cpp
* \brief Native OpenCL C built-in functions
+ *
+ * All these built-ins are directly called by kernels. When the LLVM JIT
+ * sees a function name it doesn't know, it calls \c getBuiltin() with this
+ * name as parameter. This function then returns the address of an actual
+ * function implementation, that finally gets called by the kernel when
+ * it is run.
*/
#include "builtins.h"
@@ -64,9 +70,9 @@ unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
/*
* TLS-related functions
*/
-__thread Coal::CPUKernelWorkGroup *g_work_group;
-__thread void *work_items_data;
-__thread size_t work_items_size;
+__thread Coal::CPUKernelWorkGroup *g_work_group; /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */
+__thread void *work_items_data; /*!< \brief Space allocated for work-items stacks, see \ref barrier */
+__thread size_t work_items_size; /*!< \brief Size of \c work_items_data, see \ref barrier */
void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
{
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
index 9328cf3..71ffea3 100644
--- a/src/core/cpu/builtins.h
+++ b/src/core/cpu/builtins.h
@@ -25,6 +25,10 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file builtins.h
+ * \brief CPU built-in functions
+ */
#ifndef __BUILTINS_H__
#define __BUILTINS_H__
@@ -34,11 +38,65 @@ namespace Coal {
class CPUKernelWorkGroup;
}
+/**
+ * \brief Set the current kernel work-group of this thread
+ * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group.
+ */
void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+
+/**
+ * \brief Return the address of a built-in function given its name
+ * \param name name of the built-in whose address is requested
+ */
void *getBuiltin(const std::string &name);
+
+/**
+ * \brief Work-item stacks
+ * \see \ref barrier
+ * \param size size of the allocated space for stacks
+ * \return address of the allocated space for stacks
+ */
void *getWorkItemsData(size_t &size);
+
+/**
+ * \brief Set work-item stacks
+ * \see \ref barrier
+ * \param ptr address of allocated space for stacks
+ * \param size size of the allocated space for stacks
+ */
void setWorkItemsData(void *ptr, size_t size);
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * This function is used to increment a vector, given a set of maximum values
+ * that each of its elements can reach before the next one is incremented.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results :
+ *
+ * \code
+ * {0, 0, 1}
+ * {0, 1, 0}
+ * {0, 1, 1}
+ * {0, 2, 0}
+ * {0, 2, 1}
+ * {0, 3, 0}
+ * {0, 3, 1}
+ * {1, 0, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ * element of \p vec cannot go.
+ * \return false if the increment was ok, true if \p vec was already at its
+ * maximum value and couldn't be further incremented.
+ */
template<typename T>
bool incVec(unsigned long dims, T *vec, T *maxs)
{
@@ -63,6 +121,21 @@ bool incVec(unsigned long dims, T *vec, T *maxs)
return overflow;
}
+/**
+ * \brief Address of a pixel in an image
+ *
+ * This function is heavily used when Clover needs to address a pixel or a byte
+ * in a rectangular or three-dimensional image or buffer.
+ *
+ * \param base address of the first pixel in the image (address of the image itself)
+ * \param x X coordinate, must be less than \c width
+ * \param y Y coordinate, must be less than \c height
+ * \param z Z coordinate, must be less than \c depth (1 for 2D arrays)
+ * \param row_pitch size in bytes of a row of pixels in the image
+ * \param slice_pitch size in bytes of a slice in a 3D array
+ * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when
+ * coordinates are in pixels and not in bytes.
+ */
unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
size_t row_pitch, size_t slice_pitch,
unsigned int bytes_per_pixel);
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
index 3a15b2b..36a6bb3 100644
--- a/src/core/cpu/device.h
+++ b/src/core/cpu/device.h
@@ -25,6 +25,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file cpu/device.h
+ * \brief CPU device
+ */
+
#ifndef __CPU_DEVICE_H__
#define __CPU_DEVICE_H__
@@ -41,12 +46,31 @@ class Event;
class Program;
class Kernel;
+/**
+ * \brief CPU device
+ *
+ * This class is the base of all the CPU-accelerated OpenCL processing. It
+ * creates and manages subclasses such as \c Coal::DeviceBuffer,
+ * \c Coal::DeviceProgram and \c Coal::DeviceKernel.
+ *
+ * This class and the aforementioned ones work together to compile and run
+ * kernels using the LLVM JIT, manage buffers, provide built-in functions
+ * and do all of this in a multithreaded fashion using worker threads.
+ *
+ * \see \ref events
+ */
class CPUDevice : public DeviceInterface
{
public:
CPUDevice();
~CPUDevice();
+ /**
+ * \brief Initialize the CPU device
+ *
+ * This function creates the worker threads and gets information about
+ * the host system for the \c numCPUs() and \c cpuMhz() functions.
+ */
void init();
cl_int info(cl_device_info param_name,
@@ -65,8 +89,8 @@ class CPUDevice : public DeviceInterface
void pushEvent(Event *event);
Event *getEvent(bool &stop);
- unsigned int numCPUs() const;
- float cpuMhz() const;
+ unsigned int numCPUs() const; /*!< \brief Number of logical CPU cores on the system */
+ float cpuMhz() const; /*!< \brief Speed of the CPU in MHz */
private:
unsigned int p_cores, p_num_events;
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 70e348c..c965741 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -25,6 +25,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file cpu/kernel.h
+ * \brief CPU kernel
+ */
+
#ifndef __CPU_KERNEL_H__
#define __CPU_KERNEL_H__
@@ -53,9 +58,26 @@ class KernelEvent;
class Image2D;
class Image3D;
+/**
+ * \brief CPU kernel
+ *
+ * This class holds passive information about a kernel (\c Coal::Kernel object
+ * and device on which it is run) and provides the \c callFunction() function.
+ *
+ * This function is described at the end of \ref llvm .
+ *
+ * \see Coal::CPUKernelWorkGroup
+ */
class CPUKernel : public DeviceKernel
{
public:
+ /**
+ * \brief Constructor
+ * \param device device on which the kernel will be run
+ * \param kernel \c Coal::Kernel object holding information about this
+ * kernel
+ * \param function \c llvm::Function to run
+ */
CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
~CPUKernel();
@@ -66,11 +88,45 @@ class CPUKernel : public DeviceKernel
size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
size_t global_work_size) const;
- Kernel *kernel() const;
- CPUDevice *device() const;
-
- llvm::Function *function() const;
- llvm::Function *callFunction();
+ Kernel *kernel() const; /*!< \brief \c Coal::Kernel object this kernel will run */
+ CPUDevice *device() const; /*!< \brief device on which the kernel will be run */
+
+ llvm::Function *function() const; /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */
+ llvm::Function *callFunction(); /*!< \brief stub function used to run the kernel, see \ref llvm */
+
+ /**
+ * \brief Calculate where to place a value in an array
+ *
+ * This function is used to calculate where to place a value in an
+ * array given its size, properly aligning it.
+ *
+ * This function is called repeatedly to obtain the aligned position of
+ * each value that must be placed in the array.
+ *
+ * \code
+ * size_t array_len = 0, array_offset = 0;
+ * void *array;
+ *
+ * // First, get the array size given alignment constraints
+ * typeOffset(array_len, sizeof(int));
+ * typeOffset(array_len, sizeof(float));
+ * typeOffset(array_len, sizeof(void *));
+ *
+ * // Then, allocate memory
+ * array = malloc(array_len);
+ *
+ * // Finally, place the arguments
+ * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
+ * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
+ * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
+ * \endcode
+ *
+ * \param offset offset at which the value will be placed. This variable
+ * gets incremented by <tt>type_len + padding</tt>.
+ * \param type_len size in bytes of the value that will be stored
+ * \return offset at which the value will be stored (equal to \p offset
+ * before incrementation).
+ */
static size_t typeOffset(size_t &offset, size_t type_len);
private:
@@ -82,18 +138,64 @@ class CPUKernel : public DeviceKernel
class CPUKernelEvent;
+/**
+ * \brief CPU kernel work-group
+ *
+ * This class represents a bulk of work-items that will be run. It is the one
+ * that actually runs the kernel for each of its elements.
+ *
+ * \see \ref llvm
+ * \nosubgrouping
+ */
class CPUKernelWorkGroup
{
public:
+ /**
+ * \brief Constructor
+ * \param kernel kernel to run
+ * \param event event containing information about the kernel run
+ * \param cpu_event CPU-specific information and cache about \p event
+ * \param work_group_index index of this work-group in the kernel
+ */
CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
CPUKernelEvent *cpu_event,
const size_t *work_group_index);
~CPUKernelWorkGroup();
+ /**
+ * \brief Build a structure of arguments
+ *
+ * As C doesn't support calling functions with variable arguments
+ * unknown at compile time, this function builds the list of
+ * arguments in memory. This array will then be passed to an LLVM stub
+ * function reading it and passing its values to the actual kernel.
+ *
+ * \see \ref llvm
+ * \param locals_to_free if this kernel takes \c __local arguments, they
+ * must be \c malloc()'ed for every work-group.
+ * They are placed in this vector to be
+ * \c free()'ed at the end of \c run().
+ * \return address of a memory location containing the arguments
+ */
void *callArgs(std::vector<void *> &locals_to_free);
+
+ /**
+ * \brief Run the work-group
+ *
+ * This function is the core of CPU-acceleration. It runs the work-items
+ * of this work-group given the correct arguments.
+ *
+ * \see \ref llvm
+ * \see \ref barrier
+ * \see callArgs()
+ * \return true if success, false in case of an error
+ */
bool run();
- // Native functions
+ /**
+ * \name Native implementation of built-in OpenCL C functions
+ * @{
+ */
size_t getGlobalId(cl_uint dimindx) const;
cl_uint getWorkDim() const;
size_t getGlobalSize(cl_uint dimindx) const;
@@ -124,7 +226,13 @@ class CPUKernelWorkGroup
uint32_t sampler) const;
void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
uint32_t sampler) const;
+ /**
+ * @}
+ */
+ /**
+ * \brief Function called when a built-in name cannot be found
+ */
void builtinNotFound(const std::string &name) const;
private:
@@ -174,20 +282,33 @@ class CPUKernelWorkGroup
bool p_had_barrier;
};
+/**
+ * \brief CPU-specific information about a kernel event
+ *
+ * This class, put in a \c Coal::KernelEvent device-data field
+ * (see \c Coal::Event::setDeviceData()), is responsible for dispatching the
+ * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads.
+ */
class CPUKernelEvent
{
public:
+ /**
+ * \brief Constructor
+ * \param device device running the kernel
+ * \param event \c Coal::KernelEvent holding device-agnostic data
+ * about the event
+ */
CPUKernelEvent(CPUDevice *device, KernelEvent *event);
~CPUKernelEvent();
- bool reserve(); /*!< The next Work Group that will execute will be the last. Locks the event */
- bool finished(); /*!< All the work groups have finished */
- CPUKernelWorkGroup *takeInstance(); /*!< Must be called exactly one time after reserve(). Unlocks the event */
+ bool reserve(); /*!< \brief The next Work Group that will execute will be the last. Locks the event */
+ bool finished(); /*!< \brief All the work groups have finished */
+ CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */
- void *kernelArgs() const;
- void cacheKernelArgs(void *args);
+ void *kernelArgs() const; /*!< \brief Return the cached kernel arguments */
+ void cacheKernelArgs(void *args); /*!< \brief Cache pre-built kernel arguments */
- void workGroupFinished();
+ void workGroupFinished(); /*!< \brief A work-group has just finished */
private:
CPUDevice *p_device;
diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h
index 34668f2..350d248 100644
--- a/src/core/cpu/program.h
+++ b/src/core/cpu/program.h
@@ -25,6 +25,11 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file cpu/program.h
+ * \brief CPU program
+ */
+
#ifndef __CPU_PROGRAM_H__
#define __CPU_PROGRAM_H__
@@ -42,9 +47,23 @@ namespace Coal
class CPUDevice;
class Program;
+/**
+ * \brief CPU program
+ *
+ * This class implements the \c Coal::DeviceProgram interface for CPU
+ * acceleration.
+ *
+ * Its main purpose is to initialize a \c llvm::JIT object to run LLVM bitcode,
+ * in \c initJIT().
+ */
class CPUProgram : public DeviceProgram
{
public:
+ /**
+ * \brief Constructor
+ * \param device CPU device to which this program is attached
+ * \param program \c Coal::Program that will be run
+ */
CPUProgram(CPUDevice *device, Program *program);
~CPUProgram();
@@ -52,8 +71,22 @@ class CPUProgram : public DeviceProgram
void createOptimizationPasses(llvm::PassManager *manager, bool optimize);
bool build(llvm::Module *module);
+ /**
+ * \brief Initialize an LLVM JIT
+ *
+ * This function creates a \c llvm::JIT object to run this program on
+ * the CPU. A few implementation details :
+ *
+ * - The JIT is set not to resolve unknown symbols using \c dlsym().
+ * This way, a malicious kernel cannot execute arbitrary code on
+ * the host by declaring \c libc functions and calling them.
+ * - All the unknown function names are passed to \c getBuiltin() to
+ * get native built-in implementations.
+ *
+ * \return true if success, false otherwise
+ */
bool initJIT();
- llvm::ExecutionEngine *jit() const;
+ llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */
private:
CPUDevice *p_device;
diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h
index ec462f7..66130c6 100644
--- a/src/core/cpu/worker.h
+++ b/src/core/cpu/worker.h
@@ -25,9 +25,21 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+/**
+ * \file worker.h
+ * \brief Function run by the CPU worker threads
+ */
+
#ifndef __CPU_WORKER_H__
#define __CPU_WORKER_H__
+/**
+ * \brief Main loop of the CPU worker threads
+ *
+ * This function is run by as many threads as there are CPU cores on the host
+ * system. As explained by \ref events , this function waits until there
+ * are \c Coal::Event objects to process and handles them.
+ */
void *worker(void *data);
#endif