7 files changed, 305 insertions, 19 deletions
diff --git a/src/core/cpu/buffer.h b/src/core/cpu/buffer.h
index 5da39a3..5ca901a 100644
--- a/src/core/cpu/buffer.h
+++ b/src/core/cpu/buffer.h
@@ -25,6 +25,11 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file buffer.h
+ * \brief CPU buffer
+ */
+
 #ifndef __CPU_BUFFER_H__
 #define __CPU_BUFFER_H__
 
@@ -36,15 +41,27 @@ namespace Coal
 class CPUDevice;
 class MemObject;
 
+/**
+ * \brief CPU implementation of \c Coal::MemObject
+ *
+ * This class is responsible for the actual allocation of buffer objects, using
+ * \c malloc() or by reusing a given \c host_ptr.
+ */
 class CPUBuffer : public DeviceBuffer
 {
     public:
+        /**
+         * \brief Constructor
+         * \param device Device for which the buffer is allocated
+         * \param buffer \c Coal::MemObject holding information about the buffer
+         * \param rs return code (\c CL_SUCCESS if all is good)
+         */
         CPUBuffer(CPUDevice *device, MemObject *buffer, cl_int *rs);
         ~CPUBuffer();
 
         bool allocate();
         DeviceInterface *device() const;
-        void *data() const;
+        void *data() const;                 /*!< \brief Pointer to the buffer's data */
         void *nativeGlobalPointer() const;
         bool allocated() const;
 
diff --git a/src/core/cpu/builtins.cpp b/src/core/cpu/builtins.cpp
index 217e55d..98a6e65 100644
--- a/src/core/cpu/builtins.cpp
+++ b/src/core/cpu/builtins.cpp
@@ -28,6 +28,12 @@
 /**
  * \file cpu/builtins.cpp
  * \brief Native OpenCL C built-in functions
+ *
+ * All these built-ins are directly called by kernels. When the LLVM JIT
+ * sees a function name it doesn't know, it calls \c getBuiltin() with this
+ * name as parameter. This function then returns the address of an actual
+ * function implementation, that finally gets called by the kernel when
+ * it is run.
  */
 
 #include "builtins.h"
@@ -64,9 +70,9 @@ unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
 /*
  * TLS-related functions
  */
-__thread Coal::CPUKernelWorkGroup *g_work_group;
-__thread void *work_items_data;
-__thread size_t work_items_size;
+__thread Coal::CPUKernelWorkGroup *g_work_group;    /*!< \brief \c Coal::CPUKernelWorkGroup currently running on this thread */
+__thread void *work_items_data;                     /*!< \brief Space allocated for work-items stacks, see \ref barrier */
+__thread size_t work_items_size;                    /*!< \brief Size of \c work_items_data, see \ref barrier */
 
 void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current)
 {
diff --git a/src/core/cpu/builtins.h b/src/core/cpu/builtins.h
index 9328cf3..71ffea3 100644
--- a/src/core/cpu/builtins.h
+++ b/src/core/cpu/builtins.h
@@ -25,6 +25,10 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file builtins.h
+ * \brief CPU built-in functions
+ */
 #ifndef __BUILTINS_H__
 #define __BUILTINS_H__
 
@@ -34,11 +38,65 @@ namespace Coal {
     class CPUKernelWorkGroup;
 }
 
+/**
+ * \brief Set the current kernel work-group of this thread
+ * \param current \c Coal::CPUKernelWorkGroup to be set in \c g_work_group.
+ */
 void setThreadLocalWorkGroup(Coal::CPUKernelWorkGroup *current);
+
+/**
+ * \brief Return the address of a built-in function given its name
+ * \param name name of the built-in whose address is requested
+ */
 void *getBuiltin(const std::string &name);
+
+/**
+ * \brief Work-item stacks
+ * \see \ref barrier
+ * \param size size of the allocated space for stacks
+ * \return address of the allocated space for stacks
+ */
 void *getWorkItemsData(size_t &size);
+
+/**
+ * \brief Set work-item stacks
+ * \see \ref barrier
+ * \param ptr address of allocated space for stacks
+ * \param size size of the allocated space for stacks
+ */
 void setWorkItemsData(void *ptr, size_t size);
 
+/**
+ * \brief Increment a n-component vector given a maximum value
+ *
+ * This function increments a vector given a set of maximum values that each of
+ * its elements can reach before the next one is incremented.
+ *
+ * For example, if \p dims is \c 3, \p vec starts at <tt>{0, 0, 0}</tt> and
+ * \p maxs is <tt>{2, 3, 1}</tt>, repeatedly calling this function with the
+ * same vector will produce the following results:
+ *
+ * \code
+ * {0, 0, 1}
+ * {0, 1, 0}
+ * {0, 1, 1}
+ * {0, 2, 0}
+ * {0, 2, 1}
+ * {0, 3, 0}
+ * {0, 3, 1}
+ * {1, 0, 0}
+ * ...
+ * \endcode
+ *
+ * Until \p vec reaches <tt>{2, 3, 1}</tt>.
+ *
+ * \param dims number of elements in the vectors
+ * \param vec vector whose elements will be incremented
+ * \param maxs vector containing a maximum value above which each corresponding
+ *             element of \p vec cannot go.
+ * \return false if the increment was ok, true if \p vec was already at its
+ *         maximum value and couldn't be further incremented.
+ */
 template<typename T>
 bool incVec(unsigned long dims, T *vec, T *maxs)
 {
@@ -63,6 +121,21 @@ bool incVec(unsigned long dims, T *vec, T *maxs)
     return overflow;
 }
 
+/**
+ * \brief Address of a pixel in an image
+ *
+ * This function is heavily used when Clover needs to address a pixel or a byte
+ * in a rectangular or three-dimensional image or buffer.
+ *
+ * \param base address of the first pixel in the image (address of the image itself)
+ * \param x X coordinate, cannot be bigger than or equal to \c width
+ * \param y Y coordinate, cannot be bigger than or equal to \c height
+ * \param z Z coordinate, cannot be bigger than or equal to \c depth (1 for 2D arrays)
+ * \param row_pitch size in bytes of a row of pixels in the image
+ * \param slice_pitch size in bytes of a slice in a 3D array
+ * \param bytes_per_pixel bytes per pixel (1 for simple buffers), used when
+ *                        coordinates are in pixels and not in bytes.
+ */
 unsigned char *imageData(unsigned char *base, size_t x, size_t y, size_t z,
                          size_t row_pitch, size_t slice_pitch,
                          unsigned int bytes_per_pixel);
diff --git a/src/core/cpu/device.h b/src/core/cpu/device.h
index 3a15b2b..36a6bb3 100644
--- a/src/core/cpu/device.h
+++ b/src/core/cpu/device.h
@@ -25,6 +25,11 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file cpu/device.h
+ * \brief CPU device
+ */
+
 #ifndef __CPU_DEVICE_H__
 #define __CPU_DEVICE_H__
 
@@ -41,12 +46,31 @@ class Event;
 class Program;
 class Kernel;
 
+/**
+ * \brief CPU device
+ *
+ * This class is the base of all the CPU-accelerated OpenCL processing. It
+ * creates and manages subclasses such as \c Coal::DeviceBuffer,
+ * \c Coal::DeviceProgram and \c Coal::DeviceKernel.
+ *
+ * This class and the aforementioned ones work together to compile and run
+ * kernels using the LLVM JIT, manage buffers, provide built-in functions
+ * and do all of this in a multithreaded fashion using worker threads.
+ *
+ * \see \ref events
+ */
 class CPUDevice : public DeviceInterface
 {
     public:
         CPUDevice();
         ~CPUDevice();
 
+        /**
+         * \brief Initialize the CPU device
+         *
+         * This function creates the worker threads and gets information about
+         * the host system for the \c numCPUs() and \c cpuMhz() functions.
+         */
         void init();
 
         cl_int info(cl_device_info param_name,
@@ -65,8 +89,8 @@ class CPUDevice : public DeviceInterface
         void pushEvent(Event *event);
         Event *getEvent(bool &stop);
 
-        unsigned int numCPUs() const;
-        float cpuMhz() const;
+        unsigned int numCPUs() const;   /*!< \brief Number of logical CPU cores on the system */
+        float cpuMhz() const;           /*!< \brief Speed of the CPU in MHz */
 
     private:
         unsigned int p_cores, p_num_events;
diff --git a/src/core/cpu/kernel.h b/src/core/cpu/kernel.h
index 70e348c..c965741 100644
--- a/src/core/cpu/kernel.h
+++ b/src/core/cpu/kernel.h
@@ -25,6 +25,11 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file cpu/kernel.h
+ * \brief CPU kernel
+ */
+
 #ifndef __CPU_KERNEL_H__
 #define __CPU_KERNEL_H__
 
@@ -53,9 +58,26 @@ class KernelEvent;
 class Image2D;
 class Image3D;
 
+/**
+ * \brief CPU kernel
+ *
+ * This class holds passive information about a kernel (\c Coal::Kernel object
+ * and device on which it is run) and provides the \c callFunction() function.
+ *
+ * This function is described at the end of \ref llvm .
+ *
+ * \see Coal::CPUKernelWorkGroup
+ */
 class CPUKernel : public DeviceKernel
 {
     public:
+        /**
+         * \brief Constructor
+         * \param device device on which the kernel will be run
+         * \param kernel \c Coal::Kernel object holding information about this
+         *               kernel
+         * \param function \c llvm::Function to run
+         */
         CPUKernel(CPUDevice *device, Kernel *kernel, llvm::Function *function);
         ~CPUKernel();
 
@@ -66,11 +88,45 @@ class CPUKernel : public DeviceKernel
         size_t guessWorkGroupSize(cl_uint num_dims, cl_uint dim,
                                   size_t global_work_size) const;
 
-        Kernel *kernel() const;
-        CPUDevice *device() const;
-
-        llvm::Function *function() const;
-        llvm::Function *callFunction();
+        Kernel *kernel() const;     /*!< \brief \c Coal::Kernel object this kernel will run */
+        CPUDevice *device() const;  /*!< \brief device on which the kernel will be run */
+
+        llvm::Function *function() const;   /*!< \brief \c llvm::Function representing the kernel but <strong>not to be run</strong> */
+        llvm::Function *callFunction();     /*!< \brief stub function used to run the kernel, see \ref llvm */
+
+        /**
+         * \brief Calculate where to place a value in an array
+         *
+         * This function is used to calculate where to place a value in an
+         * array given its size, properly aligning it.
+         *
+         * This function is called repeatedly to obtain the aligned position of
+         * each value that must be placed in the array.
+         *
+         * \code
+         * size_t array_len = 0, array_offset = 0;
+         * void *array;
+         *
+         * // First, get the array size given alignment constraints
+         * typeOffset(array_len, sizeof(int));
+         * typeOffset(array_len, sizeof(float));
+         * typeOffset(array_len, sizeof(void *));
+         *
+         * // Then, allocate memory
+         * array = malloc(array_len);
+         *
+         * // Finally, place the arguments
+         * *(int *)((char *)array + typeOffset(array_offset, sizeof(int))) = 1337;
+         * *(float *)((char *)array + typeOffset(array_offset, sizeof(float))) = 3.1415f;
+         * *(void **)((char *)array + typeOffset(array_offset, sizeof(void *))) = array;
+         * \endcode
+         *
+         * \param offset offset at which the value will be placed. This variable
+         *               gets incremented by <tt>type_len + padding</tt>.
+         * \param type_len size in bytes of the value that will be stored
+         * \return offset at which the value will be stored (equal to \p offset
+         *         before incrementation).
+         */
         static size_t typeOffset(size_t &offset, size_t type_len);
 
     private:
@@ -82,18 +138,64 @@ class CPUKernel : public DeviceKernel
 
 class CPUKernelEvent;
 
+/**
+ * \brief CPU kernel work-group
+ *
+ * This class represents a bulk of work-items that will be run. It is the one
+ * that actually runs the kernel for its work-items.
+ *
+ * \see \ref llvm
+ * \nosubgrouping
+ */
 class CPUKernelWorkGroup
 {
     public:
+        /**
+         * \brief Constructor
+         * \param kernel kernel to run
+         * \param event event containing information about the kernel run
+         * \param cpu_event CPU-specific information and cache about \p event
+         * \param work_group_index index of this work-group in the kernel
+         */
         CPUKernelWorkGroup(CPUKernel *kernel, KernelEvent *event,
                            CPUKernelEvent *cpu_event,
                            const size_t *work_group_index);
         ~CPUKernelWorkGroup();
 
+        /**
+         * \brief Build a structure of arguments
+         *
+         * As C doesn't support calling functions whose argument list is
+         * unknown at compile time, this function builds the list of
+         * arguments in memory. This array will then be passed to an LLVM stub
+         * function reading it and passing its values to the actual kernel.
+         *
+         * \see \ref llvm
+         * \param locals_to_free if this kernel takes \c __local arguments, they
+         *                       must be \c malloc()'ed for every work-group.
+         *                       They are placed in this vector to be
+         *                       \c free()'ed at the end of \c run().
+         * \return address of a memory location containing the arguments
+         */
         void *callArgs(std::vector<void *> &locals_to_free);
+
+        /**
+         * \brief Run the work-group
+         *
+         * This function is the core of CPU-acceleration. It runs the work-items
+         * of this work-group given the correct arguments.
+         *
+         * \see \ref llvm
+         * \see \ref barrier
+         * \see callArgs()
+         * \return true if success, false in case of an error
+         */
         bool run();
 
-        // Native functions
+        /**
+         * \name Native implementation of built-in OpenCL C functions
+         * @{
+         */
         size_t getGlobalId(cl_uint dimindx) const;
         cl_uint getWorkDim() const;
         size_t getGlobalSize(cl_uint dimindx) const;
@@ -124,7 +226,13 @@ class CPUKernelWorkGroup
                        uint32_t sampler) const;
         void readImage(uint32_t *result, Image2D *image, float x, float y, float z,
                        uint32_t sampler) const;
+        /**
+         * @}
+         */
 
+        /**
+         * \brief Function called when a built-in name cannot be found
+         */
         void builtinNotFound(const std::string &name) const;
 
     private:
@@ -174,20 +282,33 @@ class CPUKernelWorkGroup
         bool p_had_barrier;
 };
 
+/**
+ * \brief CPU-specific information about a kernel event
+ *
+ * This class, put in a \c Coal::KernelEvent device-data field
+ * (see \c Coal::Event::setDeviceData()), is responsible for dispatching the
+ * \c Coal::CPUKernelWorkGroup objects between the CPU worker threads.
+ */
 class CPUKernelEvent
 {
     public:
+        /**
+         * \brief Constructor
+         * \param device device running the kernel
+         * \param event \c Coal::KernelEvent holding device-agnostic data
+         *              about the event
+         */
         CPUKernelEvent(CPUDevice *device, KernelEvent *event);
         ~CPUKernelEvent();
 
-        bool reserve();  /*!< The next Work Group that will execute will be the last. Locks the event */
-        bool finished(); /*!< All the work groups have finished */
-        CPUKernelWorkGroup *takeInstance(); /*!< Must be called exactly one time after reserve(). Unlocks the event */
+        bool reserve();  /*!< \brief The next Work Group that will execute will be the last. Locks the event */
+        bool finished(); /*!< \brief All the work groups have finished */
+        CPUKernelWorkGroup *takeInstance(); /*!< \brief Must be called exactly one time after reserve(). Unlocks the event */
 
-        void *kernelArgs() const;
-        void cacheKernelArgs(void *args);
+        void *kernelArgs() const;           /*!< \brief Return the cached kernel arguments */
+        void cacheKernelArgs(void *args);   /*!< \brief Cache pre-built kernel arguments */
 
-        void workGroupFinished();
+        void workGroupFinished();           /*!< \brief A work-group has just finished */
 
     private:
         CPUDevice *p_device;
diff --git a/src/core/cpu/program.h b/src/core/cpu/program.h
index 34668f2..350d248 100644
--- a/src/core/cpu/program.h
+++ b/src/core/cpu/program.h
@@ -25,6 +25,11 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file cpu/program.h
+ * \brief CPU program
+ */
+
 #ifndef __CPU_PROGRAM_H__
 #define __CPU_PROGRAM_H__
 
@@ -42,9 +47,23 @@ namespace Coal
 class CPUDevice;
 class Program;
 
+/**
+ * \brief CPU program
+ *
+ * This class implements the \c Coal::DeviceProgram interface for CPU
+ * acceleration.
+ *
+ * Its main purpose is to initialize a \c llvm::JIT object to run LLVM bitcode,
+ * in \c initJIT().
+ */
 class CPUProgram : public DeviceProgram
 {
     public:
+        /**
+         * \brief Constructor
+         * \param device CPU device to which this program is attached
+         * \param program \c Coal::Program that will be run
+         */
         CPUProgram(CPUDevice *device, Program *program);
         ~CPUProgram();
 
@@ -52,8 +71,22 @@ class CPUProgram : public DeviceProgram
         void createOptimizationPasses(llvm::PassManager *manager, bool optimize);
         bool build(llvm::Module *module);
 
+        /**
+         * \brief Initialize an LLVM JIT
+         *
+         * This function creates a \c llvm::JIT object to run this program on
+         * the CPU. A few implementation details :
+         *
+         * - The JIT is set not to resolve unknown symbols using \c dlsym().
+         *   This way, a malicious kernel cannot execute arbitrary code on
+         *   the host by declaring \c libc functions and calling them.
+         * - All the unknown function names are passed to \c getBuiltin() to
+         *   get native built-in implementations.
+         *
+         * \return true if success, false otherwise
+         */
         bool initJIT();
-        llvm::ExecutionEngine *jit() const;
+        llvm::ExecutionEngine *jit() const; /*!< \brief Current LLVM execution engine */
 
     private:
         CPUDevice *p_device;
diff --git a/src/core/cpu/worker.h b/src/core/cpu/worker.h
index ec462f7..66130c6 100644
--- a/src/core/cpu/worker.h
+++ b/src/core/cpu/worker.h
@@ -25,9 +25,21 @@
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/**
+ * \file worker.h
+ * \brief Function run by the CPU worker threads
+ */
+
 #ifndef __CPU_WORKER_H__
 #define __CPU_WORKER_H__
 
+/**
+ * \brief Main loop of the CPU worker threads
+ *
+ * This function is run by as many threads as there are CPU cores on the host
+ * system. As explained by \ref events , this function waits until there
+ * are \c Coal::Event objects to process and handles them.
+ */
 void *worker(void *data);
 
 #endif