summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZhenyu Wang <zhenyuw@linux.intel.com>2014-10-16 14:10:11 +0800
committerZhenyu Wang <zhenyuw@linux.intel.com>2014-10-16 14:10:11 +0800
commitbf8d34506d36df0c2028b97c25abd61c8659d326 (patch)
treebf78890927d593c29144d6cf078a3828e286cd29
Import MPBenchmark 20091214
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
-rw-r--r--BealtoOpenCL/LICENSE.txt27
-rw-r--r--BealtoOpenCL/Makefile68
-rw-r--r--BealtoOpenCL/include/BealtoOpenCL.h13
-rw-r--r--BealtoOpenCL/include/CLBuffer.h34
-rw-r--r--BealtoOpenCL/include/CLCommandQueue.h229
-rw-r--r--BealtoOpenCL/include/CLContext.h149
-rw-r--r--BealtoOpenCL/include/CLError.h19
-rw-r--r--BealtoOpenCL/include/CLEvent.h67
-rw-r--r--BealtoOpenCL/include/CLEventList.h96
-rw-r--r--BealtoOpenCL/include/CLImage2D.h35
-rw-r--r--BealtoOpenCL/include/CLKernel.h78
-rw-r--r--BealtoOpenCL/include/CLMemoryObject.h77
-rw-r--r--BealtoOpenCL/include/CLProgram.h74
-rw-r--r--BealtoOpenCL/src/CLContext.cpp351
-rw-r--r--BealtoOpenCL/src/CLError.cpp82
-rw-r--r--BealtoOpenCL/src/CLProgram.cpp84
-rw-r--r--BealtoOpenCL/vs2008/BealtoOpenCL.sln26
-rw-r--r--BealtoOpenCL/vs2008/BealtoOpenCL.vcproj369
-rw-r--r--MPBenchmarks/ArithmeticTasks.cpp294
-rw-r--r--MPBenchmarks/ArithmeticTasks.h103
-rw-r--r--MPBenchmarks/CPUTask.h22
-rw-r--r--MPBenchmarks/Config.h172
-rw-r--r--MPBenchmarks/Conversions.h74
-rw-r--r--MPBenchmarks/GPUTask.cpp74
-rw-r--r--MPBenchmarks/GPUTask.h56
-rw-r--r--MPBenchmarks/LICENSE.txt27
-rw-r--r--MPBenchmarks/Makefile72
-rw-r--r--MPBenchmarks/MemoryTasks.cpp280
-rw-r--r--MPBenchmarks/MemoryTasks.h83
-rw-r--r--MPBenchmarks/appMain.cpp284
-rw-r--r--MPBenchmarks/gpu_add.cl86
-rw-r--r--MPBenchmarks/gpu_mp.cl153
-rw-r--r--MPBenchmarks/gpu_mul1.cl116
-rw-r--r--MPBenchmarks/gpu_zero.cl18
-rw-r--r--MPBenchmarks/t1.cpp18
-rw-r--r--MPBenchmarks/vs2008/MPBenchmarks.sln39
-rw-r--r--MPBenchmarks/vs2008/MPBenchmarks.vcproj428
37 files changed, 4277 insertions, 0 deletions
diff --git a/BealtoOpenCL/LICENSE.txt b/BealtoOpenCL/LICENSE.txt
new file mode 100644
index 0000000..f5fe116
--- /dev/null
+++ b/BealtoOpenCL/LICENSE.txt
@@ -0,0 +1,27 @@
+This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Eric Bainville nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/BealtoOpenCL/Makefile b/BealtoOpenCL/Makefile
new file mode 100644
index 0000000..f8e725f
--- /dev/null
+++ b/BealtoOpenCL/Makefile
@@ -0,0 +1,68 @@
+# OpenCL multiprecision CPU+GPU benchmarks
+# EB Nov 2009
+
+# archive data
+CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M)
+CURRENT_DIR := $(notdir $(shell /bin/pwd))
+CURRENT_MACHINE := $(shell uname -m)
+
+# output directories
+OBJDIR := objs
+DEPDIR := deps
+ifeq ($(CURRENT_MACHINE),x86_64)
+LIBDIR := lib-64
+else
+LIBDIR := lib-32
+endif
+
+# targets
+OBJS = $(patsubst %,$(OBJDIR)/%.o, CLContext CLError CLProgram )
+DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS))
+
+LIB = $(LIBDIR)/libBealtoOpenCL.a
+
+VPATH = src
+
+# OpenCL SDK directory
+# OPENCL_DIR = /opt/cuda/include
+OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64
+
+# flags
+CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I./include -I$(OPENCL_DIR)/include
+LDFLAGS = -L$(OPENCL_DIR)/lib/x86_64
+
+lib: $(LIB)
+
+$(LIB): $(OBJS)
+ @[ -d $(LIBDIR) ] || mkdir -p $(LIBDIR)
+ ar rvs $(LIB) $(OBJS)
+
+dos2unix:
+ dos2unix Makefile src/*.cpp include/*.h
+
+clean:
+ /bin/rm -f *.o *~ $(LIB)
+ /bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR) lib-32 lib-64
+
+archive: clean
+ @echo "ARCHIVE $(CURRENT_DATE)"
+ tar czf "../BealtoOpenCL-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR)
+
+##### Dependencies
+$(DEPDIR)/%.d: %.cpp
+ @[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR)
+ @/bin/echo -e "DEPS \033[32m$*\033[0m"
+ @$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $<
+
+##### Compilation
+$(OBJDIR)/%.o: %.cpp
+ @[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR)
+ @/bin/echo -e "C++ \033[34m$*\033[0m"
+ @$(CXX) $(CXXFLAGS) -c -o $@ $<
+
+##### Qt
+moc_%.cpp: %.h
+ @/bin/echo -e "MOC \033[34m$*\033[0m"
+ @moc -o $@ $<
+
+-include $(DEPS)
diff --git a/BealtoOpenCL/include/BealtoOpenCL.h b/BealtoOpenCL/include/BealtoOpenCL.h
new file mode 100644
index 0000000..b35e9f1
--- /dev/null
+++ b/BealtoOpenCL/include/BealtoOpenCL.h
@@ -0,0 +1,13 @@
+// Main CL C++ bindings include file
+// (c) EB Oct 2009
+
+#pragma once
+
+#include "CLError.h"
+#include "CLContext.h"
+#include "CLCommandQueue.h"
+#include "CLBuffer.h"
+#include "CLEvent.h"
+#include "CLEventList.h"
+#include "CLProgram.h"
+#include "CLKernel.h"
diff --git a/BealtoOpenCL/include/CLBuffer.h b/BealtoOpenCL/include/CLBuffer.h
new file mode 100644
index 0000000..be0f640
--- /dev/null
+++ b/BealtoOpenCL/include/CLBuffer.h
@@ -0,0 +1,34 @@
+// OpenCL buffer object
+// (c) EB Sep 2009
+
+#ifndef CLBuffer_h
+#define CLBuffer_h
+
+#include <CL/cl.h>
+#include "CLMemoryObject.h"
+#include "CLError.h"
+
+namespace cl {
+
+class Buffer : public MemoryObject
+{
+public:
+
+ // Instances of this class are created from a Context.
+
+ // Destructor
+ virtual ~Buffer() { }
+
+private:
+
+ Buffer(); // not implemented
+ Buffer(cl_mem x) : MemoryObject(x) { }
+
+ friend class Context;
+ friend class CommandQueue;
+ friend class Kernel;
+};
+
+} // namespace
+
+#endif // CLBuffer_h
diff --git a/BealtoOpenCL/include/CLCommandQueue.h b/BealtoOpenCL/include/CLCommandQueue.h
new file mode 100644
index 0000000..6fde44c
--- /dev/null
+++ b/BealtoOpenCL/include/CLCommandQueue.h
@@ -0,0 +1,229 @@
+// OpenCL command queue object
+// (c) EB Sep 2009
+
+#ifndef CLCommandQueue_h
+#define CLCommandQueue_h
+
+#include <CL/cl.h>
+#include "CLBuffer.h"
+#include "CLEvent.h"
+#include "CLEventList.h"
+#include "CLKernel.h"
+
+namespace cl {
+
+class CommandQueue
+{
+public:
+
+ // Instances of this class are created from a Context.
+
+ // Copy constructor
+ CommandQueue(const CommandQueue & a) : mX(a.mX)
+ { clRetainCommandQueue(mX); }
+
+ // = operator
+ CommandQueue & operator = (const CommandQueue & a)
+ {
+ if (a.mX != mX)
+ {
+ clReleaseCommandQueue(mX);
+ mX = a.mX;
+ clRetainCommandQueue(mX);
+ }
+ return *this;
+ }
+
+ // Destructor
+ virtual ~CommandQueue()
+ { clReleaseCommandQueue(mX); }
+
+ // Commands
+ // All commands return an Event instance.
+ // If the call fails, the returned event is invalid.
+
+ // Read buffer to host memory
+ Event readBuffer(Buffer * b,cl_bool blocking_read,size_t offset,size_t cb,void * ptr,const EventList & wait_list = EventList())
+ {
+ if (b == 0) return Event(0); // Invalid
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ cl_int status = clEnqueueReadBuffer(mX,b->mX,blocking_read,offset,cb,ptr,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Write buffer from host memory
+ Event writeBuffer(Buffer * b,cl_bool blocking_write,size_t offset,size_t cb,const void * ptr,const EventList & wait_list = EventList())
+ {
+ if (b == 0) return Event(0); // Invalid
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ cl_int status = clEnqueueWriteBuffer(mX,b->mX,blocking_write,offset,cb,ptr,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Copy buffers
+ Event copyBuffer(Buffer * src,Buffer * dst,size_t src_offset,size_t dst_offset,size_t cb,const EventList & wait_list = EventList())
+ {
+ if (src == 0 || dst == 0) return Event(0); // Invalid
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ cl_int status = clEnqueueCopyBuffer(mX,src->mX,dst->mX,src_offset,dst_offset,cb,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Map buffer. The mapped address is put in ADDRESS.
+ template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,size_t offset,size_t cb,const EventList & wait_list = EventList())
+ {
+ if (b == 0) return Event(0);
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ cl_int status;
+ address = (T *)clEnqueueMapBuffer(mX,b->mX,blocking_map,map_flags,offset,cb,num_events,events,&e,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Map full buffer. Queries its size, and maps all its contents. The mapped address is put in ADDRESS.
+ template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,const EventList & wait_list = EventList())
+ {
+ if (b == 0) return Event(0); // Invalid buffer
+ size_t sz = b->getSize();
+ if (sz == 0) return Event(0); // Invalid size
+ return mapBuffer<T>(b,address,blocking_map,map_flags,0,sz,wait_list);
+ }
+
+ // Unmap memory object. (Buffer, Image2D, etc.)
+ Event unmapMemoryObject(MemoryObject * m,void * address,const EventList & wait_list = EventList())
+ {
+ if (m == 0) return Event(0);
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ cl_int status = clEnqueueUnmapMemObject(mX,m->mX,address,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Enqueue 1D kernel execution
+ // K is the kernel to run.
+ // N is the total number of work items (global work size).
+ // G is the number of work items inside a work group. G must divide N.
+ // G can be 0, in which case the OpenCL implementation will choose the best value.
+ Event execKernel1(Kernel * k,size_t n,size_t g,const EventList & wait_list = EventList())
+ {
+ if (k == 0) return Event(0); // Invalid
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ size_t * pgw = &n;
+ size_t * plw = (g>0)?(&g):0;
+ cl_int status = clEnqueueNDRangeKernel(mX,k->mX,1,0,pgw,plw,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Enqueue 2D kernel execution
+ // K is the kernel to run.
+ // NX*NY is the total number of work items (global work size).
+ // GX*GY is the number of work items inside a work group. G<d> must divide N<d>.
+ Event execKernel2(Kernel * k,size_t nx,size_t ny,size_t gx,size_t gy,const EventList & wait_list = EventList())
+ {
+ if (k == 0) return Event(0); // Invalid
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ cl_event e = 0;
+ size_t pgw[2]; pgw[0] = nx; pgw[1] = ny;
+ size_t plw[2]; plw[0] = gx; plw[1] = gy;
+ cl_int status = clEnqueueNDRangeKernel(mX,k->mX,2,0,pgw,plw,num_events,events,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ //
+ // Execution control
+ //
+
+ // Enqueue a marker
+ Event mark()
+ {
+ cl_event e = 0;
+ cl_int status = clEnqueueMarker(mX,&e);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) e = 0;
+ return Event(e);
+ }
+
+ // Insert a wait point for a specific list of events
+ bool wait(const EventList & wait_list)
+ {
+ cl_uint num_events = 0;
+ const cl_event * events = 0;
+ wait_list.getParams(num_events,events);
+ if (num_events == 0) return true; // Nop
+ cl_int status = clEnqueueWaitForEvents(mX,num_events,events);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+ // Insert a wait point for all events. (barrier)
+ bool wait()
+ {
+ cl_int status = clEnqueueBarrier(mX);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+ // Blocks until all queued commands have been submitted to the device.
+ bool flush()
+ {
+ cl_int status = clFlush(mX);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+ // Blocks until all queued commands have been executed.
+ bool finish()
+ {
+ cl_int status = clFinish(mX);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+private:
+
+ // Private constructors
+ CommandQueue(); // not implemented
+ CommandQueue(cl_command_queue x) : mX(x) { }
+
+ // OpenCL handle (always non 0)
+ cl_command_queue mX;
+
+ friend class Context;
+
+}; // class CommandQueue
+
+} // namespace
+
+#endif // CLCommandQueue_h
diff --git a/BealtoOpenCL/include/CLContext.h b/BealtoOpenCL/include/CLContext.h
new file mode 100644
index 0000000..2f8fcd4
--- /dev/null
+++ b/BealtoOpenCL/include/CLContext.h
@@ -0,0 +1,149 @@
+// OpenCL context
+// (c) EB Sep 2009
+
+#ifndef CLContext_h
+#define CLContext_h
+
+#include <CL/cl.h>
+#include <vector>
+#include <string>
+
+namespace cl {
+
+class Buffer;
+class Image2D;
+class CommandQueue;
+class Program;
+
+class Context
+{
+public:
+
+ // Copy constructor
+ Context(const Context & a);
+
+ // = operator
+ Context & operator = (const Context & a);
+
+ // Destructor
+ virtual ~Context();
+
+ // Create default context (including all devices with the given type).
+ // Return a new instance if OK, and 0 otherwise.
+ static Context * create(cl_device_type deviceType = CL_DEVICE_TYPE_GPU);
+
+ // Get number of associated devices
+ int getNDevices() const;
+
+ // Create a new command queue object.
+ // D is the index of the device ine the context (0..NDevices-1)
+ // See clCreateCommandQueue for the arguments.
+ // Return a new instance if OK, and 0 otherwise.
+ CommandQueue * createCommandQueue(int d,cl_command_queue_properties properties);
+
+ // Create a new buffer object.
+ // See clCreateBuffer for the arguments.
+ // Return a new instance if OK, and 0 otherwise.
+ Buffer * createBuffer(cl_mem_flags flags,size_t size,void * host_ptr = 0);
+
+ // Create a new image2D object.
+ // See clCreateImage2D for the arguments.
+ // Return a new instance if OK, and 0 otherwise.
+ Image2D * createImage2D(cl_mem_flags flags,const cl_image_format * image_format,
+ size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0);
+
+ // Create a R / RG / RGBA image2D object.
+ // channel_type is passed in a cl_image_format structure.
+ Image2D * createRImage2D(cl_mem_flags flags,cl_channel_type data_type,
+ size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+ {
+ cl_image_format f;
+ f.image_channel_order = CL_R;
+ f.image_channel_data_type = data_type;
+ return createImage2D(flags,&f,width,height,pitch,host_ptr);
+ }
+ Image2D * createRGImage2D(cl_mem_flags flags,cl_channel_type data_type,
+ size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+ {
+ cl_image_format f;
+ f.image_channel_order = CL_RG;
+ f.image_channel_data_type = data_type;
+ return createImage2D(flags,&f,width,height,pitch,host_ptr);
+ }
+ Image2D * createRGBAImage2D(cl_mem_flags flags,cl_channel_type data_type,
+ size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+ {
+ cl_image_format f;
+ f.image_channel_order = CL_RGBA;
+ f.image_channel_data_type = data_type;
+ return createImage2D(flags,&f,width,height,pitch,host_ptr);
+ }
+
+ // Create a new program from source strings.
+ // See clCreateProgramWithSource for the arguments.
+ // Return a new instance if OK, and 0 otherwise.
+ Program * createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths = 0);
+
+ // Create a new program from source files.
+ // See clCreateProgramWithSource for the arguments.
+ // Return a new instance if OK, and 0 otherwise.
+ Program * createProgramWithFiles(cl_uint count,const char ** filenames);
+ Program * createProgramWithFile(const char * filename)
+ { return createProgramWithFiles(1,&filename); }
+ Program * createProgramWithFiles(const std::vector<std::string> & filenames);
+
+ // Get device info for device D (0..NDevices-1).
+ // See clGetDeviceInfo for the arguments.
+ bool getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret);
+
+ // Template version suitable for scalar types
+ template <class T> bool getDeviceInfo(int d,cl_device_info param_name,T & x)
+ { return getDeviceInfo(d,param_name,sizeof(T),&x,0); }
+
+ // Get all info for device D.
+ // INFO receives the result (one attribute per line), cleared by the call.
+ bool getAllDeviceInfo(int d,std::string & info);
+
+ // Specific info entries.
+ // D is the device index in context, 0..NDevices-1.
+ size_t getDeviceMaxMemAllocSize(int d = 0)
+ {
+ cl_ulong x;
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,x)) return 0; // Failed
+ return (size_t)x;
+ }
+ bool getDeviceImageSupport(int d = 0)
+ {
+ cl_bool x;
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,x)) return false; // Failed
+ return (x)?true:false;
+ }
+ size_t getDeviceImage2DMaxWidth(int d = 0)
+ {
+ size_t x;
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,x)) return 0; // Failed
+ return x;
+ }
+ size_t getDeviceImage2DMaxHeight(int d = 0)
+ {
+ size_t x;
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,x)) return 0; // Failed
+ return x;
+ }
+
+private:
+
+ // This constructor is private, use create() to instanciate this class
+ Context();
+
+ // OpenCL handle (always non 0)
+ cl_context mX;
+
+ // Devices associated with this context
+ std::vector<cl_device_id> mDevices;
+
+}; // class Context
+
+} // namespace
+
+#endif // CLContext_h
diff --git a/BealtoOpenCL/include/CLError.h b/BealtoOpenCL/include/CLError.h
new file mode 100644
index 0000000..d929695
--- /dev/null
+++ b/BealtoOpenCL/include/CLError.h
@@ -0,0 +1,19 @@
+// OpenCL error reporting
+// (c) EB Sep 2009
+
+#ifndef CLError_h
+#define CLError_h
+
+namespace cl {
+
+// Report an OpenCL call status.
+// FILE,LINE,FUNCTION shall be the source file/line/name of the function where the call failed.
+// STATUS is the returned value (CL_SUCCESS,...).
+void reportStatus(const char * file,int line,const char * function,int status);
+
+// Use this macro to report errors.
+#define REPORT_OPENCL_STATUS(status) { cl::reportStatus(__FILE__,__LINE__,__FUNCTION__,status); }
+
+} // namespace
+
+#endif // CLError_h
diff --git a/BealtoOpenCL/include/CLEvent.h b/BealtoOpenCL/include/CLEvent.h
new file mode 100644
index 0000000..b7ac9b4
--- /dev/null
+++ b/BealtoOpenCL/include/CLEvent.h
@@ -0,0 +1,67 @@
+// OpenCL event object
+// (c) EB Sep 2009
+
+#ifndef CLEvent_h
+#define CLEvent_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+
+namespace cl {
+
+class Event
+{
+public:
+
+ // Instances of this class are created by calls in a CommandQueue.
+
+ // Copy constructor
+ Event(const Event & a) : mX(a.mX)
+ { if (mX != 0) clRetainEvent(mX); }
+
+ // = operator
+ Event & operator = (const Event & a)
+ {
+ if (a.mX != mX)
+ {
+ if (mX != 0) clReleaseEvent(mX);
+ mX = a.mX;
+ if (mX != 0) clRetainEvent(mX);
+ }
+ return *this;
+ }
+
+ // Destructor
+ virtual ~Event()
+ { if (mX != 0) clReleaseEvent(mX); }
+
+ // Check if the event is valid
+ bool isValid() const { return (mX != 0); }
+
+ // Wait for this event to complete.
+ // Return TRUE if OK, and FALSE otherwise.
+ // If the event is invalid, return TRUE.
+ bool wait()
+ {
+ if (mX == 0) return true; // OK, event is invalid
+ cl_int status = clWaitForEvents(1,&mX);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+private:
+
+ Event(); // not implemented
+ Event(cl_event x) : mX(x) { }
+
+ // OpenCL handle (MAY BE 0)
+ cl_event mX;
+
+ friend class CommandQueue;
+ friend class EventList;
+
+}; // class Event
+
+} // namespace
+
+#endif // CLEvent_h
diff --git a/BealtoOpenCL/include/CLEventList.h b/BealtoOpenCL/include/CLEventList.h
new file mode 100644
index 0000000..a841d15
--- /dev/null
+++ b/BealtoOpenCL/include/CLEventList.h
@@ -0,0 +1,96 @@
+// OpenCL event list
+// (c) EB Sep 2009
+
+#ifndef CLEventList_h
+#define CLEventList_h
+
+#include <CL/cl.h>
+#include <vector>
+#include "CLEvent.h"
+
+namespace cl {
+
+class EventList
+{
+public:
+
+ // Constructor from a list of events
+ EventList() { }
+ EventList(const Event & e1) { insert(e1); }
+ EventList(const Event & e1,const Event & e2) { insert(e1); insert(e2); }
+ EventList(const Event & e1,const Event & e2,const Event & e3) { insert(e1); insert(e2); insert(e3); }
+
+ // Copy constructor
+ EventList(const EventList & e)
+ {
+ for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it);
+ }
+
+ // = operator
+ EventList & operator = (const EventList & e)
+ {
+ if (&e == this) return *this;
+ clear();
+ for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it);
+ return *this;
+ }
+
+ // Destructor
+ ~EventList() { clear(); }
+
+ // Clear the list
+ void clear()
+ {
+ for (event_list_t::iterator it = mX.begin(); it != mX.end(); it++) clReleaseEvent(*it);
+ mX.clear();
+ }
+
+ // Insert an event to the list. Ignore if E is invalid.
+ void insert(const Event & e)
+ {
+ cl_event x = e.mX;
+ if (x == 0) return;
+ clRetainEvent(x);
+ mX.push_back(x);
+ }
+
+ // Get the number of events in the list
+ int size() const { return (int)mX.size(); }
+
+ // Is the list empty?
+ bool empty() const { return mX.empty(); }
+
+ // Wait for all the events to complete.
+ // Return TRUE if OK, and FALSE otherwise.
+ // If the list is empty, return TRUE.
+ bool wait()
+ {
+ if (mX.empty()) return true; // OK, empty
+ cl_uint num_events;
+ const cl_event * events;
+ getParams(num_events,events);
+ cl_int status = clWaitForEvents(num_events,events);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+ // Get CL parameters for the calls
+ void getParams(cl_uint & num_events,const cl_event * & events) const
+ {
+ num_events = (cl_uint)mX.size();
+ if (num_events == 0) events = 0;
+ else events = &(mX[0]);
+ }
+
+private:
+
+ typedef std::vector<cl_event> event_list_t;
+
+ // OpenCL handles
+ event_list_t mX;
+
+}; // class EventList
+
+} // namespace
+
+#endif // CLEventList_h
diff --git a/BealtoOpenCL/include/CLImage2D.h b/BealtoOpenCL/include/CLImage2D.h
new file mode 100644
index 0000000..baf996f
--- /dev/null
+++ b/BealtoOpenCL/include/CLImage2D.h
@@ -0,0 +1,35 @@
+// OpenCL image2D object
+// (c) EB Nov 2009
+
+#ifndef CLImage2D_h
+#define CLImage2D_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+#include "CLMemoryObject.h"
+
+namespace cl {
+
+ class Image2D : public MemoryObject
+{
+public:
+
+ // Instances of this class are created from a Context.
+
+ // Destructor
+ virtual ~Image2D() { }
+
+private:
+
+ Image2D(); // not implemented
+ Image2D(cl_mem x) : MemoryObject(x) { }
+
+ friend class Context;
+ friend class CommandQueue;
+ friend class Kernel;
+
+}; // class Image2D
+
+} // namespace
+
+#endif // CLImage2D_h
diff --git a/BealtoOpenCL/include/CLKernel.h b/BealtoOpenCL/include/CLKernel.h
new file mode 100644
index 0000000..7ded751
--- /dev/null
+++ b/BealtoOpenCL/include/CLKernel.h
@@ -0,0 +1,78 @@
+// OpenCL kernel object
+// (c) EB Sep 2009
+
+#ifndef CLKernel_h
+#define CLKernel_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+#include "CLBuffer.h"
+
+namespace cl {
+
+class Kernel
+{
+public:
+
+ // Instances of this class are created from a Program.
+
+ // Copy constructor
+ Kernel(const Kernel & a) : mX(a.mX)
+ { clRetainKernel(mX); }
+
+ // = operator
+ Kernel & operator = (const Kernel & a)
+ {
+ if (a.mX != mX)
+ {
+ clReleaseKernel(mX);
+ mX = a.mX;
+ clRetainKernel(mX);
+ }
+ return *this;
+ }
+
+ // Destructor
+ virtual ~Kernel()
+ { clReleaseKernel(mX); }
+
+ // Set kernel argument
+ // See clSetKernelArg for arguments.
+ // Return TRUE if OK, and FALSE otherwise.
+ bool setArg(cl_uint arg_index,size_t arg_size,const void * arg_value)
+ {
+ cl_int status = clSetKernelArg(mX,arg_index,arg_size,arg_value);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+ // Set kernel argument (Buffer)
+ // Return TRUE if OK, and FALSE otherwise.
+ bool setArg(cl_uint arg_index,Buffer * b)
+ {
+ if (b == 0) return false; // Invalid
+ cl_mem m = b->mX;
+ return setArg(arg_index,sizeof(m),(const void *)&m);
+ }
+ // Set kernel argument (other types)
+ // Return TRUE if OK, and FALSE otherwise.
+ bool setArg(cl_int arg_index,cl_int x)
+ { return setArg(arg_index,sizeof(x),(const void *)&x); }
+ bool setArg(cl_int arg_index,cl_float x)
+ { return setArg(arg_index,sizeof(x),(const void *)&x); }
+
+private:
+
+ Kernel(); // not implemented
+ Kernel(cl_kernel x) : mX(x) { }
+
+ // OpenCL handle (always non 0)
+ cl_kernel mX;
+
+ friend class Program;
+ friend class CommandQueue;
+
+}; // class Kernel
+
+} // namespace
+
+#endif // CLKernel_h
diff --git a/BealtoOpenCL/include/CLMemoryObject.h b/BealtoOpenCL/include/CLMemoryObject.h
new file mode 100644
index 0000000..433bbdc
--- /dev/null
+++ b/BealtoOpenCL/include/CLMemoryObject.h
@@ -0,0 +1,77 @@
+// OpenCL memory object
+// (c) EB Dec 2009
+
+#ifndef CLMemoryObject_h
+#define CLMemoryObject_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+
+namespace cl {
+
+class MemoryObject
+{
+public:
+
+ // Instances of this class are created from a Context.
+
+ // Copy constructor
+ MemoryObject(const MemoryObject & a) : mX(a.mX)
+ { clRetainMemObject(mX); }
+
+ // = operator
+ MemoryObject & operator = (const MemoryObject & a)
+ {
+ if (a.mX != mX)
+ {
+ clReleaseMemObject(mX);
+ mX = a.mX;
+ clRetainMemObject(mX);
+ }
+ return *this;
+ }
+
+ // Destructor
+ virtual ~MemoryObject()
+ { clReleaseMemObject(mX); }
+
+ // Get memory object info.
+ // See clGetMemObjectInfo for the arguments.
+ bool getMemObjectInfo(cl_mem_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret)
+ {
+ cl_int status = clGetMemObjectInfo(mX,param_name,param_value_size,param_value,param_value_size_ret);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+ }
+
+ // Template version suitable for scalar types
+ template <class T> bool getMemObjectInfo(cl_mem_info param_name,T & x)
+ { return getMemObjectInfo(param_name,sizeof(T),&x,0); }
+
+ // Specific info entries
+ size_t getSize()
+ {
+ size_t x;
+ bool ok = getMemObjectInfo(CL_MEM_SIZE,x);
+ return (ok)?x:0;
+ }
+
+private:
+
+ MemoryObject(); // not implemented
+ MemoryObject(cl_mem x) : mX(x) { }
+
+ // OpenCL handle (always non 0)
+ cl_mem mX;
+
+ friend class Context;
+ friend class CommandQueue;
+ friend class Kernel;
+ friend class Buffer;
+ friend class Image2D;
+
+}; // class MemoryObject
+
+} // namespace
+
+#endif // CLMemoryObject_h
diff --git a/BealtoOpenCL/include/CLProgram.h b/BealtoOpenCL/include/CLProgram.h
new file mode 100644
index 0000000..07a85a7
--- /dev/null
+++ b/BealtoOpenCL/include/CLProgram.h
@@ -0,0 +1,74 @@
+// OpenCL program object
+// (c) EB Sep 2009
+
+#ifndef CLProgram_h
+#define CLProgram_h
+
+#include <CL/cl.h>
+#include <string>
+#include <vector>
+#include "CLError.h"
+#include "CLKernel.h"
+
+namespace cl {
+
+class Program
+{
+public:
+
+ // Instances of this class are created from a Context.
+
+ // Copy constructor
+ Program(const Program & a) : mX(a.mX)
+ { clRetainProgram(mX); }
+
+ // = operator
+ Program & operator = (const Program & a)
+ {
+ if (a.mX != mX)
+ {
+ clReleaseProgram(mX);
+ mX = a.mX;
+ clRetainProgram(mX);
+ }
+ return *this;
+ }
+
+ // Destructor
+ virtual ~Program()
+ { clReleaseProgram(mX); }
+
+ // Build the program (for all devices, and blocking)
+ // See clBuildProgram for the arguments.
+ // Return TRUE if OK, and FALSE otherwise.
+ bool build(const char * options,std::string & buildErrors);
+
+ // Get program binary for first device
+ bool getBinary(std::vector<unsigned char> & binary);
+
+ // Get a kernel for this program
+ // Return a new instance if OK, and 0 otherwise.
+ Kernel * createKernel(const char * kernel_name)
+ {
+ cl_int status;
+ cl_kernel k = clCreateKernel(mX,kernel_name,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (k == 0) return 0; // Failed
+ return new Kernel(k);
+ }
+
+private:
+
+ Program(); // not implemented
+ Program(cl_program x) : mX(x) { }
+
+ // OpenCL handle (always non 0)
+ cl_program mX;
+
+ friend class Context;
+
+}; // class Buffer
+
+} // namespace
+
+#endif // CLProgram_h
diff --git a/BealtoOpenCL/src/CLContext.cpp b/BealtoOpenCL/src/CLContext.cpp
new file mode 100644
index 0000000..7ca7848
--- /dev/null
+++ b/BealtoOpenCL/src/CLContext.cpp
@@ -0,0 +1,351 @@
+// OpenCL context
+// (c) EB Sep 2009
+
+#define _CRT_SECURE_NO_WARNINGS
+#include <sys/stat.h>
+#include <vector>
+#include <sstream>
+#include <stdlib.h>
+#include "CLError.h"
+#include "CLContext.h"
+#include "CLCommandQueue.h"
+#include "CLBuffer.h"
+#include "CLImage2D.h"
+#include "CLProgram.h"
+
+#ifdef Linux
+#define _stat stat
+#endif
+
+using namespace cl;
+
+Context::Context() : mX(0)
+{
+}
+
+Context::Context(const Context & a) : mX(a.mX)
+{
+ clRetainContext(mX);
+}
+
+Context & Context::operator = (const Context & a)
+{
+ if (a.mX != mX)
+ {
+ clReleaseContext(mX);
+ mX = a.mX;
+ clRetainContext(mX);
+ }
+ return *this;
+}
+
+Context::~Context()
+{
+ clReleaseContext(mX);
+}
+
+Context * Context::create(cl_device_type deviceType)
+{
+ cl_platform_id bestPlatform = 0;
+ std::vector<cl_device_id> devices;
+ int status;
+
+ // Platforms
+ cl_uint nPlatforms = 0;
+ std::vector<cl_platform_id> pIDs;
+ status = clGetPlatformIDs(0,0,&nPlatforms);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS || nPlatforms == 0) return 0; // Failed
+ pIDs.resize(nPlatforms,0);
+ status = clGetPlatformIDs(nPlatforms,&(pIDs[0]),&nPlatforms);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) return 0; // Failed
+
+ // printf("NPlatforms: %u\n",nPlatforms);
+
+ // Devices for each platform
+ for (unsigned int i=0;i<nPlatforms;i++)
+ {
+ cl_platform_id p = pIDs[i];
+
+ // Get all GPU devices for this platform
+ cl_uint nDevices = 0;
+ status = clGetDeviceIDs(p,deviceType,0,0,&nDevices);
+ // printf("NDevices: %u\n",nDevices);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS || nDevices == 0) continue;
+ devices.resize(nDevices,0);
+ status = clGetDeviceIDs(p,deviceType,nDevices,&(devices[0]),&nDevices);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) { devices.clear(); continue; }
+
+ bestPlatform = p;
+ break; // OK
+ }
+ if (devices.empty())
+ {
+ printf("No device found\n");
+ return 0;
+ }
+
+ cl_context_properties cp[4];
+ cp[0] = CL_CONTEXT_PLATFORM;
+ cp[1] = (cl_context_properties)bestPlatform;
+ cp[2] = cp[3] = 0;
+ cl_context c = clCreateContext(cp,(int)devices.size(),&(devices[0]),0,0,0);
+ if (c == 0) return 0;
+
+ Context * result = new Context();
+ result->mX = c;
+ result->mDevices = devices;
+
+ return result;
+}
+
+int Context::getNDevices() const
+{
+ return (int)mDevices.size();
+}
+
+CommandQueue * Context::createCommandQueue(int d,cl_command_queue_properties properties)
+{
+ cl_int status;
+ if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D
+ cl_command_queue q = clCreateCommandQueue(mX,mDevices[d],properties,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (q == 0) return 0; // Failed
+ return new CommandQueue(q);
+}
+
+Buffer * Context::createBuffer(cl_mem_flags flags,size_t size,void * host_ptr)
+{
+ cl_int status;
+ cl_mem m = clCreateBuffer(mX,flags,size,host_ptr,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (m == 0) return 0; // Failed
+ return new Buffer(m);
+}
+
+Image2D * Context::createImage2D(cl_mem_flags flags,const cl_image_format * image_format,
+ size_t width,size_t height,size_t pitch,void * host_ptr)
+{
+ cl_int status;
+ cl_mem m = clCreateImage2D(mX,flags,image_format,width,height,pitch,host_ptr,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (m == 0) return 0; // Failed
+ return new Image2D(m);
+}
+
+Program * Context::createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths)
+{
+ cl_int status;
+ cl_program p = clCreateProgramWithSource(mX,count,strings,lengths,&status);
+ REPORT_OPENCL_STATUS(status);
+ if (p == 0) return 0; // Failed
+ return new Program(p);
+}
+
+Program * Context::createProgramWithFiles(cl_uint count,const char ** filenames)
+{
+ if (count == 0 || filenames == 0) return 0; // Invalid
+
+ std::vector<char *> strings;
+ std::vector<size_t> lengths;
+ strings.resize(count,0);
+ lengths.resize(count,0);
+ bool ok = true;
+ for (unsigned int i=0;i<count;i++)
+ {
+ // Get file size and check it exists
+ struct _stat s;
+ int status = ::_stat(filenames[i],&s);
+ if (status != 0) { ok = false; break; }
+ size_t sz = s.st_size;
+ if (sz == 0) { ok = false; break; }
+
+ // Read the file
+ FILE * f = fopen(filenames[i],"rb");
+ if (f == 0) { ok = false; break; }
+ char * buffer = (char *)malloc(sz);
+ strings[i] = buffer;
+ lengths[i] = sz;
+
+ size_t readBlocks = fread(buffer,sz,1,f);
+ fclose(f);
+ if (readBlocks != 1) { ok = false; break; } // read failed
+ }
+ Program * result = 0;
+ if (ok) result = createProgramWithSource(count,(const char **)&(strings[0]),&(lengths[0]));
+
+ // Cleanup
+ for (unsigned int i=0;i<count;i++) if (strings[i] != 0)
+ { free(strings[i]); }
+
+ return result;
+}
+
+Program * Context::createProgramWithFiles(const std::vector<std::string> & filenames)
+{
+ cl_uint n = (int)filenames.size();
+ if (n == 0) return 0; // Empty
+ std::vector<const char *> files(n,0);
+ for (cl_uint i=0;i<n;i++)
+ {
+ if (filenames[i].empty()) continue; // 0 if empty
+ files[i] = filenames[i].c_str();
+ }
+ return createProgramWithFiles(n,&(files[0]));
+}
+
+bool Context::getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret)
+{
+ if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D
+ cl_int status = clGetDeviceInfo(mDevices[d],param_name,param_value_size,param_value,param_value_size_ret);
+ REPORT_OPENCL_STATUS(status);
+ return (status == CL_SUCCESS);
+}
+
+bool Context::getAllDeviceInfo(int d,std::string & info)
+{
+ cl_uint ui;
+ cl_ulong ul;
+ cl_bool b;
+ info.clear();
+ std::ostringstream o;
+ o << "Device info (ID=0x" << mDevices[d] << ")\n";
+
+ cl_device_type dType;
+ if (!getDeviceInfo(d,CL_DEVICE_TYPE,dType)) return false;
+ o << "- type:";
+ if (dType & CL_DEVICE_TYPE_CPU) o << " cpu";
+ if (dType & CL_DEVICE_TYPE_GPU) o << " gpu";
+ if (dType & CL_DEVICE_TYPE_ACCELERATOR) o << " accelerator";
+ o << "\n";
+
+ if (!getDeviceInfo(d,CL_DEVICE_VENDOR_ID,ui)) return false;
+ o << "- vendor id: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_COMPUTE_UNITS,ui)) return false;
+ o << "- max compute units: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,ui)) return false;
+ o << "- max work item dimensions: " << ui << "\n";
+ size_t sz[24];
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_SIZES,24*sizeof(size_t),sz,0)) return false;
+ o << "- max work item sizes:";
+ for (cl_uint i=0;i<ui;i++) o << " " << sz[i];
+ o << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_GROUP_SIZE,sz[0])) return false;
+ o << "- max work group size: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,ui)) return false;
+ o << "- preferred vector width char: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,ui)) return false;
+ o << "- preferred vector width short: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,ui)) return false;
+ o << "- preferred vector width int: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,ui)) return false;
+ o << "- preferred vector width long: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,ui)) return false;
+ o << "- preferred vector width float: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,ui)) return false;
+ o << "- preferred vector width double: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_CLOCK_FREQUENCY,ui)) return false;
+ o << "- max clock frequency: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_ADDRESS_BITS,ui)) return false;
+ o << "- address bits: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,ul)) return false;
+ o << "- max mem alloc size: " << ul << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,b)) return false;
+ o << "- image support: " << (b?"yes":"no") << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_READ_IMAGE_ARGS,ui)) return false;
+ o << "- max read image args: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_WRITE_IMAGE_ARGS,ui)) return false;
+ o << "- max write image args: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,sz[0])) return false;
+ o << "- image2d max width: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,sz[0])) return false;
+ o << "- image2d max height: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_WIDTH,sz[0])) return false;
+ o << "- image3d max width: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_HEIGHT,sz[0])) return false;
+ o << "- image3d max height: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_DEPTH,sz[0])) return false;
+ o << "- image3d max depth: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_SAMPLERS,ui)) return false;
+ o << "- max samplers: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_PARAMETER_SIZE,sz[0])) return false;
+ o << "- max parameter size: " << sz[0] << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MEM_BASE_ADDR_ALIGN,ui)) return false;
+ o << "- mem base addr align: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,ui)) return false;
+ o << "- min data type align size: " << ui << "\n";
+
+ cl_device_fp_config fpc;
+ if (!getDeviceInfo(d,CL_DEVICE_SINGLE_FP_CONFIG,fpc)) return false;
+ o << "- single fp config:";
+ if (fpc & CL_FP_DENORM) o << " denorm";
+ if (fpc & CL_FP_INF_NAN) o << " inf_nan";
+ if (fpc & CL_FP_ROUND_TO_NEAREST) o << " round_to_nearest";
+ if (fpc & CL_FP_ROUND_TO_ZERO) o << " round_to_zero";
+ if (fpc & CL_FP_ROUND_TO_INF) o << " round_to_inf";
+ if (fpc & CL_FP_FMA) o << " fma";
+ o << "\n";
+
+ cl_device_mem_cache_type mct;
+ if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,mct)) return false;
+ o << "- global mem cache type:";
+ if (mct == CL_NONE) o << " none";
+ else if (mct == CL_READ_ONLY_CACHE) o << " read_only_cache";
+ else if (mct == CL_READ_WRITE_CACHE) o << " read_write_cache";
+ o << "\n";
+
+ if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,ui)) return false;
+ o << "- global mem cacheline size: " << ui << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,ul)) return false;
+ o << "- global mem cache size: " << ul << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_SIZE,ul)) return false;
+ o << "- global mem size: " << ul << "\n";
+
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,ul)) return false;
+ o << "- max constant buffer size: " << ul << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_ARGS,ui)) return false;
+ o << "- max constant args: " << ui << "\n";
+
+ cl_device_local_mem_type lmt;
+ if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_TYPE,lmt)) return false;
+ o << "- local mem type:";
+ if (lmt == CL_LOCAL) o << " local";
+ else o << " global";
+ o << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_SIZE,ul)) return false;
+ o << "- local mem size: " << ul << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_ERROR_CORRECTION_SUPPORT,b)) return false;
+ o << "- error correction support: " << (b?"yes":"no") << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_PROFILING_TIMER_RESOLUTION,sz[0])) return false;
+ o << "- profiling timer resolution: " << sz[0] << " ns\n";
+ if (!getDeviceInfo(d,CL_DEVICE_ENDIAN_LITTLE,b)) return false;
+ o << "- endian little: " << (b?"yes":"no") << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_AVAILABLE,b)) return false;
+ o << "- available: " << (b?"yes":"no") << "\n";
+ if (!getDeviceInfo(d,CL_DEVICE_COMPILER_AVAILABLE,b)) return false;
+ o << "- compiler available: " << (b?"yes":"no") << "\n";
+
+ cl_device_exec_capabilities ec;
+ if (!getDeviceInfo(d,CL_DEVICE_EXECUTION_CAPABILITIES,ec)) return false;
+ o << "- execution capabilities:";
+ if (ec & CL_EXEC_KERNEL) o << " exec_kernel";
+ if (ec & CL_EXEC_NATIVE_KERNEL) o << " exec_native_kernel";
+ o << "\n";
+
+ cl_command_queue_properties qp;
+ if (!getDeviceInfo(d,CL_DEVICE_QUEUE_PROPERTIES,qp)) return false;
+ o << "- queue properties:";
+ if (ec & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) o << " out_of_order_exec_mode";
+ if (ec & CL_QUEUE_PROFILING_ENABLE) o << " profiling";
+ o << "\n";
+
+ char exts[8192];
+ if (!getDeviceInfo(d,CL_DEVICE_EXTENSIONS,8192,exts,0)) return false;
+ o << "- extensions: " << exts << "\n";
+
+ info.assign(o.str());
+ return true;
+}
diff --git a/BealtoOpenCL/src/CLError.cpp b/BealtoOpenCL/src/CLError.cpp
new file mode 100644
index 0000000..1fcd12b
--- /dev/null
+++ b/BealtoOpenCL/src/CLError.cpp
@@ -0,0 +1,82 @@
+// OpenCL error reporting
+// (c) EB Sep 2009
+
+#if CONFIG_USE_QT
+#include <QtCore/QDebug>
+#include <QtCore/QString>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <CL/cl.h>
+#include "CLError.h"
+
+const int NErrorCodes = 63;
+static const char * ErrorCodes[NErrorCodes] = {
+"CL_SUCCESS", // 0
+"CL_DEVICE_NOT_FOUND", // -1
+"CL_DEVICE_NOT_AVAILABLE", // -2
+"CL_COMPILER_NOT_AVAILABLE", // -3
+"CL_MEM_OBJECT_ALLOCATION_FAILURE", // -4
+"CL_OUT_OF_RESOURCES", // -5
+"CL_OUT_OF_HOST_MEMORY", // -6
+"CL_PROFILING_INFO_NOT_AVAILABLE", // -7
+"CL_MEM_COPY_OVERLAP", // -8
+"CL_IMAGE_FORMAT_MISMATCH", // -9
+"CL_IMAGE_FORMAT_NOT_SUPPORTED", // -10
+"CL_BUILD_PROGRAM_FAILURE", // -11
+"CL_MAP_FAILURE", // -12
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // -13..-29
+"CL_INVALID_VALUE", // -30
+"CL_INVALID_DEVICE_TYPE", // -31
+"CL_INVALID_PLATFORM", // -32
+"CL_INVALID_DEVICE", // -33
+"CL_INVALID_CONTEXT", // -34
+"CL_INVALID_QUEUE_PROPERTIES", // -35
+"CL_INVALID_COMMAND_QUEUE", // -36
+"CL_INVALID_HOST_PTR", // -37
+"CL_INVALID_MEM_OBJECT", // -38
+"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", // -39
+"CL_INVALID_IMAGE_SIZE", // -40
+"CL_INVALID_SAMPLER", // -41
+"CL_INVALID_BINARY", // -42
+"CL_INVALID_BUILD_OPTIONS", // -43
+"CL_INVALID_PROGRAM", // -44
+"CL_INVALID_PROGRAM_EXECUTABLE", // -45
+"CL_INVALID_KERNEL_NAME", // -46
+"CL_INVALID_KERNEL_DEFINITION", // -47
+"CL_INVALID_KERNEL", // -48
+"CL_INVALID_ARG_INDEX", // -49
+"CL_INVALID_ARG_VALUE", // -50
+"CL_INVALID_ARG_SIZE", // -51
+"CL_INVALID_KERNEL_ARGS", // -52
+"CL_INVALID_WORK_DIMENSION", // -53
+"CL_INVALID_WORK_GROUP_SIZE", // -54
+"CL_INVALID_WORK_ITEM_SIZE", // -55
+"CL_INVALID_GLOBAL_OFFSET", // -56
+"CL_INVALID_EVENT_WAIT_LIST", // -57
+"CL_INVALID_EVENT", // -58
+"CL_INVALID_OPERATION", // -59
+"CL_INVALID_GL_OBJECT", // -60
+"CL_INVALID_BUFFER_SIZE", // -61
+"CL_INVALID_MIP_LEVEL" // -62
+};
+
+void cl::reportStatus(const char * file,int line,const char * function,int status)
+{
+ if (status >= 0) return;
+ status = -status;
+#if CONFIG_USE_QT
+ QString e;
+ if (status >= NErrorCodes || ErrorCodes[status] == 0) e = QString("%1").arg(status);
+ else e = ErrorCodes[status];
+ qDebug() << QString("OpenCL ERROR %1(%2),%3: %4").arg(file).arg(line).arg(function).arg(e);
+#else
+ const char * e = 0;
+ if (status >= NErrorCodes || ErrorCodes[status] == 0) e = "???";
+ else e = ErrorCodes[status];
+ printf("Error %s(%d),%s,%s\n",file,line,function,e);
+#endif
+#ifdef Linux
+ // exit(1);
+#endif
+}
diff --git a/BealtoOpenCL/src/CLProgram.cpp b/BealtoOpenCL/src/CLProgram.cpp
new file mode 100644
index 0000000..731c385
--- /dev/null
+++ b/BealtoOpenCL/src/CLProgram.cpp
@@ -0,0 +1,84 @@
+// OpenCL program object
+// (c) EB Sep 2009
+
+#include <stdlib.h>
+#include <string.h>
+#include "CLProgram.h"
+
+using namespace cl;
+
+bool Program::build(const char * options,std::string & buildErrors)
+{
+ buildErrors.clear();
+ cl_int status;
+
+ // Get devices
+ cl_uint nDevices = 0;
+ status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS || nDevices == 0) { buildErrors.append("clGetProgramInfo NUM_DEVICES failed\n"); return false; }
+ std::vector<cl_device_id> devices;
+ devices.resize(nDevices,0);
+ status = clGetProgramInfo(mX,CL_PROGRAM_DEVICES,sizeof(cl_device_id)*nDevices,&(devices[0]),0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) { buildErrors.append("clGetProgramInfo DEVICES failed\n"); return false; }
+
+ // Build program
+ status = clBuildProgram(mX,nDevices,&(devices[0]),options,0,0);
+ REPORT_OPENCL_STATUS(status);
+ if (status == CL_SUCCESS) return true; // OK
+
+ // Get build info for each device
+ for (unsigned int i=0;i<nDevices;i++)
+ {
+ cl_device_id d = devices[i];
+ char aux[4000];
+ status = clGetProgramBuildInfo(mX,d,CL_PROGRAM_BUILD_LOG,4000,aux,0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) { buildErrors.append("clGetProgramBuildInfo failed\n"); continue; }
+ buildErrors.append(aux);
+ }
+
+ buildErrors.append("Build failed\n");
+ return false;
+}
+
+// Get program binary for first device
+bool Program::getBinary(std::vector<unsigned char> & binary)
+{
+ cl_int status;
+ binary.clear();
+
+ // Get devices
+ cl_uint nDevices = 0;
+ status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS || nDevices == 0) return false;
+ // Get binary sizes for all devices
+ std::vector<size_t> binarySize(nDevices,0);
+ status = clGetProgramInfo(mX,CL_PROGRAM_BINARY_SIZES,nDevices*sizeof(size_t),&(binarySize[0]),0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) return false;
+ // Alloc buffers
+ std::vector<unsigned char *> buf(nDevices,0);
+ for (cl_uint i=0;i<nDevices;i++)
+ {
+ if (binarySize[i]>0) buf[i] = (unsigned char *)malloc(binarySize[i]);
+ }
+ status = clGetProgramInfo(mX,CL_PROGRAM_BINARIES,nDevices*sizeof(unsigned char *),&(buf[0]),0);
+ REPORT_OPENCL_STATUS(status);
+ if (status != CL_SUCCESS) return false;
+ // Copy result for first device
+ if (binarySize[0]>0)
+ {
+ binary.resize(binarySize[0],0);
+ memcpy(&(binary[0]),buf[0],binarySize[0]);
+ }
+ // Free buffers
+ for (cl_uint i=0;i<nDevices;i++)
+ {
+ if (buf[i] != 0) free(buf[i]);
+ }
+ return true; // OK
+}
+
diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.sln b/BealtoOpenCL/vs2008/BealtoOpenCL.sln
new file mode 100644
index 0000000..8ccd961
--- /dev/null
+++ b/BealtoOpenCL/vs2008/BealtoOpenCL.sln
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj
new file mode 100644
index 0000000..ce9faa9
--- /dev/null
+++ b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj
@@ -0,0 +1,369 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9,00"
+ Name="BealtoOpenCL"
+ ProjectGUID="{86930221-164E-4E80-B8C0-50DCB5740B37}"
+ RootNamespace="BealtoOpenCL"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="4"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_LIB"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-32\$(ProjectName)d.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug|x64"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="4"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_LIB;CONFIG_USE_QT=1"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ EnableEnhancedInstructionSet="0"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-64\$(ProjectName)d.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="4"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-32\$(ProjectName).lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="4"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;CONFIG_USE_QT=1"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="0"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-64\$(ProjectName).lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\src\CLContext.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\src\CLError.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\src\CLProgram.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\include\BealtoOpenCL.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLBuffer.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLCommandQueue.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLContext.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLError.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLEvent.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLEventList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLImage2D.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLKernel.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLMemoryObject.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLProgram.h"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/MPBenchmarks/ArithmeticTasks.cpp b/MPBenchmarks/ArithmeticTasks.cpp
new file mode 100644
index 0000000..670f1d6
--- /dev/null
+++ b/MPBenchmarks/ArithmeticTasks.cpp
@@ -0,0 +1,294 @@
+// Arithmetic tasks
+// (c) EB Nov 2009
+
+#include <stdlib.h>
+#include "ArithmeticTasks.h"
+
+#ifdef Linux
+typedef short __int16;
+typedef int __int32;
+typedef long long int __int64;
+#endif
+
+// Get the ratio of user bits per stored bits for a given word size
+// (constants defined in OpenCL code)
+inline double useRatio(int ws)
+{
+ switch (ws)
+ {
+ case 16: return 14.0/16.0;
+ case 32: return 30.0/32.0;
+ case 64: return 62.0/64.0;
+ }
+ return 1.0;
+}
+
+double AddNGPUTask::run(int workgroupSize,size_t sz)
+{
+ if (!mBuildOK) return -1;
+
+ cl::Context * c = getContext();
+ cl::CommandQueue * q = getQueue();
+ cl::Program * p = getProgram();
+ if (c == 0 || q == 0 || p == 0) return -1;
+
+ // Check allocation size
+ size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+ if (3*sz > max_sz) return -1;
+
+ cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ cl::Buffer * b = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ cl::Kernel * kernel = 0;
+ if (mVariant == ADDN_V1) kernel = p->createKernel("add_v1");
+ else if (mVariant == ADDN_V2) kernel = p->createKernel("add_v2");
+ unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+ int n = (int)sz / (mWS>>3); // N words in SZ
+ double mbps = -1;
+ if (kernel == 0 || a == 0 || b == 0 || y == 0 || buf == 0) goto END;
+
+ // Initialize A and B
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+ if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+ if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END;
+
+ // Run tests, double nOps until min time is reached
+ kernel->setArg(0,a);
+ kernel->setArg(1,b);
+ kernel->setArg(2,y);
+ if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+ for (int nOps = 1; ; nOps <<= 1)
+ {
+ double t0 = getT();
+ for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+ q->finish();
+ double t = (getT() - t0);
+ if (t < MIN_RUNNING_TIME) continue;
+ // OK, t is large enough
+ t /= (double)nOps;
+ mbps = (double)sz*useRatio(mWS)*1.0e-6/t; // MB/s
+ break;
+ }
+
+END: // Cleanup
+ if (buf != 0) _aligned_free(buf);
+ if (a != 0) delete a;
+ if (b != 0) delete b;
+ if (y != 0) delete y;
+ if (kernel != 0) delete kernel;
+
+ return mbps;
+}
+
+double Mul1GPUTask::run(int workgroupSize,size_t sz)
+{
+ if (!mBuildOK) return -1;
+
+ cl::Context * c = getContext();
+ cl::CommandQueue * q = getQueue();
+ cl::Program * p = getProgram();
+ if (c == 0 || q == 0 || p == 0) return -1;
+
+ // Check allocation size
+ size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+ if (2*sz > max_sz) return -1;
+
+ cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ cl::Kernel * kernel = 0;
+ if (mVariant == MUL1_V1) kernel = p->createKernel("mul1_v1");
+
+ unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+ int n = (int)sz>>2; // 4 bytes/word
+ const int kk = 0x2FEFEFEF;
+ double mbps = -1;
+ if (kernel == 0 || a == 0 || y == 0 || buf == 0) goto END;
+
+ // Initialize A
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+ if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+
+ // Run tests, double nOps until min time is reached
+ kernel->setArg(0,kk);
+ kernel->setArg(1,a);
+ kernel->setArg(2,y);
+ if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+ for (int nOps = 1; ; nOps <<= 1)
+ {
+ double t0 = getT();
+ for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+ q->finish();
+ double t = (getT() - t0);
+ if (t < MIN_RUNNING_TIME) continue;
+ // OK, t is large enough
+ t /= (double)nOps;
+ mbps = (double)sz*useRatio(32)*1.0e-6/t; // MB/s
+ break;
+ }
+
+END: // Cleanup
+ if (buf != 0) _aligned_free(buf);
+ if (a != 0) delete a;
+ if (y != 0) delete y;
+ if (kernel != 0) delete kernel;
+
+ return mbps;
+}
+
+// CPU
+
+struct AddNThreadParam
+{
+ int nOps; // number of loops
+ int index; // Block index
+ size_t sz; // size to process in thread
+ int wordSize; // 32 or 64
+ int variant; // ADD_V1, ADD_V2
+ const unsigned char * a;
+ const unsigned char * b;
+ unsigned char * out;
+};
+
+template <typename T,int LOG_BASE> void addNv1(AddNThreadParam * p)
+{
+ size_t n = p->sz / sizeof(T); // Words in block
+ const T * a = (const T *)(p->a);
+ const T * b = (const T *)(p->b);
+ T * out = (T *)(p->out);
+ T BASE = (T)1<<(T)LOG_BASE;
+ T BASE_MINUS1 = BASE - (T)1;
+
+ for (int it=0;it<p->nOps;it++)
+ {
+ T t = 0; // "carry" from previous word
+ if (p->index > 0)
+ {
+ // Get "carry" for last values of previous block
+ t = (a[-1] + b[-1]) >> (T)LOG_BASE;
+ }
+
+ for (size_t i=0;i<n;i++)
+ {
+ T s = a[i] + b[i];
+ out[i] = (s & BASE_MINUS1) + t;
+ t = s >> (T)LOG_BASE;
+ }
+ }
+}
+
+template <typename T,int LOG_BASE> void addNv2(AddNThreadParam * p)
+{
+ size_t n = p->sz / sizeof(T); // Words in block
+ const T * a = (const T *)(p->a);
+ const T * b = (const T *)(p->b);
+ T * out = (T *)(p->out);
+
+ for (int it=0;it<p->nOps;it++)
+ {
+ for (size_t i=0;i<n;i++)
+ {
+ out[i] = a[i] + b[i];
+ }
+ }
+}
+
+#ifndef Linux // Uses Win32 intrinsics
+void addNv3(AddNThreadParam * p)
+{
+ const int LOG_BASE = 30;
+ const int BASE = 1<<LOG_BASE;
+ const int BASE_MINUS1 = BASE - 1;
+
+ size_t n = p->sz >> 4; // We process blocks of 16 bytes
+ const __m128i * a = (const __m128i *)(p->a);
+ const __m128i * b = (const __m128i *)(p->b);
+ __m128i * out = (__m128i *)(p->out);
+
+ __m128i t;
+ __m128i mask = _mm_set1_epi32(BASE_MINUS1);
+ for (int it=0;it<p->nOps;it++)
+ {
+ // TODO: initialize T with the carry from the end of previous block
+ t = _mm_setzero_si128();
+ for (size_t i=0;i<n;i++)
+ {
+ __m128i s = _mm_add_epi32(_mm_load_si128(a+i),_mm_load_si128(b+i));
+ __m128i o = _mm_add_epi32(_mm_and_si128(s,mask),t);
+ _mm_store_si128(out+i,o);
+ t = _mm_srli_epi32(s,LOG_BASE);
+ }
+ }
+}
+#endif
+
+thread_return_t AddNThread(void * x)
+{
+ AddNThreadParam * p = (AddNThreadParam *)x;
+
+ if (p->wordSize == 16 && p->variant == ADDN_V1) addNv1<__int16,14>(p);
+ else if (p->wordSize == 32 && p->variant == ADDN_V1) addNv1<__int32,30>(p);
+ else if (p->wordSize == 64 && p->variant == ADDN_V1) addNv1<__int64,62>(p);
+ else if (p->wordSize == 16 && p->variant == ADDN_V2) addNv2<__int16,14>(p);
+ else if (p->wordSize == 32 && p->variant == ADDN_V2) addNv2<__int32,30>(p);
+ else if (p->wordSize == 64 && p->variant == ADDN_V2) addNv2<__int64,62>(p);
+#ifndef Linux
+ else if (p->wordSize == 32 && p->variant == ADDN_V3) addNv3(p);
+#endif
+
+ return 0;
+}
+
+double AddNCPUTask::run(int nThreads,size_t sz)
+{
+ if (nThreads <= 0 || sz <= 0) return -1; // invalid
+ size_t sz1 = sz / nThreads;
+ if (sz1 <= 0) return -1; // SZ too small
+
+ unsigned char * in_a = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ unsigned char * in_b = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ double mbps = -1;
+
+ std::vector<AddNThreadParam> params(nThreads);
+ for (int nOps=1; ;nOps<<=1)
+ {
+ // Initialize memory
+ for (size_t i=0;i<sz;i++)
+ {
+ in_a[i] = (unsigned char)(i & 0xFF);
+ in_b[i] = (unsigned char)((i+99) & 0xFF);
+ out[i] = 0;
+ }
+ // setup params
+ for (int i=0;i<nThreads;i++)
+ {
+ params[i].a = in_a + i*sz1;
+ params[i].b = in_b + i*sz1;
+ params[i].out = out + i*sz1;
+ params[i].sz = sz1;
+ params[i].nOps = nOps;
+ params[i].wordSize = mWS;
+ params[i].variant = mVariant;
+ params[i].index = i;
+ }
+ // run threads
+ double t = runCPUThreads(params,AddNThread);
+ // Check result
+ if (nOps == 1)
+ {
+ // ZZZ: do it!
+ }
+
+ if (t < MIN_RUNNING_TIME) continue; // Too short
+ t /= (double)nOps;
+
+ mbps = (double)sz1*(double)nThreads*useRatio(mWS)*1.0e-6/t;
+ break;
+ } // nOps loop
+
+ _aligned_free(in_a);
+ _aligned_free(in_b);
+ _aligned_free(out);
+ return mbps;
+}
diff --git a/MPBenchmarks/ArithmeticTasks.h b/MPBenchmarks/ArithmeticTasks.h
new file mode 100644
index 0000000..fd322a5
--- /dev/null
+++ b/MPBenchmarks/ArithmeticTasks.h
@@ -0,0 +1,103 @@
+// Arithmetic tasks
+// (c) EB Nov 2009
+
+#ifndef ArithmeticTasks_h
+#define ArithmeticTasks_h
+
+#include "Config.h"
+#include "GPUTask.h"
+#include "CPUTask.h"
+
+enum AddNVariants
+{
+ ADDN_V1 = 0,
+ ADDN_V2,
+ ADDN_V3,
+ NB_ADDN_VARIANTS
+};
+
+enum Mul1Variants
+{
+ MUL1_V1 = 0,
+ NB_MUL1_VARIANTS
+};
+
+
+// Sum two buffers
+class AddNGPUTask : public GPUTask
+{
+
+public:
+
+ AddNGPUTask(int variant,int wordSize,Logger * log) : GPUTask("gpu_add.cl",log)
+ {
+ mWS = wordSize;
+ mVariant = variant;
+ char options[200];
+ _snprintf(options,200,"-DWORD_SIZE=%d",mWS);
+ mBuildOK = buildProgram(options,log);
+ if (!mBuildOK) fprintf(stderr,"Build failed\n");
+ }
+
+ double run(int workgroupSize,size_t sz);
+
+private:
+
+ // Word size
+ int mWS;
+ // Variant
+ int mVariant;
+ // Build OK?
+ bool mBuildOK;
+
+};
+
+// Multiply buffer by 1 digit
+class Mul1GPUTask : public GPUTask
+{
+
+public:
+
+ Mul1GPUTask(int variant,int blockSize,Logger * log) : GPUTask("gpu_mul1.cl",log)
+ {
+ mBS = blockSize;
+ mVariant = variant;
+ char options[200];
+ const int logBase = 30;
+ const int base = 1<<logBase;
+ const int baseMinus1 = base-1;
+ _snprintf(options,200,"-DLOG_BASE=%d -DBASE=%d -DBASE_MINUS1=%d -DBLOCK_SIZE=%d",logBase,base,baseMinus1,blockSize);
+ mBuildOK = buildProgram(options,log);
+ }
+
+ double run(int workgroupSize,size_t sz);
+
+private:
+
+ // Build OK?
+ bool mBuildOK;
+ // Block size
+ int mBS;
+ // Variant
+ int mVariant;
+
+};
+
+// CPU
+
+class AddNCPUTask : public CPUTask
+{
+public:
+ AddNCPUTask(int variant,int wordSize) : mWS(wordSize), mVariant(variant) { }
+
+ double run(int nThreads,size_t sz);
+
+private:
+
+ // Word size (32 or 64)
+ int mWS;
+ // Variant
+ int mVariant;
+};
+
+#endif // ArithmeticTasks_h
diff --git a/MPBenchmarks/CPUTask.h b/MPBenchmarks/CPUTask.h
new file mode 100644
index 0000000..62fe768
--- /dev/null
+++ b/MPBenchmarks/CPUTask.h
@@ -0,0 +1,22 @@
+// CPU task base class
+// (c) EB Nov 2009
+
+#ifndef CPUTask_h
+#define CPUTask_h
+
+class CPUTask
+{
+public:
+
+ // Destructor
+ virtual ~CPUTask() { }
+
+ // Run task on NTHREADS threads and output size SZ.
+ // If SZ is not a multiple of NTHREADS, SZ is rounded down to
+ // the closest multiple.
+ // Return throughput in MB/s of output, or -1 on error.
+ virtual double run(int nThreads,size_t sz) = 0;
+
+};
+
+#endif // CPUTask_h
diff --git a/MPBenchmarks/Config.h b/MPBenchmarks/Config.h
new file mode 100644
index 0000000..c9a72ff
--- /dev/null
+++ b/MPBenchmarks/Config.h
@@ -0,0 +1,172 @@
+// Config flags, common includes, and utils
+// (c) EB Nov 2009
+
+#ifndef Config_h
+#define Config_h
+
+// Configuration. The flags may be defined on the compiler command line,
+// and in this case we don't change them here.
+
+// Use Qt interface?
+#ifndef CONFIG_USE_QT
+#define CONFIG_USE_QT 1
+#endif
+
+// Use MPIR library to cross-check results?
+#ifndef CONFIG_USE_MPIR
+#define CONFIG_USE_MPIR 0
+#endif
+
+#ifdef WIN32
+#define _CRT_SECURE_NO_WARNINGS
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+// Minimal run time required to have a valid measurement (s)
+const double MIN_RUNNING_TIME = 0.2;
+
+// CPU memory alloc alignment (bytes)
+const int ALLOC_ALIGN = 4096;
+
+#include <malloc.h>
+#include <math.h>
+#include <string>
+
+// Windows specific
+#ifdef WIN32
+#include <emmintrin.h>
+#include <windows.h>
+inline double getT()
+{
+ LARGE_INTEGER c,f;
+ QueryPerformanceFrequency(&f);
+ QueryPerformanceCounter(&c);
+ return (double)c.QuadPart/(double)f.QuadPart;
+}
+#endif
+
+// Linux specific
+#ifdef Linux
+#include <sys/time.h>
+#include <pthread.h>
+inline void * _aligned_malloc(size_t sz,size_t a) { return memalign(a,sz); }
+inline void _aligned_free(void * x) { free(x); }
+#define _snprintf snprintf
+inline double getT()
+{
+ struct timeval tv;
+ gettimeofday(&tv,0);
+ return (double)tv.tv_sec+1.0e-6*(double)tv.tv_usec;
+}
+#endif
+
+// Common
+#if CONFIG_USE_MPIR
+#include <mpir.h>
+#endif
+#include "BealtoOpenCL.h"
+
+// Text logger
+class Logger
+{
+public:
+ virtual ~Logger() { }
+ virtual void append(const char * s) { printf("%s\n",s); fflush(stdout); }
+ void append(const std::string & s) { append(s.c_str()); }
+ virtual void appendTitle(const char * s) { printf("\n*** %s\n\n",s); fflush(stdout); }
+ void appendTitle(const std::string & s) { appendTitle(s.c_str()); }
+};
+
+#if CONFIG_USE_QT
+#include <QtGui/QTextEdit>
+#include <QtGui/QApplication>
+// Qt QTextEdit logger
+class QtLogger : public Logger
+{
+public:
+ QtLogger(QTextEdit * e) : mE(e) { }
+ void append(const char * s)
+ {
+ QTextCharFormat f;
+ f.setFontFamily("fixed");
+ mE->setCurrentCharFormat(f);
+ mE->append(s);
+ qApp->processEvents();
+ }
+ void appendTitle(const char * s)
+ {
+ QTextCharFormat f;
+ f.setFontWeight(QFont::Bold);
+ mE->setCurrentCharFormat(f);
+ mE->append(s);
+ qApp->processEvents();
+ }
+private:
+ QTextEdit * mE;
+};
+#endif
+
+// Return user friendly size from LOG (size = 1<<LOG) in S.
+inline void getUserSize(int log,std::string & s)
+{
+ if (log < 0 || log >= 40) { s.assign("???"); return; }
+ char aux[200];
+ if (log < 10) _snprintf(aux,200,"%d B",1<<log);
+ else if (log < 20) _snprintf(aux,200,"%d KiB",1<<(log-10));
+ else if (log < 30) _snprintf(aux,200,"%d MiB",1<<(log-20));
+ else if (log < 40) _snprintf(aux,200,"%d GiB",1<<(log-30));
+ s.assign(aux);
+}
+
+#ifdef Linux
+typedef void * (*ThreadProc)(void *);
+#define thread_return_t void *
+#endif
+#ifdef WIN32
+typedef LPTHREAD_START_ROUTINE ThreadProc;
+#define thread_return_t DWORD WINAPI
+#endif
+
+// Run one CPU thread calling F for each parameter in the PARAMS array.
+// Return the real time of execution, or -1 on error.
+template <class P> double runCPUThreads(std::vector<P> & params,ThreadProc f)
+{
+ int nt = (int)params.size(); // thread count
+ if (nt <= 0) return -1; // invalid
+ double t0 = getT();
+ bool ok = true;
+#ifdef Linux
+ std::vector<pthread_t> threads(nt,0);
+ // start all threads
+ for (int i=0;i<nt;i++)
+ {
+ int s = pthread_create(&(threads[i]),0,f,&(params[i]));
+ if (s != 0) { ok = false; threads[i] = 0; }
+ }
+ // join all threads
+ for (int i=0;i<nt;i++)
+ {
+ if (threads[i] == 0) continue; // creation failed
+ int s = pthread_join(threads[i],0);
+ if (s != 0) ok = false;
+ }
+#endif
+#ifdef WIN32
+ std::vector<HANDLE> threads(nt,0);
+ // start all threads
+ for (int i=0;i<nt;i++)
+ {
+ threads[i] = CreateThread(0,0,f,&(params[i]),0,0);
+ if (threads[i] == 0) ok = false;
+ }
+ // wait for all threads to terminate
+ WaitForMultipleObjects(nt,&(threads[0]),TRUE,INFINITE);
+ // delete threads
+ for (int i=0;i<nt;i++) CloseHandle(threads[i]);
+#endif
+ if (!ok) return -1; // error
+ double t = getT() - t0;
+ return t;
+}
+
+#endif // Config_h
diff --git a/MPBenchmarks/Conversions.h b/MPBenchmarks/Conversions.h
new file mode 100644
index 0000000..1ec9eb9
--- /dev/null
+++ b/MPBenchmarks/Conversions.h
@@ -0,0 +1,74 @@
+// Convert 30-bit and MPZ numbers to 32-bit hexa strings for comparisons
+// (c) EB Nov 2009
+
+#ifndef Conversions_h
+#define Conversions_h
+
+// Convert X[N] from redundand representation in base 2^30
+// to an hex string.
+void convert30(int n,const int * x,std::string & out)
+{
+ int nAux = 0; // Valid bits in aux
+ __int64 aux = 0;
+ std::vector<unsigned int> y(n,0);
+ int j = 0; // Next value in Y receiving bits
+
+ for (int i=0;i<n;i++)
+ {
+ // Insert X[i] into AUX
+ aux += ((__int64)x[i]<<nAux);
+ nAux += 30;
+ // Extract 32 bits if available
+ if (nAux>=32)
+ {
+ y[j++] = (unsigned int)(aux&0xFFFFFFFF);
+ aux >>= 32;
+ nAux -= 32;
+ }
+ }
+ // Extract last bits
+ while (nAux > 0)
+ {
+ y[j++] = (unsigned int)(aux&0xFFFFFFFF);
+ aux >>= 32;
+ nAux -= 32;
+ }
+ out.clear();
+ out.reserve(9*n+20);
+ char s[32];
+ bool first = true;
+ for (int i=n-1;i>=0;i--)
+ {
+ if (first)
+ {
+ if (y[i] == 0) continue;
+ first = false;
+ }
+ else out.push_back('.');
+ _snprintf(s,32,"%08X",y[i]);
+ out.append(s);
+ }
+ if (first) out.append("0");
+}
+
+#if CONFIG_USE_MPIR
+// Convert mpz_t integer to a string
+void convertMPZ(mpz_t x,std::string & out)
+{
+ char * s = mpz_get_str(0,16,x);
+ if (s == 0) return; // Failed
+ int n = (int)strlen(s);
+ out.clear();
+ out.reserve(n+n/8+8);
+ int digits = 0;
+ while ( (digits+n)&7 ) { out.push_back('0'); digits++; } // Initial 0 to get a multiple of 8
+ for (int i=0;i<n;i++)
+ {
+ if ( (digits&7) == 0 && i>0 ) out.push_back('.');
+ out.push_back(toupper(s[i])); digits++;
+ }
+ free(s);
+}
+#endif
+
+#endif // Conversions_h
diff --git a/MPBenchmarks/GPUTask.cpp b/MPBenchmarks/GPUTask.cpp
new file mode 100644
index 0000000..dbfa65c
--- /dev/null
+++ b/MPBenchmarks/GPUTask.cpp
@@ -0,0 +1,74 @@
+// GPU task base class
+// (c) EB Nov 2009
+
+#include "GPUTask.h"
+
+GPUTask::GPUTask(const char * programFile,Logger * log)
+{
+ cl::Context * c = 0;
+ cl::CommandQueue * q = 0;
+ cl::Program * p = 0;
+ bool ok = true;
+ std::string s;
+ std::vector<unsigned char> binary;
+
+ if (log == 0) { ok = false; goto END; }
+ c = cl::Context::create();
+ if (c == 0) { log->append("Context creation failed"); ok = false; goto END; }
+ q = c->createCommandQueue(0,0);
+ if (q == 0) { log->append("Command queue creation failed"); ok = false; goto END; }
+ if (programFile == 0) p = 0;
+ else
+ {
+#ifdef WIN32
+ // We run from the vs2008 solution directory, so we must
+ // change the path... (quick & dirty patch)
+ char programFileWin[1000];
+ _snprintf(programFileWin,1000,"../%s",programFile);
+ p = c->createProgramWithFile(programFileWin);
+#else
+ p = c->createProgramWithFile(programFile);
+#endif
+ if (p == 0) { log->append("Program creation failed"); ok = false; goto END; }
+ }
+
+END:
+ if (!ok)
+ {
+ if (p != 0) { delete p; p = 0; }
+ if (q != 0) { delete q; q = 0; }
+ if (c != 0) { delete c; c = 0; }
+ }
+ mProgram = p;
+ mQueue = q;
+ mContext = c;
+}
+
+GPUTask::~GPUTask()
+{
+ delete mProgram;
+ delete mQueue;
+ delete mContext;
+}
+
+bool GPUTask::buildProgram(const char * options,Logger * log)
+{
+ if (mProgram == 0) return false; // Invalid
+ if (options == 0 || log == 0) return false; // Invalid
+
+ std::string s;
+ bool ok = mProgram->build(options,s);
+ if (!ok) { log->append("Build failed, errors:"); log->append(s.c_str()); }
+#if 0 // Show PTX code (nvidia only)
+ ok &= p->getBinary(binary);
+ if (!ok) { log->append("Get binary failed"); goto END; }
+ log->append((char *)&(binary[0]));
+#endif
+ return ok;
+}
+
+cl::Kernel * GPUTask::createKernel(const char * name)
+{
+ if (mProgram == 0) return 0; // Invalid
+ return mProgram->createKernel(name);
+}
diff --git a/MPBenchmarks/GPUTask.h b/MPBenchmarks/GPUTask.h
new file mode 100644
index 0000000..d9f8155
--- /dev/null
+++ b/MPBenchmarks/GPUTask.h
@@ -0,0 +1,56 @@
+// GPU task base class
+// (c) EB Nov 2009
+
+#ifndef GPUTask_h
+#define GPUTask_h
+
+#include "Config.h"
+
+class GPUTask
+{
+public:
+
+ // Create a new context, with a command queue, and the given program.
+ // PROGRAMFILE may be 0 if no program is needed.
+ // Check if the class is valid after construction with isValid().
+ GPUTask(const char * programFile,Logger * log);
+
+ // Destructor. Release CL objects
+ virtual ~GPUTask();
+
+ // Check if the task is well-defined. May be redefined in derived classes
+ // to add more checks (call base class if redefined).
+ virtual bool isValid() { return (mContext != 0) && (mQueue != 0); }
+
+ // Functions to redefine in derived classes
+
+ // Run test producing SZ bytes of output.
+ // Return throughput in MB/s of generated output,
+ // or -1 on error.
+ virtual double run(int workgroupSize,size_t sz) = 0;
+
+protected:
+
+ // Build the program with OPTIONS. Error messages are put in LOG.
+ // Return TRUE if OK, FALSE on error.
+ bool buildProgram(const char * options,Logger * log);
+
+ // Create a kernel for the current program, unless undefined
+ cl::Kernel * createKernel(const char * name);
+
+ inline cl::Context * getContext() { return mContext; }
+ inline cl::CommandQueue * getQueue() { return mQueue; }
+ inline cl::Program * getProgram() { return mProgram; }
+
+private:
+
+ // No copy
+ GPUTask(const GPUTask &);
+ GPUTask & operator = (const GPUTask &);
+
+ cl::Context * mContext;
+ cl::CommandQueue * mQueue;
+ cl::Program * mProgram;
+};
+
+#endif // GPUTask_h
diff --git a/MPBenchmarks/LICENSE.txt b/MPBenchmarks/LICENSE.txt
new file mode 100644
index 0000000..f5fe116
--- /dev/null
+++ b/MPBenchmarks/LICENSE.txt
@@ -0,0 +1,27 @@
+This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Eric Bainville nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MPBenchmarks/Makefile b/MPBenchmarks/Makefile
new file mode 100644
index 0000000..ebd701b
--- /dev/null
+++ b/MPBenchmarks/Makefile
@@ -0,0 +1,72 @@
+# OpenCL multiprecision CPU+GPU benchmarks
+# EB Nov 2009
+
+# archive data
+CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M)
+CURRENT_DIR := $(notdir $(shell /bin/pwd))
+CURRENT_MACHINE := $(shell uname -m)
+
+# output directories
+OBJDIR := objs
+DEPDIR := deps
+ifeq ($(CURRENT_MACHINE),x86_64)
+LIBDIR := lib-64
+else
+LIBDIR := lib-32
+endif
+
+# targets
+OBJS = $(patsubst %,$(OBJDIR)/%.o, GPUTask MemoryTasks ArithmeticTasks appMain )
+DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS))
+
+# OpenCL SDK directory
+# OPENCL_DIR = /opt/cuda/include
+OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64
+# Bealto OpenCL library path
+BEALTO_OPENCL_DIR = ../BealtoOpenCL
+BEALTO_OPENCL_LIB = $(BEALTO_OPENCL_DIR)/$(LIBDIR)/libBealtoOpenCL.a
+
+# flags
+CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I/usr/include/qt4 -I$(OPENCL_DIR)/include -I$(BEALTO_OPENCL_DIR)/include
+LDFLAGS = -L/usr/lib/qt4 -L$(OPENCL_DIR)/lib/x86_64
+
+all: tocl
+
+tocl: $(OBJS) $(BEALTO_OPENCL_LIB)
+ g++ $(LDFLAGS) -o $@ $^ $(BEALTO_OPENCL_LIB) -lpthread -lOpenCL # -lQtCore -lQtGui
+
+$(BEALTO_OPENCL_LIB):
+ make -C $(BEALTO_OPENCL_DIR) lib
+
+t1: t1.o
+ g++ -o $@ $^
+
+dos2unix:
+ dos2unix *.cpp *.h *.cl Makefile
+
+clean:
+ /bin/rm -f *.o *~ tocl t1
+ /bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR)
+
+archive: clean
+ @echo "ARCHIVE $(CURRENT_DATE)"
+ tar czf "../MPBenchmarks-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR)
+
+##### Dependencies
+$(DEPDIR)/%.d: %.cpp
+ @[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR)
+ @/bin/echo -e "DEPS \033[32m$*\033[0m"
+ @$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $<
+
+##### Compilation
+$(OBJDIR)/%.o: %.cpp
+ @[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR)
+ @/bin/echo -e "C++ \033[34m$*\033[0m"
+ @$(CXX) $(CXXFLAGS) -c -o $@ $<
+
+##### Qt
+moc_%.cpp: %.h
+ @/bin/echo -e "MOC \033[34m$*\033[0m"
+ @moc -o $@ $<
+
+-include $(DEPS)
diff --git a/MPBenchmarks/MemoryTasks.cpp b/MPBenchmarks/MemoryTasks.cpp
new file mode 100644
index 0000000..a7780bc
--- /dev/null
+++ b/MPBenchmarks/MemoryTasks.cpp
@@ -0,0 +1,280 @@
+// Memory tasks
+// (c) EB Nov 2009
+
+#include "MemoryTasks.h"
+
+#include <string.h>
+
+
+// GPU
+
+double CopyGPUTask::run(int workgroupSize,size_t sz)
+{
+ cl::Context * c = getContext();
+ cl::CommandQueue * q = getQueue();
+ if (c == 0 || q == 0) return -1;
+
+ // Check allocation size
+ size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+ if (mCT == DEVICE_TO_DEVICE_COPY)
+ {
+ if (2*sz > max_sz) return -1;
+ }
+ if (sz > max_sz) return -1;
+
+ bool ok = true;
+ cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ if (a == 0) ok = false;
+ cl::Buffer * b = 0;
+ if (mCT == DEVICE_TO_DEVICE_COPY)
+ {
+ b = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ if (b == 0) ok = false;
+ }
+ unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+ if (buf == 0) ok = false;
+ double mbps = -1;
+
+ if (!ok) goto END; // Alloc failed
+
+ // Initialize A and check errors
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+ if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0;
+ if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END;
+ ok = true;
+
+ // check write+read loop
+ for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false;
+ if (!ok) { fprintf(stderr,"write+read failed\n"); goto END; }
+
+ // check write+copy+read loop
+ if (b != 0)
+ {
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+ if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0;
+ if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END;
+ if (!q->copyBuffer(a,b,0,0,sz).isValid()) goto END;
+ if (!q->readBuffer(b,true,0,sz,buf).isValid()) goto END;
+ for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false;
+ if (!ok) { fprintf(stderr,"write+copy+read failed\n"); goto END; }
+ }
+
+ // Run tests, double nOps until the min time is reached
+ for (int nOps = 5; ; nOps <<= 1)
+ {
+ double t0 = getT();
+ switch (mCT)
+ {
+ case HOST_TO_DEVICE_COPY:
+ for (int i=0;i<nOps;i++) q->writeBuffer(a,false,0,sz,buf);
+ break;
+ case DEVICE_TO_HOST_COPY:
+ for (int i=0;i<nOps;i++) q->readBuffer(a,false,0,sz,buf);
+ break;
+ case DEVICE_TO_DEVICE_COPY:
+ for (int i=0;i<nOps;i++) q->copyBuffer(a,b,0,0,sz);
+ break;
+ }
+ q->finish();
+ double t = (getT() - t0);
+ if (t < MIN_RUNNING_TIME) continue;
+ // OK, t is large enough
+ t /= (double)nOps;
+ mbps = (double)sz*1.0e-6/t; // MB/s
+ break;
+ }
+
+END:
+ if (buf != 0) _aligned_free(buf);
+ if (a != 0) delete a;
+ if (b != 0) delete b;
+
+ return mbps;
+}
+
+double ZeroGPUTask::run(int workgroupSize,size_t sz)
+{
+ cl::Context * c = getContext();
+ cl::CommandQueue * q = getQueue();
+ cl::Program * p = getProgram();
+ if (c == 0 || q == 0 || p == 0) return -1;
+
+ // Check allocation size
+ size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+ if (sz > max_sz) return -1;
+
+ cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+ unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+ cl::Kernel * kernel = p->createKernel("zero");
+ double mbps = -1;
+ int n = (int)sz / (mWS>>3); // N words in SZ
+ bool ok;
+ if (kernel == 0 || a == 0 || buf == 0) goto END;
+
+ // Initialize A
+ for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+ if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+
+ // Run tests, double nOps until min time is reached
+ kernel->setArg(0,a);
+ if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+
+ // check write+zero+read
+ if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END;
+ ok = true;
+ for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)0) ok = false;
+ if (!ok) { fprintf(stderr,"write+zero+read failed\n"); goto END; }
+
+ for (int nOps = 5; ; nOps <<= 1)
+ {
+ double t0 = getT();
+ for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+ q->finish();
+ double t = (getT() - t0);
+ if (t < MIN_RUNNING_TIME) continue;
+ // OK, t is large enough
+ t /= (double)nOps;
+ mbps = (double)sz*1.0e-6/t; // MB/s
+ break;
+ }
+
+END:
+ if (buf != 0) _aligned_free(buf);
+ if (a != 0) delete a;
+ if (kernel != 0) delete kernel;
+
+ return mbps;
+}
+
+// CPU
+
+struct CopyThreadParam
+{
+ int nOps; // number of loops
+ size_t sz; // size to process in thread
+ const unsigned char * in;
+ unsigned char * out;
+};
+
+thread_return_t CopyThread(void * x)
+{
+ CopyThreadParam * p = (CopyThreadParam *)x;
+#ifdef WIN32
+ for (int i=0;i<p->nOps;i++) CopyMemory(p->out,p->in,p->sz);
+#else
+ for (int i=0;i<p->nOps;i++) memcpy(p->out,p->in,p->sz);
+#endif
+ return 0;
+}
+
+double CopyCPUTask::run(int nThreads,size_t sz)
+{
+ if (nThreads <= 0 || sz <= 0) return -1; // invalid
+ size_t sz1 = sz / nThreads;
+ if (sz1 <= 0) return -1; // SZ too small
+
+ unsigned char * in = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ double mbps = -1;
+
+ std::vector<CopyThreadParam> params(nThreads);
+ for (int nOps=1; ;nOps<<=1)
+ {
+ // Initialize memory
+ for (size_t i=0;i<sz;i++)
+ {
+ in[i] = (unsigned char)(i & 0xFF);
+ out[i] = 0;
+ }
+ // setup params
+ for (int i=0;i<nThreads;i++)
+ {
+ params[i].in = in + i*sz1;
+ params[i].out = out + i*sz1;
+ params[i].sz = sz1;
+ params[i].nOps = nOps;
+ }
+ // run threads
+ double t = runCPUThreads(params,CopyThread);
+ // Check result
+ if (nOps == 1)
+ {
+ if (memcmp(in,out,sz) != 0) { fprintf(stderr,"CPU copy error\n"); mbps = -1; break; }
+ }
+
+ if (t < MIN_RUNNING_TIME) continue; // Too short
+ t /= (double)nOps;
+
+ mbps = (double)sz1*(double)nThreads*1.0e-6/t;
+ break;
+ }
+
+ _aligned_free(in);
+ _aligned_free(out);
+ return mbps;
+}
+
+struct ZeroThreadParam
+{
+ int nOps; // number of loops
+ size_t sz; // size to process in thread
+ unsigned char * out;
+};
+
+thread_return_t ZeroThread(void * x)
+{
+ CopyThreadParam * p = (CopyThreadParam *)x;
+#ifdef WIN32
+ for (int i=0;i<p->nOps;i++) ZeroMemory(p->out,p->sz);
+#else
+ for (int i=0;i<p->nOps;i++) memset(p->out,0,p->sz);
+#endif
+ return 0;
+}
+
+double ZeroCPUTask::run(int nThreads,size_t sz)
+{
+ if (nThreads <= 0 || sz <= 0) return -1; // invalid
+ size_t sz1 = sz / nThreads;
+ if (sz1 <= 0) return -1; // SZ too small
+
+ unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+ double mbps = -1;
+
+ std::vector<CopyThreadParam> params(nThreads);
+ for (int nOps=1; ;nOps<<=1)
+ {
+ // Initialize memory
+ for (size_t i=0;i<sz;i++)
+ {
+ out[i] = (unsigned char)(i & 0xFF);
+ }
+ // setup params
+ for (int i=0;i<nThreads;i++)
+ {
+ params[i].out = out + i*sz1;
+ params[i].sz = sz1;
+ params[i].nOps = nOps;
+ }
+ // run threads
+ double t = runCPUThreads(params,ZeroThread);
+ // Check result
+ if (nOps == 1)
+ {
+ bool ok = true;
+ for (size_t i=0;i<sz;i++) if ( out[i] != 0 ) { ok = false; break; }
+ if (!ok) { fprintf(stderr,"CPU zero error\n"); mbps = -1; break; }
+ }
+
+ if (t < MIN_RUNNING_TIME) continue; // Too short
+ t /= (double)nOps;
+
+ mbps = (double)sz1*(double)nThreads*1.0e-6/t;
+ break;
+ }
+
+ _aligned_free(out);
+ return mbps;
+}
diff --git a/MPBenchmarks/MemoryTasks.h b/MPBenchmarks/MemoryTasks.h
new file mode 100644
index 0000000..92e35f6
--- /dev/null
+++ b/MPBenchmarks/MemoryTasks.h
@@ -0,0 +1,83 @@
+// Memory tasks
+// (c) EB Nov 2009
+
+#ifndef MemoryTasks_h
+#define MemoryTasks_h
+
+#include "Config.h"
+#include "GPUTask.h"
+#include "CPUTask.h"
+
+// Copy two buffers (GPU)
+class CopyGPUTask : public GPUTask
+{
+
+public:
+
+ enum CopyType
+ {
+ HOST_TO_DEVICE_COPY,
+ DEVICE_TO_HOST_COPY,
+ DEVICE_TO_DEVICE_COPY
+ };
+
+ CopyGPUTask(int copyType,Logger * log) : GPUTask(0,log)
+ {
+ mCT = copyType;
+ }
+
+ ~CopyGPUTask()
+ {
+ }
+
+ double run(int workgroupSize,size_t sz);
+
+private:
+
+ // Copy type
+ int mCT;
+
+};
+
+// Zero one buffer (GPU)
+class ZeroGPUTask : public GPUTask
+{
+
+public:
+
+ ZeroGPUTask(int wordSize,Logger * log) : GPUTask("gpu_zero.cl",log)
+ {
+ mWS = wordSize;
+ char options[200];
+ _snprintf(options,200,"-DWORD_SIZE=%d",mWS);
+ buildProgram(options,log);
+ }
+
+ ~ZeroGPUTask()
+ {
+ }
+
+ double run(int workgroupSize,size_t sz);
+
+private:
+
+ // Word size
+ int mWS;
+
+};
+
+// CPU
+
+class CopyCPUTask : public CPUTask
+{
+public:
+ double run(int nThreads,size_t sz);
+};
+
+class ZeroCPUTask : public CPUTask
+{
+public:
+ double run(int nThreads,size_t sz);
+};
+
+#endif // MemoryTasks_h
diff --git a/MPBenchmarks/appMain.cpp b/MPBenchmarks/appMain.cpp
new file mode 100644
index 0000000..f6f82bf
--- /dev/null
+++ b/MPBenchmarks/appMain.cpp
@@ -0,0 +1,284 @@
+// (c) EB Nov 2009
+
+#include "Config.h"
+
+#include "MemoryTasks.h"
+#include "ArithmeticTasks.h"
+
+#if CONFIG_USE_QT
+#include <QtGui/QApplication>
+#include <QtGui/QMainWindow>
+#include <QtGui/QTextEdit>
+#include <QtGui/QClipboard>
+#endif
+
+struct TestResult
+{
+ std::string title;
+ std::vector<int> mbps; // MB/s by logSize
+ void update(int ls,int x) { if (x > mbps[ls]) mbps[ls] = x; } // keep max
+};
+
+void print(FILE * f,const TestResult & r,int minLogSize,int maxLogSize)
+{
+ fprintf(f,"%-25s",r.title.c_str());
+ for (int i=minLogSize;i<=maxLogSize;i++)
+ {
+ int x = r.mbps[i];
+ fprintf(f,"%8d",x);
+ }
+ fprintf(f,"\n");
+}
+
+void print(Logger * log,const TestResult & r,int minLogSize,int maxLogSize)
+{
+ char aux[200];
+ std::string s;
+ _snprintf(aux,200,"%-25s",r.title.c_str()); s.append(aux);
+ for (int i=minLogSize;i<=maxLogSize;i++)
+ {
+ int x = r.mbps[i];
+ _snprintf(aux,200,"%8d",x); s.append(aux);
+ }
+ log->append(s);
+}
+
+int main(int argc,char ** argv)
+{
+#if CONFIG_USE_QT
+ // Qt windows
+ QApplication app(argc,argv);
+ QMainWindow * w = new QMainWindow();
+ QTextEdit * te = new QTextEdit();
+ te->setReadOnly(true);
+ w->setCentralWidget(te);
+ w->setMinimumSize(QSize(800,600));
+ w->show();
+ app.processEvents();
+ Logger * log = new QtLogger(te);
+#else
+ // Command line
+ Logger * log = new Logger();
+#endif
+
+#if 1
+ // Display device info
+ cl::Context * c = cl::Context::create();
+ if (c != 0)
+ {
+ std::string s;
+ if (c->getAllDeviceInfo(0,s)) log->append(s.c_str());
+ delete c;
+ c = 0;
+ }
+#endif
+
+ int minLogSize = 10;
+ int maxLogSize = 29;
+ std::vector<TestResult> allTests;
+
+ // Run all CPU tests
+ for (int task=0;task<100;task++)
+ {
+ // Select tests here
+ // if (task != 30) continue;
+ // continue;
+
+ CPUTask * g = 0;
+ // Default values
+ std::string title;
+ int minNThreads = 1;
+ int maxNThreads = 32;
+
+ switch (task)
+ {
+ case 0:
+ title.assign("CPU copyN");
+ g = new CopyCPUTask();
+ break;
+ case 1:
+ title.assign("CPU zeroN");
+ g = new ZeroCPUTask();
+ break;
+
+ case 10:
+ title.assign("CPU addN v1 16-bit");
+ g = new AddNCPUTask(ADDN_V1,16);
+ break;
+ case 11:
+ title.assign("CPU addN v1 32-bit");
+ g = new AddNCPUTask(ADDN_V1,32);
+ break;
+ case 12:
+ title.assign("CPU addN v1 64-bit");
+ g = new AddNCPUTask(ADDN_V1,64);
+ break;
+
+ case 20:
+ title.assign("CPU addN v2 16-bit");
+ g = new AddNCPUTask(ADDN_V2,16);
+ break;
+ case 21:
+ title.assign("CPU addN v2 32-bit");
+ g = new AddNCPUTask(ADDN_V2,32);
+ break;
+ case 22:
+ title.assign("CPU addN v2 64-bit");
+ g = new AddNCPUTask(ADDN_V2,64);
+ break;
+
+#ifndef Linux
+ case 30:
+ title.assign("CPU addN v3 32-bit");
+ g = new AddNCPUTask(ADDN_V3,32);
+ break;
+#endif
+ }
+ if (g == 0) continue; // nothing to do
+
+ TestResult tr;
+ tr.title.assign(title);
+ tr.mbps.resize(1+maxLogSize,0);
+ log->appendTitle(title.c_str());
+ for (int nt=minNThreads;nt<=maxNThreads;nt<<=1)
+ for (int ls=minLogSize;ls<=maxLogSize;ls++)
+ {
+ size_t sz = 1<<ls;
+ double mbps = g->run(nt,sz);
+ char aux[200];
+ std::string usz;
+ getUserSize(ls,usz);
+ _snprintf(aux,200,"NT=%3d SZ=%7s %6.0f MB/s",nt,usz.c_str(),mbps);
+ log->append(aux);
+ tr.update(ls,mbps);
+ }
+
+ print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+ print(log,tr,minLogSize,maxLogSize);
+#endif
+ allTests.push_back(tr);
+ delete g;
+ }
+
+ // Run all GPU tests
+ for (int task=0;task<100;task++)
+ {
+ // Select tests here
+ //continue;
+ //if (task != 30) continue;
+
+ GPUTask * g = 0;
+ // Default values
+ std::string title;
+ int minWorkgroupSize = 16;
+ int maxWorkgroupSize = 512;
+
+ switch (task)
+ {
+ case 0:
+ title.assign("Host to device copy");
+ g = new CopyGPUTask(CopyGPUTask::HOST_TO_DEVICE_COPY,log);
+ minWorkgroupSize = maxWorkgroupSize;
+ break;
+ case 1:
+ title.assign("Device to host copy");
+ g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_HOST_COPY,log);
+ minWorkgroupSize = maxWorkgroupSize;
+ break;
+ case 2:
+ title.assign("copyN");
+ g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_DEVICE_COPY,log);
+ minWorkgroupSize = maxWorkgroupSize;
+ break;
+ case 3:
+ title.assign("zeroN 32-bit");
+ g = new ZeroGPUTask(32,log);
+ break;
+ case 4:
+ title.assign("zeroN 64-bit");
+ g = new ZeroGPUTask(64,log);
+ break;
+
+ case 10:
+ title.assign("AddN v1 16-bit");
+ g = new AddNGPUTask(ADDN_V1,16,log);
+ break;
+ case 11:
+ title.assign("AddN v1 32-bit");
+ g = new AddNGPUTask(ADDN_V1,32,log);
+ break;
+ case 12:
+ title.assign("AddN v1 64-bit");
+ g = new AddNGPUTask(ADDN_V1,64,log);
+ break;
+
+ case 20:
+ title.assign("AddN v2 16-bit");
+ g = new AddNGPUTask(ADDN_V2,16,log);
+ break;
+ case 21:
+ title.assign("AddN v2 32-bit");
+ g = new AddNGPUTask(ADDN_V2,32,log);
+ break;
+ case 22:
+ title.assign("AddN v2 64-bit");
+ g = new AddNGPUTask(ADDN_V2,64,log);
+ break;
+
+ case 40:
+ title.assign("Mul1 v1 32-bit");
+ g = new Mul1GPUTask(MUL1_V1,1,log);
+ break;
+
+ }
+ if (g == 0) continue; // nothing to do
+
+ TestResult tr;
+ tr.title.assign(title);
+ tr.mbps.resize(1+maxLogSize,0);
+ log->appendTitle(title.c_str());
+ for (int wg=minWorkgroupSize;wg<=maxWorkgroupSize;wg<<=1)
+ {
+ char aux[1000];
+ _snprintf(aux,1000,"%s - workgroup_size=%d",title.c_str(),wg);
+ log->append(aux);
+ for (int ls=minLogSize;ls<=maxLogSize;ls++)
+ {
+ size_t sz = 1<<ls;
+ double mbps = g->run(wg,sz);
+ std::string usz;
+ getUserSize(ls,usz);
+ _snprintf(aux,1000,"WG=%3d SZ=%7s %6.0f MB/s",wg,usz.c_str(),mbps);
+ log->append(aux);
+ tr.update(ls,mbps);
+ }
+ }
+
+ print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+ print(log,tr,minLogSize,maxLogSize);
+#endif
+ allTests.push_back(tr);
+ delete g;
+ }
+
+ // Dump all results
+ int nt = (int)allTests.size();
+ log->appendTitle("RESULTS");
+ printf("\n----\n");
+ for (int i=0;i<nt;i++)
+ {
+ TestResult & tr = allTests[i];
+ print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+ print(log,tr,minLogSize,maxLogSize);
+#endif
+ }
+
+#if CONFIG_USE_QT
+ // Enter event loop to terminate
+ log->appendTitle("DONE");
+ app.exec();
+#endif
+}
diff --git a/MPBenchmarks/gpu_add.cl b/MPBenchmarks/gpu_add.cl
new file mode 100644
index 0000000..7237395
--- /dev/null
+++ b/MPBenchmarks/gpu_add.cl
@@ -0,0 +1,86 @@
+// GPU addition
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+
+typedef long Word;
+const Word LOG_BASE = 62;
+const Word BASE = (Word)1<<(Word)62;
+const Word BASE_MINUS1 = (Word)1<<(Word)62 - 1;
+
+#elif (WORD_SIZE == 32)
+
+typedef int Word;
+const Word LOG_BASE = 30;
+const Word BASE = (Word)1<<(Word)30;
+const Word BASE_MINUS1 = (Word)1<<(Word)30 - 1;
+
+#elif (WORD_SIZE == 16)
+
+typedef short Word;
+const Word LOG_BASE = 14;
+const Word BASE = (Word)1<<(Word)14;
+const Word BASE_MINUS1 = (Word)1<<(Word)14 - 1;
+
+#endif
+
+__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z)
+{
+ int i = get_global_id(0);
+
+ // Carry from I-1
+ Word t = (Word)0;
+ if (i>0)
+ {
+ t = (x[i-1]+y[i-1]) >> LOG_BASE;
+ }
+
+ // Sum I
+ Word s = (x[i]+y[i]) & BASE_MINUS1;
+
+ // Final result
+ z[i] = t + s;
+}
+
+__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z)
+{
+ int i = get_global_id(0);
+ z[i] = x[i] + y[i];
+}
+
+#if 0
+__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z,
+ __local Word * sLoc)
+{
+ int i = get_global_id(0);
+ int ii = get_local_id(0);
+ sLoc[ii] = x[i] + y[i];
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Carry from I-1
+ Word t = (Word)0;
+ Word s = 0;
+#if 0
+ if (ii == 0)
+ {
+ s = (i>0)?(x[i-1] + y[i-1]):0;
+ }
+ else
+ {
+ // s = sLoc[ii-1];
+ }
+#endif
+ // s = sLoc[ii-1];
+ if (s >= MAX_WORD) t = 1;
+ else if (s <= -MAX_WORD) t = -1;
+
+ // Sum I
+ s = sLoc[ii-1];
+ if (s >= MAX_WORD) s -= BASE;
+ else if (s <= -MAX_WORD) s += BASE;
+
+ // Final result
+ z[i] = t + s;
+}
+#endif
diff --git a/MPBenchmarks/gpu_mp.cl b/MPBenchmarks/gpu_mp.cl
new file mode 100644
index 0000000..703f5fa
--- /dev/null
+++ b/MPBenchmarks/gpu_mp.cl
@@ -0,0 +1,153 @@
+// GPU multi-precision (old file)
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+
+typedef long Word;
+const Word BASE = (Word)1<<(Word)60;
+const Word MAX_WORD = (Word)1<<(Word)60 - (Word)1;
+
+#elif (WORD_SIZE == 32)
+
+typedef int Word;
+const Word BASE = (Word)1<<(Word)30;
+const Word MAX_WORD = (Word)1<<(Word)30 - (Word)1;
+
+#elif (WORD_SIZE == 16)
+
+typedef short Word;
+const Word BASE = (Word)1<<(Word)14;
+const Word MAX_WORD = (Word)1<<(Word)14 - (Word)1;
+
+#elif (WORD_SIZE == 128)
+
+typedef float4 Word;
+
+#endif
+
+__kernel void copy(__global Word * x,__global Word * z)
+{
+ int i = get_global_id(0);
+ z[i] = x[i];
+}
+
+__kernel void zero(__global Word * a)
+{
+ int i = get_global_id(0);
+ a[i] = 0.0f;
+}
+
+__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z)
+{
+ int i = get_global_id(0);
+
+ // Carry from I-1
+ Word t = (Word)0;
+ if (i>0)
+ {
+ Word s1 = x[i-1]+y[i-1];
+ if (s1 >= MAX_WORD) t = 1;
+ else if (s1 <= -MAX_WORD) t = -1;
+ }
+
+ // Sum I
+ Word s = x[i]+y[i];
+ if (s >= MAX_WORD) s -= BASE;
+ else if (s <= -MAX_WORD) s += BASE;
+
+ // Final result
+ z[i] = t + s;
+}
+
+__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z,
+ __local Word * sLoc)
+{
+ int i = get_global_id(0);
+ int ii = get_local_id(0);
+ sLoc[ii] = x[i] + y[i];
+
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Carry from I-1
+ Word t = (Word)0;
+ Word s = 0;
+#if 0
+ if (ii == 0)
+ {
+ s = (i>0)?(x[i-1] + y[i-1]):0;
+ }
+ else
+ {
+ // s = sLoc[ii-1];
+ }
+#endif
+ // s = sLoc[ii-1];
+ if (s >= MAX_WORD) t = 1;
+ else if (s <= -MAX_WORD) t = -1;
+
+ // Sum I
+ s = sLoc[ii-1];
+ if (s >= MAX_WORD) s -= BASE;
+ else if (s <= -MAX_WORD) s += BASE;
+
+ // Final result
+ z[i] = t + s;
+}
+
+__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z)
+{
+ int i = get_global_id(0);
+ z[i] = x[i] + y[i];
+}
+
+__kernel void propagate(__global const Word * x,__global Word * z)
+{
+ int i = get_global_id(0);
+ Word a = x[i];
+ Word b = 0;
+}
+
+// ______________________________________________________________________
+//
+
+#if (VECTOR_LENGTH == 1)
+typedef float vector_t;
+#elif (VECTOR_LENGTH == 2)
+typedef float2 vector_t;
+#elif (VECTOR_LENGTH == 4)
+typedef float4 vector_t;
+#endif
+
+__kernel void crunch(__global float * out)
+{
+ vector_t x,y,cs,sn,xx,yy;
+ x = 1.0f;
+ y = 0.0f;
+ cs = cos(2.0f); // random angle
+ sn = sin(2.0f);
+ for (int i=0;i<N_ROTATIONS;i++)
+ {
+ xx = x*cs - y*sn;
+ yy = y*cs + x*sn;
+ x = xx;
+ y = yy;
+ }
+ out[get_global_id(0)] = dot(x,y);
+}
+
+__kernel void crunch2(__global float * out)
+{
+ vector_t x,y,cs,sn,xx,yy;
+ x = 1.0f;
+ y = 0.0f;
+ cs = cos(2.0f); // random angle
+ sn = sin(2.0f);
+ for (int i=0;i<N_ROTATIONS;i++)
+ {
+ xx = mad(x,cs,-y*sn);
+ yy = mad(y,cs,x*sn);
+ x = xx;
+ y = yy;
+ }
+ out[get_global_id(0)] = dot(x,y);
+}
diff --git a/MPBenchmarks/gpu_mul1.cl b/MPBenchmarks/gpu_mul1.cl
new file mode 100644
index 0000000..be2875b
--- /dev/null
+++ b/MPBenchmarks/gpu_mul1.cl
@@ -0,0 +1,116 @@
+// GPU multi-precision
+// (c) EB Nov 2009
+
+__kernel void mul1_v0(int k,__global const int * x,__global int * z)
+{
+ int i = get_global_id(0);
+ z[i] = (int)(((long)k*(long)x[i]) & (long)BASE_MINUS1);
+}
+
+__kernel void mul1_v1(int k,__global const int * x,__global int * z)
+{
+ int i = get_global_id(0);
+ long z0 = ((long)k*(long)x[i]) & (long)BASE_MINUS1;
+ long z1 = (((long)k*(long)((i>0)?x[i-1]:0))>>LOG_BASE) & (long)BASE_MINUS1;
+ long z2 = (((long)k*(long)((i>1)?x[i-2]:0))>>(2*LOG_BASE)) & (long)BASE_MINUS1;
+ z[i] = (int)(z0+z1+z2);
+}
+
+#if 0
+__kernel void mul1_v2(int k,__global const int * x,__global int * z)
+{
+ __local long aux[WORKGROUP_SIZE+2];
+ int i = get_global_id(0); // in X
+ int ii = get_local_id(0); // in workgroup
+ long s;
+
+ // Load X[i]*K into AUX for all threads of the workgroup + 2
+ s = (long)x[i];
+ aux[ii+2] = s * (long)k;
+ if (ii < 2) // 2 work items load the previous 2 values
+ {
+ s = (long)((i>2)?x[i-2]:0);
+ aux[ii] = s * (long)k;
+ }
+ barrier(CLK_LOCAL_MEM_FENCE);
+
+ // Shift, mask and sum the 3 consecutive words in each cell
+ s = (aux[ii]>>(2*LOG_BASE)) & (long)BASE_MINUS1;
+ s += (aux[ii+1]>>(LOG_BASE)) & (long)BASE_MINUS1;
+ s += aux[ii+2] & (long)BASE_MINUS1;
+
+ // Store the result
+ z[i] = (int)s;
+}
+#endif
+
+#if 0
+__kernel void mul1_v3(int k,__global const int * x,__global int * z)
+{
+ // Get index of block to compute
+ int i = get_global_id(0) * BLOCK_SIZE;
+
+ // Load the previous two values
+ long p0,p1,p2;
+ if (i>0)
+ {
+ p2 = (long)x[i-2]*(long)k; p2 >>= (2*LOG_BASE);
+ p1 = (long)x[i-1]*(long)k; p1 >>= LOG_BASE;
+ }
+ else p2 = p1 = 0;
+
+ // Compute the block (sequentially)
+ for (int j=0;j<BLOCK_SIZE;j++)
+ {
+ // Load one value
+ p0 = (long)x[i]*(long)k;
+ // Store one result computed from the last 3 values
+ z[i] = (int)((p0&(long)BASE_MINUS1)+(p1&(long)BASE_MINUS1)+p2);
+ // Shift
+ i++;
+ p2 = p1 >> LOG_BASE;
+ p1 = p0 >> LOG_BASE;
+ }
+}
+#endif
+
+__kernel void mul1_v3bis(int k,__global const int * x,__global int * z)
+{
+ int i = get_global_id(0) << 1;
+ long m1,m2;
+ if (i>0)
+ {
+ m2 = ((long)k*(long)x[i-2]) >> (2*LOG_BASE);
+ m1 = ((long)k*(long)x[i-1]) >> LOG_BASE;
+ } else m1 = m2 = 0;
+ long x0 = (long)k*(long)x[i];
+ long x1 = (long)k*(long)x[i+1];
+ z[i] = (int)( ( x0 & (long)BASE_MINUS1 )
+ + ( m1 & (long)BASE_MINUS1 )
+ + ( m2 & (long)BASE_MINUS1 ) );
+ x0 >>= LOG_BASE;
+ m1 >>= LOG_BASE;
+ z[i+1] = (int)( ( x1 & (long)BASE_MINUS1 )
+ + ( x0 & (long)BASE_MINUS1 )
+ + ( m1 & (long)BASE_MINUS1 ) );
+}
+
+__kernel void mul1_v4a(int k,__global const int * x,__global int * z0,__global int * z1,__global int * z2)
+{
+ int i = get_global_id(0);
+ long u = (long)k*(long)x[i];
+ z0[i] = (int) (u & (long)BASE_MINUS1);
+ u >>= LOG_BASE;
+ z1[i] = (int) (u & (long)BASE_MINUS1);
+ u >>= LOG_BASE;
+ z2[i] = (int)u;
+}
+
+__kernel void mul1_v4b(__global const int * z0,__global const int * z1,__global const int * z2,__global int * z)
+{
+ int i = get_global_id(0);
+ int s = z0[i];
+ if (i>0) s += z1[i-1];
+ if (i>1) s += z2[i-2];
+ z[i] = s;
+}
diff --git a/MPBenchmarks/gpu_zero.cl b/MPBenchmarks/gpu_zero.cl
new file mode 100644
index 0000000..0f276e0
--- /dev/null
+++ b/MPBenchmarks/gpu_zero.cl
@@ -0,0 +1,18 @@
+// GPU memory 0
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+typedef long Word;
+#elif (WORD_SIZE == 32)
+typedef int Word;
+#elif (WORD_SIZE == 16)
+typedef short Word;
+#elif (WORD_SIZE == 128)
+typedef float4 Word;
+#endif
+
+__kernel void zero(__global Word * a)
+{
+ int i = get_global_id(0);
+ a[i] = 0.0f;
+}
diff --git a/MPBenchmarks/t1.cpp b/MPBenchmarks/t1.cpp
new file mode 100644
index 0000000..2398a1b
--- /dev/null
+++ b/MPBenchmarks/t1.cpp
@@ -0,0 +1,18 @@
+#include <stdio.h>
+
+typedef long long int int64;
+
+int main()
+{
+ const int64 Blog = 8;
+ const int64 B = (int64)1<<Blog;
+
+ for (int64 m=B;m<3*B ;m++)
+ {
+ int64 x = B-1+2*(m/B);
+ if (x>m) continue;
+
+ printf("Blog=%lld m=%llX\n",Blog,m);
+ // break;
+ }
+}
diff --git a/MPBenchmarks/vs2008/MPBenchmarks.sln b/MPBenchmarks/vs2008/MPBenchmarks.sln
new file mode 100644
index 0000000..a3d9660
--- /dev/null
+++ b/MPBenchmarks/vs2008/MPBenchmarks.sln
@@ -0,0 +1,39 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MPBenchmarks", "MPBenchmarks.vcproj", "{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+ ProjectSection(ProjectDependencies) = postProject
+ {86930221-164E-4E80-B8C0-50DCB5740B37} = {86930221-164E-4E80-B8C0-50DCB5740B37}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "..\..\BealtoOpenCL\vs2008\BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.Build.0 = Debug|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.ActiveCfg = Debug|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.Build.0 = Debug|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.ActiveCfg = Release|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.Build.0 = Release|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.ActiveCfg = Release|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.Build.0 = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/MPBenchmarks/vs2008/MPBenchmarks.vcproj b/MPBenchmarks/vs2008/MPBenchmarks.vcproj
new file mode 100644
index 0000000..5fe65c4
--- /dev/null
+++ b/MPBenchmarks/vs2008/MPBenchmarks.vcproj
@@ -0,0 +1,428 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9,00"
+ Name="MPBenchmarks"
+ ProjectGUID="{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+ RootNamespace="TestsOpenCL"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing DLL&apos;s"
+ CommandLine="copy &quot;$(QTDIR_BASE)-32\bin\QtCored4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-32\bin\QtGuid4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll&quot; $(OutDir) ;&#x0D;&#x0A;"
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug|x64"
+ OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing Qt DLL&apos;s"
+ CommandLine="copy &quot;$(QTDIR_BASE)-64\bin\QtCored4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-64\bin\QtGuid4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="OpenCL.lib qtmain.lib QtCore4.lib QtGui4.lib BealtoOpenCL.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing DLL&apos;s"
+ CommandLine="copy &quot;$(QTDIR_BASE)-32\bin\QtCore4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-32\bin\QtGui4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll&quot; $(OutDir) ;&#x0D;&#x0A;"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="OpenCL.lib QtCore4.lib QtGui4.lib qtmain.lib BealtoOpenCL.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing Qt DLL&apos;s"
+ CommandLine="copy &quot;$(QTDIR_BASE)-64\bin\QtCore4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-64\bin\QtGui4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\appMain.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\ArithmeticTasks.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\GPUTask.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\MemoryTasks.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\ArithmeticTasks.h"
+ >
+ </File>
+ <File
+ RelativePath="..\Config.h"
+ >
+ </File>
+ <File
+ RelativePath="..\Conversions.h"
+ >
+ </File>
+ <File
+ RelativePath="..\CPUTask.h"
+ >
+ </File>
+ <File
+ RelativePath="..\GPUTask.h"
+ >
+ </File>
+ <File
+ RelativePath="..\MemoryTasks.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="OpenCL Files"
+ >
+ <File
+ RelativePath="..\gpu_add.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_mp.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_mul1.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_zero.cl"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Data Files"
+ >
+ <File
+ RelativePath="..\LICENSE.txt"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>