Import MPBenchmark 20091214

Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
author: Zhenyu Wang <zhenyuw@linux.intel.com> 2014-10-16 14:10:11 +0800
committer: Zhenyu Wang <zhenyuw@linux.intel.com> 2014-10-16 14:10:11 +0800
commit: bf8d34506d36df0c2028b97c25abd61c8659d326 (patch)
tree: bf78890927d593c29144d6cf078a3828e286cd29
37 files changed, 4277 insertions, 0 deletions
diff --git a/BealtoOpenCL/LICENSE.txt b/BealtoOpenCL/LICENSE.txt
new file mode 100644
index 0000000..f5fe116
--- /dev/null
+++ b/BealtoOpenCL/LICENSE.txt
@@ -0,0 +1,27 @@
+This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Eric Bainville nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/BealtoOpenCL/Makefile b/BealtoOpenCL/Makefile
new file mode 100644
index 0000000..f8e725f
--- /dev/null
+++ b/BealtoOpenCL/Makefile
@@ -0,0 +1,68 @@
+# OpenCL multiprecision CPU+GPU benchmarks
+# EB Nov 2009
+
+# archive data
+CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M)
+CURRENT_DIR := $(notdir $(shell /bin/pwd))
+CURRENT_MACHINE := $(shell uname -m)
+
+# output directories
+OBJDIR := objs
+DEPDIR := deps
+ifeq ($(CURRENT_MACHINE),x86_64)
+LIBDIR := lib-64
+else
+LIBDIR := lib-32
+endif
+
+# targets
+OBJS = $(patsubst %,$(OBJDIR)/%.o, CLContext CLError CLProgram )
+DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS))
+
+LIB = $(LIBDIR)/libBealtoOpenCL.a
+
+VPATH = src
+
+# OpenCL SDK directory
+# OPENCL_DIR = /opt/cuda/include
+OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64
+
+# flags
+CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I./include -I$(OPENCL_DIR)/include
+LDFLAGS = -L$(OPENCL_DIR)/lib/x86_64
+
+lib: $(LIB)
+
+$(LIB): $(OBJS)
+	@[ -d $(LIBDIR) ] || mkdir -p $(LIBDIR)
+	ar rvs $(LIB) $(OBJS)
+
+dos2unix:
+	dos2unix Makefile src/*.cpp include/*.h
+
+clean:
+	/bin/rm -f *.o *~ $(LIB)
+	/bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR) lib-32 lib-64
+
+archive: clean
+	@echo "ARCHIVE $(CURRENT_DATE)"
+	tar czf "../BealtoOpenCL-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR)
+
+##### Dependencies
+$(DEPDIR)/%.d: %.cpp
+	@[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR)
+	@/bin/echo -e "DEPS \033[32m$*\033[0m"
+	@$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $<
+
+##### Compilation
+$(OBJDIR)/%.o: %.cpp
+	@[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR)
+	@/bin/echo -e "C++  \033[34m$*\033[0m"
+	@$(CXX) $(CXXFLAGS) -c -o $@ $<
+
+##### Qt
+moc_%.cpp: %.h
+	@/bin/echo -e "MOC  \033[34m$*\033[0m"
+	@moc -o $@ $<
+
+-include $(DEPS)
diff --git a/BealtoOpenCL/include/BealtoOpenCL.h b/BealtoOpenCL/include/BealtoOpenCL.h
new file mode 100644
index 0000000..b35e9f1
--- /dev/null
+++ b/BealtoOpenCL/include/BealtoOpenCL.h
@@ -0,0 +1,13 @@
+// Main CL C++ bindings include file
+// (c) EB Oct 2009
+
+#pragma once
+
+#include "CLError.h"
+#include "CLContext.h"
+#include "CLCommandQueue.h"
+#include "CLBuffer.h"
+#include "CLEvent.h"
+#include "CLEventList.h"
+#include "CLProgram.h"
+#include "CLKernel.h"
diff --git a/BealtoOpenCL/include/CLBuffer.h b/BealtoOpenCL/include/CLBuffer.h
new file mode 100644
index 0000000..be0f640
--- /dev/null
+++ b/BealtoOpenCL/include/CLBuffer.h
@@ -0,0 +1,34 @@
+// OpenCL buffer object
+// (c) EB Sep 2009
+
+#ifndef CLBuffer_h
+#define CLBuffer_h
+
+#include <CL/cl.h>
+#include "CLMemoryObject.h"
+#include "CLError.h"
+
+namespace cl {
+
+class Buffer : public MemoryObject
+{
+public:
+
+  // Instances of this class are created from a Context.
+
+  // Destructor
+  virtual ~Buffer() { }
+
+private:
+
+  Buffer(); // not implemented
+  Buffer(cl_mem x) : MemoryObject(x) { }
+
+  friend class Context;
+  friend class CommandQueue;
+  friend class Kernel;
+};
+
+} // namespace
+
+#endif // CLBuffer_h
diff --git a/BealtoOpenCL/include/CLCommandQueue.h b/BealtoOpenCL/include/CLCommandQueue.h
new file mode 100644
index 0000000..6fde44c
--- /dev/null
+++ b/BealtoOpenCL/include/CLCommandQueue.h
@@ -0,0 +1,229 @@
+// OpenCL command queue object
+// (c) EB Sep 2009
+
+#ifndef CLCommandQueue_h
+#define CLCommandQueue_h
+
+#include <CL/cl.h>
+#include "CLBuffer.h"
+#include "CLEvent.h"
+#include "CLEventList.h"
+#include "CLKernel.h"
+
+namespace cl {
+
+class CommandQueue
+{
+public:
+
+  // Instances of this class are created from a Context.
+
+  // Copy constructor
+  CommandQueue(const CommandQueue & a) : mX(a.mX)
+  { clRetainCommandQueue(mX); }
+
+  // = operator
+  CommandQueue & operator = (const CommandQueue & a)
+  {
+    if (a.mX != mX)
+    {
+      clReleaseCommandQueue(mX);
+      mX = a.mX;
+      clRetainCommandQueue(mX);
+    }
+    return *this;
+  }
+
+  // Destructor
+  virtual ~CommandQueue()
+  { clReleaseCommandQueue(mX); }
+
+  // Commands
+  // All commands return an Event instance.
+  // If the call fails, the returned event is invalid.
+
+  // Read buffer to host memory
+  Event readBuffer(Buffer * b,cl_bool blocking_read,size_t offset,size_t cb,void * ptr,const EventList & wait_list = EventList())
+  {
+    if (b == 0) return Event(0); // Invalid
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    cl_int status = clEnqueueReadBuffer(mX,b->mX,blocking_read,offset,cb,ptr,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Write buffer from host memory
+  Event writeBuffer(Buffer * b,cl_bool blocking_write,size_t offset,size_t cb,const void * ptr,const EventList & wait_list = EventList())
+  {
+    if (b == 0) return Event(0); // Invalid
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    cl_int status = clEnqueueWriteBuffer(mX,b->mX,blocking_write,offset,cb,ptr,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Copy buffers
+  Event copyBuffer(Buffer * src,Buffer * dst,size_t src_offset,size_t dst_offset,size_t cb,const EventList & wait_list = EventList())
+  {
+    if (src == 0 || dst == 0) return Event(0); // Invalid
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    cl_int status = clEnqueueCopyBuffer(mX,src->mX,dst->mX,src_offset,dst_offset,cb,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Map buffer.  The mapped address is put in ADDRESS.
+  template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,size_t offset,size_t cb,const EventList & wait_list = EventList())
+  {
+    if (b == 0) return Event(0);
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    cl_int status;
+    address = (T *)clEnqueueMapBuffer(mX,b->mX,blocking_map,map_flags,offset,cb,num_events,events,&e,&status);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Map full buffer.  Queries its size, and maps all its contents.  The mapped address is put in ADDRESS.
+  template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,const EventList & wait_list = EventList())
+  {
+    if (b == 0) return Event(0); // Invalid buffer
+    size_t sz = b->getSize();
+    if (sz == 0) return Event(0); // Invalid size
+    return mapBuffer<T>(b,address,blocking_map,map_flags,0,sz,wait_list);
+  }
+
+  // Unmap memory object. (Buffer, Image2D, etc.)
+  Event unmapMemoryObject(MemoryObject * m,void * address,const EventList & wait_list = EventList())
+  {
+    if (m == 0) return Event(0);
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    cl_int status = clEnqueueUnmapMemObject(mX,m->mX,address,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Enqueue 1D kernel execution
+  // K is the kernel to run.
+  // N is the total number of work items (global work size).
+  // G is the number of work items inside a work group. G must divide N.
+  // G can be 0, in which case the OpenCL implementation will choose the best value.
+  Event execKernel1(Kernel * k,size_t n,size_t g,const EventList & wait_list = EventList())
+  {
+    if (k == 0) return Event(0); // Invalid
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    size_t * pgw = &n;
+    size_t * plw = (g>0)?(&g):0;
+    cl_int status = clEnqueueNDRangeKernel(mX,k->mX,1,0,pgw,plw,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Enqueue 2D kernel execution
+  // K is the kernel to run.
+  // NX*NY is the total number of work items (global work size).
+  // GX*GY is the number of work items inside a work group. G<d> must divide N<d>.
+  Event execKernel2(Kernel * k,size_t nx,size_t ny,size_t gx,size_t gy,const EventList & wait_list = EventList())
+  {
+    if (k == 0) return Event(0); // Invalid
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    cl_event e = 0;
+    size_t pgw[2]; pgw[0] = nx; pgw[1] = ny;
+    size_t plw[2]; plw[0] = gx; plw[1] = gy;
+    cl_int status = clEnqueueNDRangeKernel(mX,k->mX,2,0,pgw,plw,num_events,events,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  //
+  // Execution control
+  //
+
+  // Enqueue a marker
+  Event mark()
+  {
+    cl_event e = 0;
+    cl_int status = clEnqueueMarker(mX,&e);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) e = 0;
+    return Event(e);
+  }
+
+  // Insert a wait point for a specific list of events
+  bool wait(const EventList & wait_list)
+  {
+    cl_uint num_events = 0;
+    const cl_event * events = 0;
+    wait_list.getParams(num_events,events);
+    if (num_events == 0) return true; // Nop
+    cl_int status = clEnqueueWaitForEvents(mX,num_events,events);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+  // Insert a wait point for all events. (barrier)
+  bool wait()
+  {
+    cl_int status = clEnqueueBarrier(mX);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+  // Blocks until all queued commands have been submitted to the device.
+  bool flush()
+  {
+    cl_int status = clFlush(mX);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+  // Blocks until all queued commands have been executed.
+  bool finish()
+  {
+    cl_int status = clFinish(mX);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+private:
+
+  // Private constructors
+  CommandQueue(); // not implemented
+  CommandQueue(cl_command_queue x) : mX(x) { }
+
+  // OpenCL handle (always non 0)
+  cl_command_queue mX;
+
+  friend class Context;
+
+}; // class CommandQueue
+
+} // namespace
+
+#endif // CLCommandQueue_h
diff --git a/BealtoOpenCL/include/CLContext.h b/BealtoOpenCL/include/CLContext.h
new file mode 100644
index 0000000..2f8fcd4
--- /dev/null
+++ b/BealtoOpenCL/include/CLContext.h
@@ -0,0 +1,149 @@
+// OpenCL context
+// (c) EB Sep 2009
+
+#ifndef CLContext_h
+#define CLContext_h
+
+#include <CL/cl.h>
+#include <vector>
+#include <string>
+
+namespace cl {
+
+class Buffer;
+class Image2D;
+class CommandQueue;
+class Program;
+
+class Context
+{
+public:
+
+  // Copy constructor
+  Context(const Context & a);
+
+  // = operator
+  Context & operator = (const Context & a);
+
+  // Destructor
+  virtual ~Context();
+
+  // Create default context (including all devices with the given type).
+  // Return a new instance if OK, and 0 otherwise.
+  static Context * create(cl_device_type deviceType = CL_DEVICE_TYPE_GPU);
+
+  // Get number of associated devices
+  int getNDevices() const;
+
+  // Create a new command queue object.
+  // D is the index of the device ine the context (0..NDevices-1)
+  // See clCreateCommandQueue for the arguments.
+  // Return a new instance if OK, and 0 otherwise.
+  CommandQueue * createCommandQueue(int d,cl_command_queue_properties properties);
+
+  // Create a new buffer object.
+  // See clCreateBuffer for the arguments.
+  // Return a new instance if OK, and 0 otherwise.
+  Buffer * createBuffer(cl_mem_flags flags,size_t size,void * host_ptr = 0);
+
+  // Create a new image2D object.
+  // See clCreateImage2D for the arguments.
+  // Return a new instance if OK, and 0 otherwise.
+  Image2D * createImage2D(cl_mem_flags flags,const cl_image_format * image_format,
+    size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0);
+
+  // Create a R / RG / RGBA image2D object.
+  // channel_type is passed in a cl_image_format structure.
+  Image2D * createRImage2D(cl_mem_flags flags,cl_channel_type data_type,
+    size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+  {
+    cl_image_format f;
+    f.image_channel_order = CL_R;
+    f.image_channel_data_type = data_type;
+    return createImage2D(flags,&f,width,height,pitch,host_ptr);
+  }
+  Image2D * createRGImage2D(cl_mem_flags flags,cl_channel_type data_type,
+    size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+  {
+    cl_image_format f;
+    f.image_channel_order = CL_RG;
+    f.image_channel_data_type = data_type;
+    return createImage2D(flags,&f,width,height,pitch,host_ptr);
+  }
+  Image2D * createRGBAImage2D(cl_mem_flags flags,cl_channel_type data_type,
+    size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0)
+  {
+    cl_image_format f;
+    f.image_channel_order = CL_RGBA;
+    f.image_channel_data_type = data_type;
+    return createImage2D(flags,&f,width,height,pitch,host_ptr);
+  }
+
+  // Create a new program from source strings.
+  // See clCreateProgramWithSource for the arguments.
+  // Return a new instance if OK, and 0 otherwise.
+  Program * createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths = 0);
+
+  // Create a new program from source files.
+  // See clCreateProgramWithSource for the arguments.
+  // Return a new instance if OK, and 0 otherwise.
+  Program * createProgramWithFiles(cl_uint count,const char ** filenames);
+  Program * createProgramWithFile(const char * filename)
+  { return createProgramWithFiles(1,&filename); }
+  Program * createProgramWithFiles(const std::vector<std::string> & filenames);
+
+  // Get device info for device D (0..NDevices-1).
+  // See clGetDeviceInfo for the arguments.
+  bool getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret);
+
+  // Template version suitable for scalar types
+  template <class T> bool getDeviceInfo(int d,cl_device_info param_name,T & x)
+  { return getDeviceInfo(d,param_name,sizeof(T),&x,0); }
+
+  // Get all info for device D.
+  // INFO receives the result (one attribute per line), cleared by the call.
+  bool getAllDeviceInfo(int d,std::string & info);
+
+  // Specific info entries.
+  // D is the device index in context, 0..NDevices-1.
+  size_t getDeviceMaxMemAllocSize(int d = 0)
+  {
+    cl_ulong x;
+    if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,x)) return 0; // Failed
+    return (size_t)x;
+  }
+  bool getDeviceImageSupport(int d = 0)
+  {
+    cl_bool x;
+    if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,x)) return false; // Failed
+    return (x)?true:false;
+  }
+  size_t getDeviceImage2DMaxWidth(int d = 0)
+  {
+    size_t x;
+    if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,x)) return 0; // Failed
+    return x;
+  }
+  size_t getDeviceImage2DMaxHeight(int d = 0)
+  {
+    size_t x;
+    if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,x)) return 0; // Failed
+    return x;
+  }
+
+private:
+
+  // This constructor is private, use create() to instanciate this class
+  Context();
+
+  // OpenCL handle (always non 0)
+  cl_context mX;
+
+  // Devices associated with this context
+  std::vector<cl_device_id> mDevices;
+
+}; // class Context
+
+} // namespace
+
+#endif // CLContext_h
diff --git a/BealtoOpenCL/include/CLError.h b/BealtoOpenCL/include/CLError.h
new file mode 100644
index 0000000..d929695
--- /dev/null
+++ b/BealtoOpenCL/include/CLError.h
@@ -0,0 +1,19 @@
+// OpenCL error reporting
+// (c) EB Sep 2009
+
+#ifndef CLError_h
+#define CLError_h
+
+namespace cl {
+
+// Report an OpenCL call status.
+// FILE,LINE,FUNCTION shall be the source file/line/name of the function where the call failed.
+// STATUS is the returned value (CL_SUCCESS,...).
+void reportStatus(const char * file,int line,const char * function,int status);
+
+// Use this macro to report errors.
+#define REPORT_OPENCL_STATUS(status) { cl::reportStatus(__FILE__,__LINE__,__FUNCTION__,status); }
+
+} // namespace
+
+#endif // CLError_h
diff --git a/BealtoOpenCL/include/CLEvent.h b/BealtoOpenCL/include/CLEvent.h
new file mode 100644
index 0000000..b7ac9b4
--- /dev/null
+++ b/BealtoOpenCL/include/CLEvent.h
@@ -0,0 +1,67 @@
+// OpenCL event object
+// (c) EB Sep 2009
+
+#ifndef CLEvent_h
+#define CLEvent_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+
+namespace cl {
+
+class Event
+{
+public:
+
+  // Instances of this class are created by calls in a CommandQueue.
+
+  // Copy constructor
+  Event(const Event & a) : mX(a.mX)
+  { if (mX != 0) clRetainEvent(mX); }
+
+  // = operator
+  Event & operator = (const Event & a)
+  {
+    if (a.mX != mX)
+    {
+      if (mX != 0) clReleaseEvent(mX);
+      mX = a.mX;
+      if (mX != 0) clRetainEvent(mX);
+    }
+    return *this;
+  }
+
+  // Destructor
+  virtual ~Event()
+  { if (mX != 0) clReleaseEvent(mX); }
+
+  // Check if the event is valid
+  bool isValid() const { return (mX != 0); }
+
+  // Wait for this event to complete.
+  // Return TRUE if OK, and FALSE otherwise.
+  // If the event is invalid, return TRUE.
+  bool wait()
+  {
+    if (mX == 0) return true; // OK, event is invalid
+    cl_int status = clWaitForEvents(1,&mX);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+private:
+
+  Event(); // not implemented
+  Event(cl_event x) : mX(x) { }
+
+  // OpenCL handle (MAY BE 0)
+  cl_event mX;
+
+  friend class CommandQueue;
+  friend class EventList;
+
+}; // class Event
+
+} // namespace
+
+#endif // CLEvent_h
diff --git a/BealtoOpenCL/include/CLEventList.h b/BealtoOpenCL/include/CLEventList.h
new file mode 100644
index 0000000..a841d15
--- /dev/null
+++ b/BealtoOpenCL/include/CLEventList.h
@@ -0,0 +1,96 @@
+// OpenCL event list
+// (c) EB Sep 2009
+
+#ifndef CLEventList_h
+#define CLEventList_h
+
+#include <CL/cl.h>
+#include <vector>
+#include "CLEvent.h"
+
+namespace cl {
+
+class EventList
+{
+public:
+
+  // Constructor from a list of events
+  EventList() { }
+  EventList(const Event & e1) { insert(e1); }
+  EventList(const Event & e1,const Event & e2) { insert(e1); insert(e2); }
+  EventList(const Event & e1,const Event & e2,const Event & e3) { insert(e1); insert(e2); insert(e3); }
+
+  // Copy constructor
+  EventList(const EventList & e)
+  {
+    for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it);
+  }
+
+  // = operator
+  EventList & operator = (const EventList & e)
+  {
+    if (&e == this) return *this;
+    clear();
+    for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it);
+    return *this;
+  }
+
+  // Destructor
+  ~EventList() { clear(); }
+
+  // Clear the list
+  void clear()
+  {
+    for (event_list_t::iterator it = mX.begin(); it != mX.end(); it++) clReleaseEvent(*it);
+    mX.clear();
+  }
+
+  // Insert an event to the list.  Ignore if E is invalid.
+  void insert(const Event & e)
+  {
+    cl_event x = e.mX;
+    if (x == 0) return;
+    clRetainEvent(x);
+    mX.push_back(x);
+  }
+
+  // Get the number of events in the list
+  int size() const { return (int)mX.size(); }
+
+  // Is the list empty?
+  bool empty() const { return mX.empty(); }
+
+  // Wait for all the events to complete.
+  // Return TRUE if OK, and FALSE otherwise.
+  // If the list is empty, return TRUE.
+  bool wait()
+  {
+    if (mX.empty()) return true; // OK, empty
+    cl_uint num_events;
+    const cl_event * events;
+    getParams(num_events,events);
+    cl_int status = clWaitForEvents(num_events,events);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+  // Get CL parameters for the calls
+  void getParams(cl_uint & num_events,const cl_event * & events) const
+  {
+    num_events = (cl_uint)mX.size();
+    if (num_events == 0) events = 0;
+    else events = &(mX[0]);
+  }
+
+private:
+
+  typedef std::vector<cl_event> event_list_t;
+
+  // OpenCL handles
+  event_list_t mX;
+
+}; // class EventList
+
+} // namespace
+
+#endif // CLEventList_h
diff --git a/BealtoOpenCL/include/CLImage2D.h b/BealtoOpenCL/include/CLImage2D.h
new file mode 100644
index 0000000..baf996f
--- /dev/null
+++ b/BealtoOpenCL/include/CLImage2D.h
@@ -0,0 +1,35 @@
+// OpenCL image2D object
+// (c) EB Nov 2009
+
+#ifndef CLImage2D_h
+#define CLImage2D_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+#include "CLMemoryObject.h"
+
+namespace cl {
+
+  class Image2D : public MemoryObject
+{
+public:
+
+  // Instances of this class are created from a Context.
+
+  // Destructor
+  virtual ~Image2D() { }
+
+private:
+
+  Image2D(); // not implemented
+  Image2D(cl_mem x) : MemoryObject(x) { }
+
+  friend class Context;
+  friend class CommandQueue;
+  friend class Kernel;
+
+}; // class Image2D
+
+} // namespace
+
+#endif // CLImage2D_h
diff --git a/BealtoOpenCL/include/CLKernel.h b/BealtoOpenCL/include/CLKernel.h
new file mode 100644
index 0000000..7ded751
--- /dev/null
+++ b/BealtoOpenCL/include/CLKernel.h
@@ -0,0 +1,78 @@
+// OpenCL kernel object
+// (c) EB Sep 2009
+
+#ifndef CLKernel_h
+#define CLKernel_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+#include "CLBuffer.h"
+
+namespace cl {
+
+class Kernel
+{
+public:
+
+  // Instances of this class are created from a Program.
+
+  // Copy constructor
+  Kernel(const Kernel & a) : mX(a.mX)
+  { clRetainKernel(mX); }
+
+  // = operator
+  Kernel & operator = (const Kernel & a)
+  {
+    if (a.mX != mX)
+    {
+      clReleaseKernel(mX);
+      mX = a.mX;
+      clRetainKernel(mX);
+    }
+    return *this;
+  }
+
+  // Destructor
+  virtual ~Kernel()
+  { clReleaseKernel(mX); }
+
+  // Set kernel argument
+  // See clSetKernelArg for arguments.
+  // Return TRUE if OK, and FALSE otherwise.
+  bool setArg(cl_uint arg_index,size_t arg_size,const void * arg_value)
+  {
+    cl_int status = clSetKernelArg(mX,arg_index,arg_size,arg_value);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+  // Set kernel argument (Buffer)
+  // Return TRUE if OK, and FALSE otherwise.
+  bool setArg(cl_uint arg_index,Buffer * b)
+  {
+    if (b == 0) return false; // Invalid
+    cl_mem m = b->mX;
+    return setArg(arg_index,sizeof(m),(const void *)&m);
+  }
+  // Set kernel argument (other types)
+  // Return TRUE if OK, and FALSE otherwise.
+  bool setArg(cl_int arg_index,cl_int x)
+  { return setArg(arg_index,sizeof(x),(const void *)&x); }
+  bool setArg(cl_int arg_index,cl_float x)
+  { return setArg(arg_index,sizeof(x),(const void *)&x); }
+
+private:
+
+  Kernel(); // not implemented
+  Kernel(cl_kernel x) : mX(x) { }
+
+  // OpenCL handle (always non 0)
+  cl_kernel mX;
+
+  friend class Program;
+  friend class CommandQueue;
+
+}; // class Kernel
+
+} // namespace
+
+#endif // CLKernel_h
diff --git a/BealtoOpenCL/include/CLMemoryObject.h b/BealtoOpenCL/include/CLMemoryObject.h
new file mode 100644
index 0000000..433bbdc
--- /dev/null
+++ b/BealtoOpenCL/include/CLMemoryObject.h
@@ -0,0 +1,77 @@
+// OpenCL memory object
+// (c) EB Dec 2009
+
+#ifndef CLMemoryObject_h
+#define CLMemoryObject_h
+
+#include <CL/cl.h>
+#include "CLError.h"
+
+namespace cl {
+
+class MemoryObject
+{
+public:
+
+  // Instances of this class are created from a Context.
+
+  // Copy constructor
+  MemoryObject(const MemoryObject & a) : mX(a.mX)
+  { clRetainMemObject(mX); }
+
+  // = operator
+  MemoryObject & operator = (const MemoryObject & a)
+  {
+    if (a.mX != mX)
+    {
+      clReleaseMemObject(mX);
+      mX = a.mX;
+      clRetainMemObject(mX);
+    }
+    return *this;
+  }
+
+  // Destructor
+  virtual ~MemoryObject()
+  { clReleaseMemObject(mX); }
+
+  // Get memory object info.
+  // See clGetMemObjectInfo for the arguments.
+  bool getMemObjectInfo(cl_mem_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret)
+  {
+    cl_int status = clGetMemObjectInfo(mX,param_name,param_value_size,param_value,param_value_size_ret);
+    REPORT_OPENCL_STATUS(status);
+    return (status == CL_SUCCESS);
+  }
+
+  // Template version suitable for scalar types
+  template <class T> bool getMemObjectInfo(cl_mem_info param_name,T & x)
+  { return getMemObjectInfo(param_name,sizeof(T),&x,0); }
+
+  // Specific info entries
+  size_t getSize()
+  {
+    size_t x;
+    bool ok = getMemObjectInfo(CL_MEM_SIZE,x);
+    return (ok)?x:0;
+  }
+
+private:
+
+  MemoryObject(); // not implemented
+  MemoryObject(cl_mem x) : mX(x) { }
+
+  // OpenCL handle (always non 0)
+  cl_mem mX;
+
+  friend class Context;
+  friend class CommandQueue;
+  friend class Kernel;
+  friend class Buffer;
+  friend class Image2D;
+
+}; // class MemoryObject
+
+} // namespace
+
+#endif // CLMemoryObject_h
diff --git a/BealtoOpenCL/include/CLProgram.h b/BealtoOpenCL/include/CLProgram.h
new file mode 100644
index 0000000..07a85a7
--- /dev/null
+++ b/BealtoOpenCL/include/CLProgram.h
@@ -0,0 +1,74 @@
+// OpenCL program object
+// (c) EB Sep 2009
+
+#ifndef CLProgram_h
+#define CLProgram_h
+
+#include <CL/cl.h>
+#include <string>
+#include <vector>
+#include "CLError.h"
+#include "CLKernel.h"
+
+namespace cl {
+
+class Program
+{
+public:
+
+  // Instances of this class are created from a Context.
+
+  // Copy constructor
+  Program(const Program & a) : mX(a.mX)
+  { clRetainProgram(mX); }
+
+  // = operator
+  Program & operator = (const Program & a)
+  {
+    if (a.mX != mX)
+    {
+      clReleaseProgram(mX);
+      mX = a.mX;
+      clRetainProgram(mX);
+    }
+    return *this;
+  }
+
+  // Destructor
+  virtual ~Program()
+  { clReleaseProgram(mX); }
+
+  // Build the program (for all devices, and blocking)
+  // See clBuildProgram for the arguments.
+  // Return TRUE if OK, and FALSE otherwise.
+  bool build(const char * options,std::string & buildErrors);
+
+  // Get program binary for first device
+  bool getBinary(std::vector<unsigned char> & binary);
+
+  // Get a kernel for this program
+  // Return a new instance if OK, and 0 otherwise.
+  Kernel * createKernel(const char * kernel_name)
+  {
+    cl_int status;
+    cl_kernel k = clCreateKernel(mX,kernel_name,&status);
+    REPORT_OPENCL_STATUS(status);
+    if (k == 0) return 0; // Failed
+    return new Kernel(k);
+  }
+
+private:
+
+  Program(); // not implemented
+  Program(cl_program x) : mX(x) { }
+
+  // OpenCL handle (always non 0)
+  cl_program mX;
+
+  friend class Context;
+
+}; // class Buffer
+
+} // namespace
+
+#endif // CLProgram_h
diff --git a/BealtoOpenCL/src/CLContext.cpp b/BealtoOpenCL/src/CLContext.cpp
new file mode 100644
index 0000000..7ca7848
--- /dev/null
+++ b/BealtoOpenCL/src/CLContext.cpp
@@ -0,0 +1,351 @@
+// OpenCL context
+// (c) EB Sep 2009
+
+#define _CRT_SECURE_NO_WARNINGS
+#include <sys/stat.h>
+#include <vector>
+#include <sstream>
+#include <stdlib.h>
+#include "CLError.h"
+#include "CLContext.h"
+#include "CLCommandQueue.h"
+#include "CLBuffer.h"
+#include "CLImage2D.h"
+#include "CLProgram.h"
+
+#ifdef Linux
+#define _stat stat
+#endif
+
+using namespace cl;
+
+Context::Context() : mX(0)
+{
+}
+
+Context::Context(const Context & a) : mX(a.mX)
+{
+  clRetainContext(mX);
+}
+
+Context & Context::operator = (const Context & a)
+{
+  if (a.mX != mX)
+  {
+    clReleaseContext(mX);
+    mX = a.mX;
+    clRetainContext(mX);
+  }
+  return *this;
+}
+
+Context::~Context()
+{
+  clReleaseContext(mX);
+}
+
+Context * Context::create(cl_device_type deviceType)
+{
+  cl_platform_id bestPlatform = 0;
+  std::vector<cl_device_id> devices;
+  int status;
+
+  // Platforms
+  cl_uint nPlatforms = 0;
+  std::vector<cl_platform_id> pIDs;
+  status = clGetPlatformIDs(0,0,&nPlatforms);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS || nPlatforms == 0) return 0; // Failed
+  pIDs.resize(nPlatforms,0);
+  status = clGetPlatformIDs(nPlatforms,&(pIDs[0]),&nPlatforms);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS) return 0; // Failed
+
+  // printf("NPlatforms: %u\n",nPlatforms);
+
+  // Devices for each platform
+  for (unsigned int i=0;i<nPlatforms;i++)
+  {
+    cl_platform_id p = pIDs[i];
+
+    // Get all GPU devices for this platform
+    cl_uint nDevices = 0;
+    status = clGetDeviceIDs(p,deviceType,0,0,&nDevices);
+    // printf("NDevices: %u\n",nDevices);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS || nDevices == 0) continue;
+    devices.resize(nDevices,0);
+    status = clGetDeviceIDs(p,deviceType,nDevices,&(devices[0]),&nDevices);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) { devices.clear(); continue; }
+
+    bestPlatform = p;
+    break; // OK
+  }
+  if (devices.empty())
+    {
+      printf("No device found\n");
+      return 0;
+    }
+
+  cl_context_properties cp[4];
+  cp[0] = CL_CONTEXT_PLATFORM;
+  cp[1] = (cl_context_properties)bestPlatform;
+  cp[2] = cp[3] = 0;
+  cl_context c = clCreateContext(cp,(int)devices.size(),&(devices[0]),0,0,0);
+  if (c == 0) return 0;
+
+  Context * result = new Context();
+  result->mX = c;
+  result->mDevices = devices;
+
+  return result;
+}
+
+int Context::getNDevices() const
+{
+  return (int)mDevices.size();
+}
+
+CommandQueue * Context::createCommandQueue(int d,cl_command_queue_properties properties)
+{
+  cl_int status;
+  if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D
+  cl_command_queue q = clCreateCommandQueue(mX,mDevices[d],properties,&status);
+  REPORT_OPENCL_STATUS(status);
+  if (q == 0) return 0; // Failed
+  return new CommandQueue(q);
+}
+
+Buffer * Context::createBuffer(cl_mem_flags flags,size_t size,void * host_ptr)
+{
+  cl_int status;
+  cl_mem m = clCreateBuffer(mX,flags,size,host_ptr,&status);
+  REPORT_OPENCL_STATUS(status);
+  if (m == 0) return 0; // Failed
+  return new Buffer(m);
+}
+
+Image2D * Context::createImage2D(cl_mem_flags flags,const cl_image_format * image_format,
+    size_t width,size_t height,size_t pitch,void * host_ptr)
+{
+  cl_int status;
+  cl_mem m = clCreateImage2D(mX,flags,image_format,width,height,pitch,host_ptr,&status);
+  REPORT_OPENCL_STATUS(status);
+  if (m == 0) return 0; // Failed
+  return new Image2D(m);
+}
+
+Program * Context::createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths)
+{
+  cl_int status;
+  cl_program p = clCreateProgramWithSource(mX,count,strings,lengths,&status);
+  REPORT_OPENCL_STATUS(status);
+  if (p == 0) return 0; // Failed
+  return new Program(p);
+}
+
+Program * Context::createProgramWithFiles(cl_uint count,const char ** filenames)
+{
+  if (count == 0 || filenames == 0) return 0; // Invalid
+
+  std::vector<char *> strings;
+  std::vector<size_t> lengths;
+  strings.resize(count,0);
+  lengths.resize(count,0);
+  bool ok = true;
+  for (unsigned int i=0;i<count;i++)
+  {
+    // Get file size and check it exists
+    struct _stat s;
+    int status = ::_stat(filenames[i],&s);
+    if (status != 0) { ok = false; break; }
+    size_t sz = s.st_size;
+    if (sz == 0) { ok = false; break; }
+
+    // Read the file
+    FILE * f = fopen(filenames[i],"rb");
+    if (f == 0) { ok = false; break; }
+    char * buffer = (char *)malloc(sz);
+    strings[i] = buffer;
+    lengths[i] = sz;
+
+    size_t readBlocks = fread(buffer,sz,1,f);
+    fclose(f);
+    if (readBlocks != 1) { ok = false; break; } // read failed
+  }
+  Program * result = 0;
+  if (ok) result = createProgramWithSource(count,(const char **)&(strings[0]),&(lengths[0]));
+
+  // Cleanup
+  for (unsigned int i=0;i<count;i++) if (strings[i] != 0)
+  { free(strings[i]); }
+
+  return result;
+}
+
+Program * Context::createProgramWithFiles(const std::vector<std::string> & filenames)
+{
+  cl_uint n = (int)filenames.size();
+  if (n == 0) return 0; // Empty
+  std::vector<const char *> files(n,0);
+  for (cl_uint i=0;i<n;i++)
+  {
+    if (filenames[i].empty()) continue; // 0 if empty
+    files[i] = filenames[i].c_str();
+  }
+  return createProgramWithFiles(n,&(files[0]));
+}
+
+bool Context::getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret)
+{
+  if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D
+  cl_int status = clGetDeviceInfo(mDevices[d],param_name,param_value_size,param_value,param_value_size_ret);
+  REPORT_OPENCL_STATUS(status);
+  return (status == CL_SUCCESS);
+}
+
+bool Context::getAllDeviceInfo(int d,std::string & info)
+{
+  cl_uint ui;
+  cl_ulong ul;
+  cl_bool b;
+  info.clear();
+  std::ostringstream o;
+  o << "Device info (ID=0x" << mDevices[d] << ")\n";
+
+  cl_device_type dType;
+  if (!getDeviceInfo(d,CL_DEVICE_TYPE,dType)) return false;
+  o << "- type:";
+  if (dType & CL_DEVICE_TYPE_CPU) o << " cpu";
+  if (dType & CL_DEVICE_TYPE_GPU) o << " gpu";
+  if (dType & CL_DEVICE_TYPE_ACCELERATOR) o << " accelerator";
+  o << "\n";
+
+  if (!getDeviceInfo(d,CL_DEVICE_VENDOR_ID,ui)) return false;
+  o << "- vendor id: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_COMPUTE_UNITS,ui)) return false;
+  o << "- max compute units: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,ui)) return false;
+  o << "- max work item dimensions: " << ui << "\n";
+  size_t sz[24];
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_SIZES,24*sizeof(size_t),sz,0)) return false;
+  o << "- max work item sizes:";
+  for (cl_uint i=0;i<ui;i++) o << " " << sz[i];
+  o << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_GROUP_SIZE,sz[0])) return false;
+  o << "- max work group size: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,ui)) return false;
+  o << "- preferred vector width char: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,ui)) return false;
+  o << "- preferred vector width short: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,ui)) return false;
+  o << "- preferred vector width int: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,ui)) return false;
+  o << "- preferred vector width long: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,ui)) return false;
+  o << "- preferred vector width float: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,ui)) return false;
+  o << "- preferred vector width double: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_CLOCK_FREQUENCY,ui)) return false;
+  o << "- max clock frequency: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_ADDRESS_BITS,ui)) return false;
+  o << "- address bits: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,ul)) return false;
+  o << "- max mem alloc size: " << ul << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,b)) return false;
+  o << "- image support: " << (b?"yes":"no") << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_READ_IMAGE_ARGS,ui)) return false;
+  o << "- max read image args: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_WRITE_IMAGE_ARGS,ui)) return false;
+  o << "- max write image args: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,sz[0])) return false;
+  o << "- image2d max width: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,sz[0])) return false;
+  o << "- image2d max height: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_WIDTH,sz[0])) return false;
+  o << "- image3d max width: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_HEIGHT,sz[0])) return false;
+  o << "- image3d max height: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_DEPTH,sz[0])) return false;
+  o << "- image3d max depth: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_SAMPLERS,ui)) return false;
+  o << "- max samplers: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_PARAMETER_SIZE,sz[0])) return false;
+  o << "- max parameter size: " << sz[0] << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MEM_BASE_ADDR_ALIGN,ui)) return false;
+  o << "- mem base addr align: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,ui)) return false;
+  o << "- min data type align size: " << ui << "\n";
+
+  cl_device_fp_config fpc;
+  if (!getDeviceInfo(d,CL_DEVICE_SINGLE_FP_CONFIG,fpc)) return false;
+  o << "- single fp config:";
+  if (fpc & CL_FP_DENORM) o << " denorm";
+  if (fpc & CL_FP_INF_NAN) o << " inf_nan";
+  if (fpc & CL_FP_ROUND_TO_NEAREST) o << " round_to_nearest";
+  if (fpc & CL_FP_ROUND_TO_ZERO) o << " round_to_zero";
+  if (fpc & CL_FP_ROUND_TO_INF) o << " round_to_inf";
+  if (fpc & CL_FP_FMA) o << " fma";
+  o << "\n";
+
+  cl_device_mem_cache_type mct;
+  if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,mct)) return false;
+  o << "- global mem cache type:";
+  if (mct == CL_NONE) o << " none";
+  else if (mct == CL_READ_ONLY_CACHE) o << " read_only_cache";
+  else if (mct == CL_READ_WRITE_CACHE) o << " read_write_cache";
+  o << "\n";
+
+  if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,ui)) return false;
+  o << "- global mem cacheline size: " << ui << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,ul)) return false;
+  o << "- global mem cache size: " << ul << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_SIZE,ul)) return false;
+  o << "- global mem size: " << ul << "\n";
+
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,ul)) return false;
+  o << "- max constant buffer size: " << ul << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_ARGS,ui)) return false;
+  o << "- max constant args: " << ui << "\n";
+
+  cl_device_local_mem_type lmt;
+  if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_TYPE,lmt)) return false;
+  o << "- local mem type:";
+  if (lmt == CL_LOCAL) o << " local";
+  else o << " global";
+  o << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_SIZE,ul)) return false;
+  o << "- local mem size: " << ul << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_ERROR_CORRECTION_SUPPORT,b)) return false;
+  o << "- error correction support: " << (b?"yes":"no") << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_PROFILING_TIMER_RESOLUTION,sz[0])) return false;
+  o << "- profiling timer resolution: " << sz[0] << " ns\n";
+  if (!getDeviceInfo(d,CL_DEVICE_ENDIAN_LITTLE,b)) return false;
+  o << "- endian little: " << (b?"yes":"no") << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_AVAILABLE,b)) return false;
+  o << "- available: " << (b?"yes":"no") << "\n";
+  if (!getDeviceInfo(d,CL_DEVICE_COMPILER_AVAILABLE,b)) return false;
+  o << "- compiler available: " << (b?"yes":"no") << "\n";
+
+  cl_device_exec_capabilities ec;
+  if (!getDeviceInfo(d,CL_DEVICE_EXECUTION_CAPABILITIES,ec)) return false;
+  o << "- execution capabilities:";
+  if (ec & CL_EXEC_KERNEL) o << " exec_kernel";
+  if (ec & CL_EXEC_NATIVE_KERNEL) o << " exec_native_kernel";
+  o << "\n";
+
+  cl_command_queue_properties qp;
+  if (!getDeviceInfo(d,CL_DEVICE_QUEUE_PROPERTIES,qp)) return false;
+  o << "- queue properties:";
+  if (ec & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) o << " out_of_order_exec_mode";
+  if (ec & CL_QUEUE_PROFILING_ENABLE) o << " profiling";
+  o << "\n";
+
+  char exts[8192];
+  if (!getDeviceInfo(d,CL_DEVICE_EXTENSIONS,8192,exts,0)) return false;
+  o << "- extensions: " << exts << "\n";
+
+  info.assign(o.str());
+  return true;
+}
diff --git a/BealtoOpenCL/src/CLError.cpp b/BealtoOpenCL/src/CLError.cpp
new file mode 100644
index 0000000..1fcd12b
--- /dev/null
+++ b/BealtoOpenCL/src/CLError.cpp
@@ -0,0 +1,82 @@
+// OpenCL error reporting
+// (c) EB Sep 2009
+
+#if CONFIG_USE_QT
+#include <QtCore/QDebug>
+#include <QtCore/QString>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <CL/cl.h>
+#include "CLError.h"
+
+const int NErrorCodes = 63;
+static const char * ErrorCodes[NErrorCodes] = {
+"CL_SUCCESS", // 0
+"CL_DEVICE_NOT_FOUND", // -1
+"CL_DEVICE_NOT_AVAILABLE", // -2
+"CL_COMPILER_NOT_AVAILABLE", // -3
+"CL_MEM_OBJECT_ALLOCATION_FAILURE", // -4
+"CL_OUT_OF_RESOURCES", // -5
+"CL_OUT_OF_HOST_MEMORY", // -6
+"CL_PROFILING_INFO_NOT_AVAILABLE", // -7
+"CL_MEM_COPY_OVERLAP", // -8
+"CL_IMAGE_FORMAT_MISMATCH", // -9
+"CL_IMAGE_FORMAT_NOT_SUPPORTED", // -10
+"CL_BUILD_PROGRAM_FAILURE", // -11
+"CL_MAP_FAILURE", // -12
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // -13..-29
+"CL_INVALID_VALUE", // -30
+"CL_INVALID_DEVICE_TYPE", // -31
+"CL_INVALID_PLATFORM", // -32
+"CL_INVALID_DEVICE", // -33
+"CL_INVALID_CONTEXT", // -34
+"CL_INVALID_QUEUE_PROPERTIES", // -35
+"CL_INVALID_COMMAND_QUEUE", // -36
+"CL_INVALID_HOST_PTR", // -37
+"CL_INVALID_MEM_OBJECT", // -38
+"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", // -39
+"CL_INVALID_IMAGE_SIZE", // -40
+"CL_INVALID_SAMPLER", // -41
+"CL_INVALID_BINARY", // -42
+"CL_INVALID_BUILD_OPTIONS", // -43
+"CL_INVALID_PROGRAM", // -44
+"CL_INVALID_PROGRAM_EXECUTABLE", // -45
+"CL_INVALID_KERNEL_NAME", // -46
+"CL_INVALID_KERNEL_DEFINITION", // -47
+"CL_INVALID_KERNEL", // -48
+"CL_INVALID_ARG_INDEX", // -49
+"CL_INVALID_ARG_VALUE", // -50
+"CL_INVALID_ARG_SIZE", // -51
+"CL_INVALID_KERNEL_ARGS", // -52
+"CL_INVALID_WORK_DIMENSION", // -53
+"CL_INVALID_WORK_GROUP_SIZE", // -54
+"CL_INVALID_WORK_ITEM_SIZE", // -55
+"CL_INVALID_GLOBAL_OFFSET", // -56
+"CL_INVALID_EVENT_WAIT_LIST", // -57
+"CL_INVALID_EVENT", // -58
+"CL_INVALID_OPERATION", // -59
+"CL_INVALID_GL_OBJECT", // -60
+"CL_INVALID_BUFFER_SIZE", // -61
+"CL_INVALID_MIP_LEVEL" // -62
+};
+
+void cl::reportStatus(const char * file,int line,const char * function,int status)
+{
+  if (status >= 0) return;
+  status = -status;
+#if CONFIG_USE_QT
+  QString e;
+  if (status >= NErrorCodes || ErrorCodes[status] == 0) e = QString("%1").arg(status);
+  else e = ErrorCodes[status];
+  qDebug() << QString("OpenCL ERROR %1(%2),%3: %4").arg(file).arg(line).arg(function).arg(e);
+#else
+  const char * e = 0;
+  if (status >= NErrorCodes || ErrorCodes[status] == 0) e = "???";
+  else e = ErrorCodes[status];
+  printf("Error %s(%d),%s,%s\n",file,line,function,e);
+#endif
+#ifdef Linux
+  // exit(1);
+#endif
+}
diff --git a/BealtoOpenCL/src/CLProgram.cpp b/BealtoOpenCL/src/CLProgram.cpp
new file mode 100644
index 0000000..731c385
--- /dev/null
+++ b/BealtoOpenCL/src/CLProgram.cpp
@@ -0,0 +1,84 @@
+// OpenCL program object
+// (c) EB Sep 2009
+
+#include <stdlib.h>
+#include <string.h>
+#include "CLProgram.h"
+
+using namespace cl;
+
+bool Program::build(const char * options,std::string & buildErrors)
+{
+  buildErrors.clear();
+  cl_int status;
+
+  // Get devices
+  cl_uint nDevices = 0;
+  status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS || nDevices == 0) { buildErrors.append("clGetProgramInfo NUM_DEVICES failed\n"); return false; }
+  std::vector<cl_device_id> devices;
+  devices.resize(nDevices,0);
+  status = clGetProgramInfo(mX,CL_PROGRAM_DEVICES,sizeof(cl_device_id)*nDevices,&(devices[0]),0);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS) { buildErrors.append("clGetProgramInfo DEVICES failed\n"); return false; }
+
+  // Build program
+  status = clBuildProgram(mX,nDevices,&(devices[0]),options,0,0);
+  REPORT_OPENCL_STATUS(status);
+  if (status == CL_SUCCESS) return true; // OK
+
+  // Get build info for each device
+  for (unsigned int i=0;i<nDevices;i++)
+  {
+    cl_device_id d = devices[i];
+    char aux[4000];
+    status = clGetProgramBuildInfo(mX,d,CL_PROGRAM_BUILD_LOG,4000,aux,0);
+    REPORT_OPENCL_STATUS(status);
+    if (status != CL_SUCCESS) { buildErrors.append("clGetProgramBuildInfo failed\n"); continue; }
+    buildErrors.append(aux);
+  }
+
+  buildErrors.append("Build failed\n");
+  return false;
+}
+
+// Get program binary for first device
+bool Program::getBinary(std::vector<unsigned char> & binary)
+{
+  cl_int status;
+  binary.clear();
+
+  // Get devices
+  cl_uint nDevices = 0;
+  status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS || nDevices == 0) return false;
+  // Get binary sizes for all devices
+  std::vector<size_t> binarySize(nDevices,0);
+  status = clGetProgramInfo(mX,CL_PROGRAM_BINARY_SIZES,nDevices*sizeof(size_t),&(binarySize[0]),0);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS) return false;
+  // Alloc buffers
+  std::vector<unsigned char *> buf(nDevices,0);
+  for (cl_uint i=0;i<nDevices;i++)
+  {
+    if (binarySize[i]>0) buf[i] = (unsigned char *)malloc(binarySize[i]);
+  }
+  status = clGetProgramInfo(mX,CL_PROGRAM_BINARIES,nDevices*sizeof(unsigned char *),&(buf[0]),0);
+  REPORT_OPENCL_STATUS(status);
+  if (status != CL_SUCCESS) return false;
+  // Copy result for first device
+  if (binarySize[0]>0)
+  {
+    binary.resize(binarySize[0],0);
+    memcpy(&(binary[0]),buf[0],binarySize[0]);
+  }
+  // Free buffers
+  for (cl_uint i=0;i<nDevices;i++)
+  {
+    if (buf[i] != 0) free(buf[i]);
+  }
+  return true; // OK
+}
+
diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.sln b/BealtoOpenCL/vs2008/BealtoOpenCL.sln
new file mode 100644
index 0000000..8ccd961
--- /dev/null
+++ b/BealtoOpenCL/vs2008/BealtoOpenCL.sln
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj
new file mode 100644
index 0000000..ce9faa9
--- /dev/null
+++ b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj
@@ -0,0 +1,369 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9,00"
+	Name="BealtoOpenCL"
+	ProjectGUID="{86930221-164E-4E80-B8C0-50DCB5740B37}"
+	RootNamespace="BealtoOpenCL"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+			IntermediateDirectory="$(ConfigurationName)-32"
+			ConfigurationType="4"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				AdditionalDependencies="OpenCL.lib"
+				OutputFile="$(ProjectDir)..\lib-32\$(ProjectName)d.lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+			IntermediateDirectory="$(ConfigurationName)-64"
+			ConfigurationType="4"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_LIB;CONFIG_USE_QT=1"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				EnableEnhancedInstructionSet="0"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				AdditionalDependencies="OpenCL.lib"
+				OutputFile="$(ProjectDir)..\lib-64\$(ProjectName)d.lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+			IntermediateDirectory="$(ConfigurationName)-32"
+			ConfigurationType="4"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="..\include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				AdditionalDependencies="OpenCL.lib"
+				OutputFile="$(ProjectDir)..\lib-32\$(ProjectName).lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+			IntermediateDirectory="$(ConfigurationName)-64"
+			ConfigurationType="4"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="..\include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_LIB;CONFIG_USE_QT=1"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				EnableEnhancedInstructionSet="0"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLibrarianTool"
+				AdditionalDependencies="OpenCL.lib"
+				OutputFile="$(ProjectDir)..\lib-64\$(ProjectName).lib"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\src\CLContext.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\src\CLError.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\src\CLProgram.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\include\BealtoOpenCL.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLBuffer.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLCommandQueue.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLContext.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLError.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLEvent.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLEventList.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLImage2D.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLKernel.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLMemoryObject.h"
+				>
+			</File>
+			<File
+				RelativePath="..\include\CLProgram.h"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/MPBenchmarks/ArithmeticTasks.cpp b/MPBenchmarks/ArithmeticTasks.cpp
new file mode 100644
index 0000000..670f1d6
--- /dev/null
+++ b/MPBenchmarks/ArithmeticTasks.cpp
@@ -0,0 +1,294 @@
+// Arithmetic tasks
+// (c) EB Nov 2009
+
+#include <stdlib.h>
+#include "ArithmeticTasks.h"
+
+#ifdef Linux
+typedef short __int16;
+typedef int __int32;
+typedef long long int __int64;
+#endif
+
+// Get the ratio of user bits per stored bits for a given word size
+// (constants defined in OpenCL code)
+inline double useRatio(int ws)
+{
+  switch (ws)
+  {
+  case 16: return 14.0/16.0;
+  case 32: return 30.0/32.0;
+  case 64: return 62.0/64.0;
+  }
+  return 1.0;
+}
+
+double AddNGPUTask::run(int workgroupSize,size_t sz)
+{
+  if (!mBuildOK) return -1;
+
+  cl::Context * c = getContext();
+  cl::CommandQueue * q = getQueue();
+  cl::Program * p = getProgram();
+  if (c == 0 || q == 0 || p == 0) return -1;
+
+  // Check allocation size
+  size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+  if (3*sz > max_sz) return -1;
+
+  cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  cl::Buffer * b = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  cl::Kernel * kernel = 0;
+  if (mVariant == ADDN_V1) kernel = p->createKernel("add_v1");
+  else if (mVariant == ADDN_V2) kernel = p->createKernel("add_v2");
+  unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+  int n = (int)sz / (mWS>>3); // N words in SZ
+  double mbps = -1;
+  if (kernel == 0 || a == 0 || b == 0 || y == 0 || buf == 0) goto END;
+
+  // Initialize A and B
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+  if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+  if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END;
+
+  // Run tests, double nOps until min time is reached
+  kernel->setArg(0,a);
+  kernel->setArg(1,b);
+  kernel->setArg(2,y);
+  if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+  for (int nOps = 1; ; nOps <<= 1)
+  {
+    double t0 = getT();
+    for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+    q->finish();
+    double t = (getT() - t0);
+    if (t < MIN_RUNNING_TIME) continue;
+    // OK, t is large enough
+    t /= (double)nOps;
+    mbps = (double)sz*useRatio(mWS)*1.0e-6/t; // MB/s
+    break;
+  }
+
+END: // Cleanup
+  if (buf != 0) _aligned_free(buf);
+  if (a != 0) delete a;
+  if (b != 0) delete b;
+  if (y != 0) delete y;
+  if (kernel != 0) delete kernel;
+
+  return mbps;
+}
+
+double Mul1GPUTask::run(int workgroupSize,size_t sz)
+{
+  if (!mBuildOK) return -1;
+
+  cl::Context * c = getContext();
+  cl::CommandQueue * q = getQueue();
+  cl::Program * p = getProgram();
+  if (c == 0 || q == 0 || p == 0) return -1;
+
+  // Check allocation size
+  size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+  if (2*sz > max_sz) return -1;
+
+  cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  cl::Kernel * kernel = 0;
+  if (mVariant == MUL1_V1) kernel = p->createKernel("mul1_v1");
+
+  unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+  int n = (int)sz>>2; // 4 bytes/word
+  const int kk = 0x2FEFEFEF;
+  double mbps = -1;
+  if (kernel == 0 || a == 0 || y == 0 || buf == 0) goto END;
+
+  // Initialize A
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF);
+  if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+
+  // Run tests, double nOps until min time is reached
+  kernel->setArg(0,kk);
+  kernel->setArg(1,a);
+  kernel->setArg(2,y);
+  if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+  for (int nOps = 1; ; nOps <<= 1)
+  {
+    double t0 = getT();
+    for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+    q->finish();
+    double t = (getT() - t0);
+    if (t < MIN_RUNNING_TIME) continue;
+    // OK, t is large enough
+    t /= (double)nOps;
+    mbps = (double)sz*useRatio(32)*1.0e-6/t; // MB/s
+    break;
+  }
+
+END: // Cleanup
+  if (buf != 0) _aligned_free(buf);
+  if (a != 0) delete a;
+  if (y != 0) delete y;
+  if (kernel != 0) delete kernel;
+
+  return mbps;
+}
+
+// CPU
+
+struct AddNThreadParam
+{
+  int nOps; // number of loops
+  int index; // Block index
+  size_t sz; // size to process in thread
+  int wordSize; // 32 or 64
+  int variant; // ADD_V1, ADD_V2
+  const unsigned char * a;
+  const unsigned char * b;
+  unsigned char * out;
+};
+
+template <typename T,int LOG_BASE> void addNv1(AddNThreadParam * p)
+{
+  size_t n = p->sz / sizeof(T); // Words in block
+  const T * a = (const T *)(p->a);
+  const T * b = (const T *)(p->b);
+  T * out = (T *)(p->out);
+  T BASE = (T)1<<(T)LOG_BASE;
+  T BASE_MINUS1 = BASE - (T)1;
+
+  for (int it=0;it<p->nOps;it++)
+  {
+    T t = 0; // "carry" from previous word
+    if (p->index > 0)
+    {
+      // Get "carry" for last values of previous block
+      t = (a[-1] + b[-1]) >> (T)LOG_BASE;
+    }
+
+    for (size_t i=0;i<n;i++)
+    {
+      T s = a[i] + b[i];
+      out[i] = (s & BASE_MINUS1) + t;
+      t = s >> (T)LOG_BASE;
+    }
+  }
+}
+
+template <typename T,int LOG_BASE> void addNv2(AddNThreadParam * p)
+{
+  size_t n = p->sz / sizeof(T); // Words in block
+  const T * a = (const T *)(p->a);
+  const T * b = (const T *)(p->b);
+  T * out = (T *)(p->out);
+
+  for (int it=0;it<p->nOps;it++)
+  {
+    for (size_t i=0;i<n;i++)
+    {
+      out[i] = a[i] + b[i];
+    }
+  }
+}
+
+#ifndef Linux // Uses Win32 intrinsics
+void addNv3(AddNThreadParam * p)
+{
+  const int LOG_BASE = 30;
+  const int BASE = 1<<LOG_BASE;
+  const int BASE_MINUS1 = BASE - 1;
+
+  size_t n = p->sz >> 4; // We process blocks of 16 bytes
+  const __m128i * a = (const __m128i *)(p->a);
+  const __m128i * b = (const __m128i *)(p->b);
+  __m128i * out = (__m128i *)(p->out);
+
+  __m128i t;
+  __m128i mask = _mm_set1_epi32(BASE_MINUS1);
+  for (int it=0;it<p->nOps;it++)
+  {
+    // TODO: initialize T with the carry from the end of previous block
+    t = _mm_setzero_si128();
+    for (size_t i=0;i<n;i++)
+    {
+      __m128i s = _mm_add_epi32(_mm_load_si128(a+i),_mm_load_si128(b+i));
+      __m128i o = _mm_add_epi32(_mm_and_si128(s,mask),t);
+      _mm_store_si128(out+i,o);
+      t = _mm_srli_epi32(s,LOG_BASE);
+    }
+  }
+}
+#endif
+
+thread_return_t AddNThread(void * x)
+{
+  AddNThreadParam * p = (AddNThreadParam *)x;
+
+  if (p->wordSize == 16 && p->variant == ADDN_V1) addNv1<__int16,14>(p);
+  else if (p->wordSize == 32 && p->variant == ADDN_V1) addNv1<__int32,30>(p);
+  else if (p->wordSize == 64 && p->variant == ADDN_V1) addNv1<__int64,62>(p);
+  else if (p->wordSize == 16 && p->variant == ADDN_V2) addNv2<__int16,14>(p);
+  else if (p->wordSize == 32 && p->variant == ADDN_V2) addNv2<__int32,30>(p);
+  else if (p->wordSize == 64 && p->variant == ADDN_V2) addNv2<__int64,62>(p);
+#ifndef Linux
+  else if (p->wordSize == 32 && p->variant == ADDN_V3) addNv3(p);
+#endif
+
+  return 0;
+}
+
+double AddNCPUTask::run(int nThreads,size_t sz)
+{
+  if (nThreads <= 0 || sz <= 0) return -1; // invalid
+  size_t sz1 = sz / nThreads;
+  if (sz1 <= 0) return -1; // SZ too small
+
+  unsigned char * in_a = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  unsigned char * in_b = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  double mbps = -1;
+
+  std::vector<AddNThreadParam> params(nThreads);
+  for (int nOps=1; ;nOps<<=1)
+  {
+    // Initialize memory
+    for (size_t i=0;i<sz;i++)
+    {
+      in_a[i] = (unsigned char)(i & 0xFF);
+      in_b[i] = (unsigned char)((i+99) & 0xFF);
+      out[i] = 0;
+    }
+    // setup params
+    for (int i=0;i<nThreads;i++)
+    {
+      params[i].a = in_a + i*sz1;
+      params[i].b = in_b + i*sz1;
+      params[i].out = out + i*sz1;
+      params[i].sz = sz1;
+      params[i].nOps = nOps;
+      params[i].wordSize = mWS;
+      params[i].variant = mVariant;
+      params[i].index = i;
+    }
+    // run threads
+    double t = runCPUThreads(params,AddNThread);
+    // Check result
+    if (nOps == 1)
+    {
+      // ZZZ: do it!
+    }
+
+    if (t < MIN_RUNNING_TIME) continue; // Too short
+    t /= (double)nOps;
+
+    mbps = (double)sz1*(double)nThreads*useRatio(mWS)*1.0e-6/t;
+    break;
+  } // nOps loop
+
+  _aligned_free(in_a);
+  _aligned_free(in_b);
+  _aligned_free(out);
+  return mbps;
+}
diff --git a/MPBenchmarks/ArithmeticTasks.h b/MPBenchmarks/ArithmeticTasks.h
new file mode 100644
index 0000000..fd322a5
--- /dev/null
+++ b/MPBenchmarks/ArithmeticTasks.h
@@ -0,0 +1,103 @@
+// Arithmetic tasks
+// (c) EB Nov 2009
+
+#ifndef ArithmeticTasks_h
+#define ArithmeticTasks_h
+
+#include "Config.h"
+#include "GPUTask.h"
+#include "CPUTask.h"
+
+enum AddNVariants
+{
+  ADDN_V1 = 0,
+  ADDN_V2,
+  ADDN_V3,
+  NB_ADDN_VARIANTS
+};
+
+enum Mul1Variants
+{
+  MUL1_V1 = 0,
+  NB_MUL1_VARIANTS
+};
+
+
+// Sum two buffers
+class AddNGPUTask : public GPUTask
+{
+
+public:
+
+  AddNGPUTask(int variant,int wordSize,Logger * log) : GPUTask("gpu_add.cl",log)
+  {
+    mWS = wordSize;
+    mVariant = variant;
+    char options[200];
+    _snprintf(options,200,"-DWORD_SIZE=%d",mWS);
+    mBuildOK = buildProgram(options,log);
+    if (!mBuildOK) fprintf(stderr,"Build failed\n");
+  }
+
+  double run(int workgroupSize,size_t sz);
+
+private:
+
+  // Word size
+  int mWS;
+  // Variant
+  int mVariant;
+  // Build OK?
+  bool mBuildOK;
+
+};
+
+// Multiply buffer by 1 digit
+class Mul1GPUTask : public GPUTask
+{
+
+public:
+
+  Mul1GPUTask(int variant,int blockSize,Logger * log) : GPUTask("gpu_mul1.cl",log)
+  {
+    mBS = blockSize;
+    mVariant = variant;
+    char options[200];
+    const int logBase = 30;
+    const int base = 1<<logBase;
+    const int baseMinus1 = base-1;
+    _snprintf(options,200,"-DLOG_BASE=%d -DBASE=%d -DBASE_MINUS1=%d -DBLOCK_SIZE=%d",logBase,base,baseMinus1,blockSize);
+    mBuildOK = buildProgram(options,log);
+  }
+
+  double run(int workgroupSize,size_t sz);
+
+private:
+
+  // Build OK?
+  bool mBuildOK;
+  // Block size
+  int mBS;
+  // Variant
+  int mVariant;
+
+};
+
+// CPU
+
+class AddNCPUTask : public CPUTask
+{
+public:
+  AddNCPUTask(int variant,int wordSize) : mWS(wordSize), mVariant(variant) { }
+
+  double run(int nThreads,size_t sz);
+
+private:
+
+  // Word size (32 or 64)
+  int mWS;
+  // Variant
+  int mVariant;
+};
+
+#endif // ArithmeticTasks_h
diff --git a/MPBenchmarks/CPUTask.h b/MPBenchmarks/CPUTask.h
new file mode 100644
index 0000000..62fe768
--- /dev/null
+++ b/MPBenchmarks/CPUTask.h
@@ -0,0 +1,22 @@
+// CPU task base class
+// (c) EB Nov 2009
+
+#ifndef CPUTask_h
+#define CPUTask_h
+
+class CPUTask
+{
+public:
+
+  // Destructor
+  virtual ~CPUTask() { }
+
+  // Run task on NTHREADS threads and output size SZ.
+  // If SZ is not a multiple of NTHREADS, SZ is rounded down to
+  // the closest multiple.
+  // Return throughput in MB/s of output, or -1 on error.
+  virtual double run(int nThreads,size_t sz) = 0;
+
+};
+
+#endif // CPUTask_h
diff --git a/MPBenchmarks/Config.h b/MPBenchmarks/Config.h
new file mode 100644
index 0000000..c9a72ff
--- /dev/null
+++ b/MPBenchmarks/Config.h
@@ -0,0 +1,172 @@
+// Config flags, common includes, and utils
+// (c) EB Nov 2009
+
+#ifndef Config_h
+#define Config_h
+
+// Configuration. The flags may be defined on the compiler command line,
+// and in this case we don't change them here.
+
+// Use Qt interface?
+#ifndef CONFIG_USE_QT
+#define CONFIG_USE_QT 1
+#endif
+
+// Use MPIR library to cross-check results?
+#ifndef CONFIG_USE_MPIR
+#define CONFIG_USE_MPIR 0
+#endif
+
+#ifdef WIN32
+#define _CRT_SECURE_NO_WARNINGS
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+// Minimal run time required to have a valid measurement (s)
+const double MIN_RUNNING_TIME = 0.2;
+
+// CPU memory alloc alignment (bytes)
+const int ALLOC_ALIGN = 4096;
+
+#include <malloc.h>
+#include <math.h>
+#include <string>
+
+// Windows specific
+#ifdef WIN32
+#include <emmintrin.h>
+#include <windows.h>
+inline double getT()
+{
+  LARGE_INTEGER c,f;
+  QueryPerformanceFrequency(&f);
+  QueryPerformanceCounter(&c);
+  return (double)c.QuadPart/(double)f.QuadPart;
+}
+#endif
+
+// Linux specific
+#ifdef Linux
+#include <sys/time.h>
+#include <pthread.h>
+inline void * _aligned_malloc(size_t sz,size_t a) { return memalign(a,sz); }
+inline void _aligned_free(void * x) { free(x); }
+#define _snprintf snprintf
+inline double getT()
+{
+  struct timeval tv;
+  gettimeofday(&tv,0);
+  return (double)tv.tv_sec+1.0e-6*(double)tv.tv_usec;
+}
+#endif
+
+// Common
+#if CONFIG_USE_MPIR
+#include <mpir.h>
+#endif
+#include "BealtoOpenCL.h"
+
+// Text logger
+class Logger
+{
+public:
+  virtual ~Logger() { }
+  virtual void append(const char * s) { printf("%s\n",s); fflush(stdout); }
+  void append(const std::string & s) { append(s.c_str()); }
+  virtual void appendTitle(const char * s) { printf("\n*** %s\n\n",s); fflush(stdout); }
+  void appendTitle(const std::string & s) { appendTitle(s.c_str()); }
+};
+
+#if CONFIG_USE_QT
+#include <QtGui/QTextEdit>
+#include <QtGui/QApplication>
+// Qt QTextEdit logger
+class QtLogger : public Logger
+{
+public:
+  QtLogger(QTextEdit * e) : mE(e) { }
+  void append(const char * s)
+  {
+    QTextCharFormat f;
+    f.setFontFamily("fixed");
+    mE->setCurrentCharFormat(f);
+    mE->append(s);
+    qApp->processEvents();
+  }
+  void appendTitle(const char * s)
+  {
+    QTextCharFormat f;
+    f.setFontWeight(QFont::Bold);
+    mE->setCurrentCharFormat(f);
+    mE->append(s);
+    qApp->processEvents();
+  }
+private:
+  QTextEdit * mE;
+};
+#endif
+
+// Return user friendly size from LOG (size = 1<<LOG) in S.
+inline void getUserSize(int log,std::string & s)
+{
+  if (log < 0 || log >= 40) { s.assign("???"); return; }
+  char aux[200];
+  if (log < 10) _snprintf(aux,200,"%d B",1<<log);
+  else if (log < 20) _snprintf(aux,200,"%d KiB",1<<(log-10));
+  else if (log < 30) _snprintf(aux,200,"%d MiB",1<<(log-20));
+  else if (log < 40) _snprintf(aux,200,"%d GiB",1<<(log-30));
+  s.assign(aux);
+}
+
+#ifdef Linux
+typedef void * (*ThreadProc)(void *);
+#define thread_return_t void *
+#endif
+#ifdef WIN32
+typedef LPTHREAD_START_ROUTINE ThreadProc;
+#define thread_return_t DWORD WINAPI
+#endif
+
+// Run one CPU thread calling F for each parameter in the PARAMS array.
+// Return the real time of execution, or -1 on error.
+template <class P> double runCPUThreads(std::vector<P> & params,ThreadProc f)
+{
+  int nt = (int)params.size(); // thread count
+  if (nt <= 0) return -1; // invalid
+  double t0 = getT();
+  bool ok = true;
+#ifdef Linux
+  std::vector<pthread_t> threads(nt,0);
+  // start all threads
+  for (int i=0;i<nt;i++)
+    {
+      int s = pthread_create(&(threads[i]),0,f,&(params[i]));
+      if (s != 0) { ok = false; threads[i] = 0; }
+    }
+  // join all threads
+  for (int i=0;i<nt;i++)
+    {
+      if (threads[i] == 0) continue; // creation failed
+      int s = pthread_join(threads[i],0);
+      if (s != 0) ok = false;
+    }
+#endif
+#ifdef WIN32
+  std::vector<HANDLE> threads(nt,0);
+  // start all threads
+  for (int i=0;i<nt;i++)
+    {
+      threads[i] = CreateThread(0,0,f,&(params[i]),0,0);
+      if (threads[i] == 0) ok = false;
+    }
+  // wait for all threads to terminate
+  WaitForMultipleObjects(nt,&(threads[0]),TRUE,INFINITE);
+  // delete threads
+  for (int i=0;i<nt;i++) CloseHandle(threads[i]);
+#endif
+  if (!ok) return -1; // error
+  double t = getT() - t0;
+  return t;
+}
+
+#endif // Config_h
diff --git a/MPBenchmarks/Conversions.h b/MPBenchmarks/Conversions.h
new file mode 100644
index 0000000..1ec9eb9
--- /dev/null
+++ b/MPBenchmarks/Conversions.h
@@ -0,0 +1,74 @@
+// Convert 30-bit and MPZ numbers to 32-bit hexa strings for comparisons
+// (c) EB Nov 2009
+
+#ifndef Conversions_h
+#define Conversions_h
+
+// Convert X[N] from redundand representation in base 2^30
+// to an hex string.
+void convert30(int n,const int * x,std::string & out)
+{
+  int nAux = 0; // Valid bits in aux
+  __int64 aux = 0;
+  std::vector<unsigned int> y(n,0);
+  int j = 0; // Next value in Y receiving bits
+
+  for (int i=0;i<n;i++)
+  {
+    // Insert X[i] into AUX
+    aux += ((__int64)x[i]<<nAux);
+    nAux += 30;
+    // Extract 32 bits if available
+    if (nAux>=32)
+    {
+      y[j++] = (unsigned int)(aux&0xFFFFFFFF);
+      aux >>= 32;
+      nAux -= 32;
+    }
+  }
+  // Extract last bits
+  while (nAux > 0)
+  {
+    y[j++] = (unsigned int)(aux&0xFFFFFFFF);
+    aux >>= 32;
+    nAux -= 32;
+  }
+  out.clear();
+  out.reserve(9*n+20);
+  char s[32];
+  bool first = true;
+  for (int i=n-1;i>=0;i--)
+  {
+    if (first)
+    {
+      if (y[i] == 0) continue;
+      first = false;
+    }
+    else out.push_back('.');
+    _snprintf(s,32,"%08X",y[i]);
+    out.append(s);
+  }
+  if (first) out.append("0");
+}
+
+#if CONFIG_USE_MPIR
+// Convert mpz_t integer to a string
+void convertMPZ(mpz_t x,std::string & out)
+{
+  char * s = mpz_get_str(0,16,x);
+  if (s == 0) return; // Failed
+  int n = (int)strlen(s);
+  out.clear();
+  out.reserve(n+n/8+8);
+  int digits = 0;
+  while ( (digits+n)&7 ) { out.push_back('0'); digits++; } // Initial 0 to get a multiple of 8
+  for (int i=0;i<n;i++)
+  {
+    if ( (digits&7) == 0 && i>0 ) out.push_back('.');
+    out.push_back(toupper(s[i])); digits++;
+  }
+  free(s);
+}
+#endif
+
+#endif // Conversions_h
diff --git a/MPBenchmarks/GPUTask.cpp b/MPBenchmarks/GPUTask.cpp
new file mode 100644
index 0000000..dbfa65c
--- /dev/null
+++ b/MPBenchmarks/GPUTask.cpp
@@ -0,0 +1,74 @@
+// GPU task base class
+// (c) EB Nov 2009
+
+#include "GPUTask.h"
+
+GPUTask::GPUTask(const char * programFile,Logger * log)
+{
+  cl::Context * c = 0;
+  cl::CommandQueue * q = 0;
+  cl::Program * p = 0;
+  bool ok = true;
+  std::string s;
+  std::vector<unsigned char> binary;
+
+  if (log == 0) { ok = false; goto END; }
+  c = cl::Context::create();
+  if (c == 0) { log->append("Context creation failed"); ok = false; goto END; }
+  q = c->createCommandQueue(0,0);
+  if (q == 0) { log->append("Command queue creation failed"); ok = false; goto END; }
+  if (programFile == 0) p = 0;
+  else
+  {
+#ifdef WIN32
+    // We run from the vs2008 solution directory, so we must
+    // change the path... (quick & dirty patch)
+    char programFileWin[1000];
+    _snprintf(programFileWin,1000,"../%s",programFile);
+    p = c->createProgramWithFile(programFileWin);
+#else
+    p = c->createProgramWithFile(programFile);
+#endif
+    if (p == 0) { log->append("Program creation failed"); ok = false; goto END; }
+  }
+
+END:
+  if (!ok)
+  {
+    if (p != 0) { delete p; p = 0; }
+    if (q != 0) { delete q; q = 0; }
+    if (c != 0) { delete c; c = 0; }
+  }
+  mProgram = p;
+  mQueue = q;
+  mContext = c;
+}
+
+GPUTask::~GPUTask()
+{
+  delete mProgram;
+  delete mQueue;
+  delete mContext;
+}
+
+bool GPUTask::buildProgram(const char * options,Logger * log)
+{
+  if (mProgram == 0) return false; // Invalid
+  if (options == 0 || log == 0) return false; // Invalid
+
+  std::string s;
+  bool ok = mProgram->build(options,s);
+  if (!ok) { log->append("Build failed, errors:"); log->append(s.c_str()); }
+#if 0 // Show PTX code (nvidia only)
+  ok &= p->getBinary(binary);
+  if (!ok) { log->append("Get binary failed"); goto END; }
+  log->append((char *)&(binary[0]));
+#endif
+  return ok;
+}
+
+cl::Kernel * GPUTask::createKernel(const char * name)
+{
+  if (mProgram == 0) return 0; // Invalid
+  return mProgram->createKernel(name);
+}
diff --git a/MPBenchmarks/GPUTask.h b/MPBenchmarks/GPUTask.h
new file mode 100644
index 0000000..d9f8155
--- /dev/null
+++ b/MPBenchmarks/GPUTask.h
@@ -0,0 +1,56 @@
+// GPU task base class
+// (c) EB Nov 2009
+
+#ifndef GPUTask_h
+#define GPUTask_h
+
+#include "Config.h"
+
+class GPUTask
+{
+public:
+
+  // Create a new context, with a command queue, and the given program.
+  // PROGRAMFILE may be 0 if no program is needed.
+  // Check if the class is valid after construction with isValid().
+  GPUTask(const char * programFile,Logger * log);
+
+  // Destructor. Release CL objects
+  virtual ~GPUTask();
+
+  // Check if the task is well-defined.  May be redefined in derived classes
+  // to add more checks (call base class if redefined).
+  virtual bool isValid() { return (mContext != 0) && (mQueue != 0); }
+
+  // Functions to redefine in derived classes
+
+  // Run test producing SZ bytes of output.
+  // Return throughput in MB/s of generated output,
+  // or -1 on error.
+  virtual double run(int workgroupSize,size_t sz) = 0;
+
+protected:
+
+  // Build the program with OPTIONS.  Error messages are put in LOG.
+  // Return TRUE if OK, FALSE on error.
+  bool buildProgram(const char * options,Logger * log);
+
+  // Create a kernel for the current program, unless undefined
+  cl::Kernel * createKernel(const char * name);
+
+  inline cl::Context * getContext() { return mContext; }
+  inline cl::CommandQueue * getQueue() { return mQueue; }
+  inline cl::Program * getProgram() { return mProgram; }
+
+private:
+
+  // No copy
+  GPUTask(const GPUTask &);
+  GPUTask & operator = (const GPUTask &);
+
+  cl::Context * mContext;
+  cl::CommandQueue * mQueue;
+  cl::Program * mProgram;
+};
+
+#endif // GPUTask_h
diff --git a/MPBenchmarks/LICENSE.txt b/MPBenchmarks/LICENSE.txt
new file mode 100644
index 0000000..f5fe116
--- /dev/null
+++ b/MPBenchmarks/LICENSE.txt
@@ -0,0 +1,27 @@
+This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Eric Bainville nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MPBenchmarks/Makefile b/MPBenchmarks/Makefile
new file mode 100644
index 0000000..ebd701b
--- /dev/null
+++ b/MPBenchmarks/Makefile
@@ -0,0 +1,72 @@
+# OpenCL multiprecision CPU+GPU benchmarks
+# EB Nov 2009
+
+# archive data
+CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M)
+CURRENT_DIR := $(notdir $(shell /bin/pwd))
+CURRENT_MACHINE := $(shell uname -m)
+
+# output directories
+OBJDIR := objs
+DEPDIR := deps
+ifeq ($(CURRENT_MACHINE),x86_64)
+LIBDIR := lib-64
+else
+LIBDIR := lib-32
+endif
+
+# targets
+OBJS = $(patsubst %,$(OBJDIR)/%.o, GPUTask MemoryTasks ArithmeticTasks appMain )
+DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS))
+
+# OpenCL SDK directory
+# OPENCL_DIR = /opt/cuda/include
+OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64
+# Bealto OpenCL library path
+BEALTO_OPENCL_DIR = ../BealtoOpenCL
+BEALTO_OPENCL_LIB = $(BEALTO_OPENCL_DIR)/$(LIBDIR)/libBealtoOpenCL.a
+
+# flags
+CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I/usr/include/qt4 -I$(OPENCL_DIR)/include -I$(BEALTO_OPENCL_DIR)/include
+LDFLAGS = -L/usr/lib/qt4 -L$(OPENCL_DIR)/lib/x86_64
+
+all: tocl
+
+tocl: $(OBJS) $(BEALTO_OPENCL_LIB)
+	g++ $(LDFLAGS) -o $@ $^ $(BEALTO_OPENCL_LIB) -lpthread -lOpenCL # -lQtCore -lQtGui 
+
+$(BEALTO_OPENCL_LIB):
+	make -C $(BEALTO_OPENCL_DIR) lib
+
+t1: t1.o
+	g++ -o $@ $^
+
+dos2unix:
+	dos2unix *.cpp *.h *.cl Makefile
+
+clean:
+	/bin/rm -f *.o *~ tocl t1
+	/bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR)
+
+archive: clean
+	@echo "ARCHIVE $(CURRENT_DATE)"
+	tar czf "../MPBenchmarks-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR)
+
+##### Dependencies
+$(DEPDIR)/%.d: %.cpp
+	@[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR)
+	@/bin/echo -e "DEPS \033[32m$*\033[0m"
+	@$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $<
+
+##### Compilation
+$(OBJDIR)/%.o: %.cpp
+	@[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR)
+	@/bin/echo -e "C++  \033[34m$*\033[0m"
+	@$(CXX) $(CXXFLAGS) -c -o $@ $<
+
+##### Qt
+moc_%.cpp: %.h
+	@/bin/echo -e "MOC  \033[34m$*\033[0m"
+	@moc -o $@ $<
+
+-include $(DEPS)
diff --git a/MPBenchmarks/MemoryTasks.cpp b/MPBenchmarks/MemoryTasks.cpp
new file mode 100644
index 0000000..a7780bc
--- /dev/null
+++ b/MPBenchmarks/MemoryTasks.cpp
@@ -0,0 +1,280 @@
+// Memory tasks
+// (c) EB Nov 2009
+
+#include "MemoryTasks.h"
+
+#include <string.h>
+
+
+// GPU
+
+double CopyGPUTask::run(int workgroupSize,size_t sz)
+{
+  cl::Context * c = getContext();
+  cl::CommandQueue * q = getQueue();
+  if (c == 0 || q == 0) return -1;
+
+  // Check allocation size
+  size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+  if (mCT == DEVICE_TO_DEVICE_COPY)
+  {
+    if (2*sz > max_sz) return -1;
+  }
+  if (sz > max_sz) return -1;
+
+  bool ok = true;
+  cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  if (a == 0) ok = false;
+  cl::Buffer * b = 0;
+  if (mCT == DEVICE_TO_DEVICE_COPY)
+  {
+    b = c->createBuffer(CL_MEM_READ_WRITE,sz);
+    if (b == 0) ok = false;
+  }
+  unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+  if (buf == 0) ok = false;
+  double mbps = -1;
+
+  if (!ok) goto END; // Alloc failed
+
+  // Initialize A and check errors
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+  if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0;
+  if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END;
+  ok = true;
+
+  // check write+read loop
+  for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false;
+  if (!ok) { fprintf(stderr,"write+read failed\n"); goto END; }
+
+  // check write+copy+read loop
+  if (b != 0)
+  {
+    for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+    if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+    for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0;
+    if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END;
+    if (!q->copyBuffer(a,b,0,0,sz).isValid()) goto END;
+    if (!q->readBuffer(b,true,0,sz,buf).isValid()) goto END;
+    for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false;
+    if (!ok) { fprintf(stderr,"write+copy+read failed\n"); goto END; }
+  }
+
+  // Run tests, double nOps until the min time is reached
+  for (int nOps = 5; ; nOps <<= 1)
+  {
+    double t0 = getT();
+    switch (mCT)
+    {
+    case HOST_TO_DEVICE_COPY:
+      for (int i=0;i<nOps;i++) q->writeBuffer(a,false,0,sz,buf);
+      break;
+    case DEVICE_TO_HOST_COPY:
+      for (int i=0;i<nOps;i++) q->readBuffer(a,false,0,sz,buf);
+      break;
+    case DEVICE_TO_DEVICE_COPY:
+      for (int i=0;i<nOps;i++) q->copyBuffer(a,b,0,0,sz);
+      break;
+    }
+    q->finish();
+    double t = (getT() - t0);
+    if (t < MIN_RUNNING_TIME) continue;
+    // OK, t is large enough
+    t /= (double)nOps;
+    mbps = (double)sz*1.0e-6/t; // MB/s
+    break;
+  }
+
+END:
+  if (buf != 0) _aligned_free(buf);
+  if (a != 0) delete a;
+  if (b != 0) delete b;
+
+  return mbps;
+}
+
+double ZeroGPUTask::run(int workgroupSize,size_t sz)
+{
+  cl::Context * c = getContext();
+  cl::CommandQueue * q = getQueue();
+  cl::Program * p = getProgram();
+  if (c == 0 || q == 0 || p == 0) return -1;
+
+  // Check allocation size
+  size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size
+  if (sz > max_sz) return -1;
+
+  cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz);
+  unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16);
+  cl::Kernel * kernel = p->createKernel("zero");
+  double mbps = -1;
+  int n = (int)sz / (mWS>>3); // N words in SZ
+  bool ok;
+  if (kernel == 0 || a == 0 || buf == 0) goto END;
+
+  // Initialize A
+  for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF);
+  if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END;
+
+  // Run tests, double nOps until min time is reached
+  kernel->setArg(0,a);
+  if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END;
+
+  // check write+zero+read
+  if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END;
+  ok = true;
+  for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)0) ok = false;
+  if (!ok) { fprintf(stderr,"write+zero+read failed\n"); goto END; }
+
+  for (int nOps = 5; ; nOps <<= 1)
+  {
+    double t0 = getT();
+    for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize);
+    q->finish();
+    double t = (getT() - t0);
+    if (t < MIN_RUNNING_TIME) continue;
+    // OK, t is large enough
+    t /= (double)nOps;
+    mbps = (double)sz*1.0e-6/t; // MB/s
+    break;
+  }
+
+END:
+  if (buf != 0) _aligned_free(buf);
+  if (a != 0) delete a;
+  if (kernel != 0) delete kernel;
+
+  return mbps;
+}
+
+// CPU
+
+struct CopyThreadParam
+{
+  int nOps; // number of loops
+  size_t sz; // size to process in thread
+  const unsigned char * in;
+  unsigned char * out;
+};
+
+thread_return_t CopyThread(void * x)
+{
+  CopyThreadParam * p = (CopyThreadParam *)x;
+#ifdef WIN32
+  for (int i=0;i<p->nOps;i++) CopyMemory(p->out,p->in,p->sz);
+#else
+  for (int i=0;i<p->nOps;i++) memcpy(p->out,p->in,p->sz);
+#endif
+  return 0;
+}
+
+double CopyCPUTask::run(int nThreads,size_t sz)
+{
+  if (nThreads <= 0 || sz <= 0) return -1; // invalid
+  size_t sz1 = sz / nThreads;
+  if (sz1 <= 0) return -1; // SZ too small
+
+  unsigned char * in = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  double mbps = -1;
+
+  std::vector<CopyThreadParam> params(nThreads);
+  for (int nOps=1; ;nOps<<=1)
+  {
+    // Initialize memory
+    for (size_t i=0;i<sz;i++)
+    {
+      in[i] = (unsigned char)(i & 0xFF);
+      out[i] = 0;
+    }
+    // setup params
+    for (int i=0;i<nThreads;i++)
+    {
+      params[i].in = in + i*sz1;
+      params[i].out = out + i*sz1;
+      params[i].sz = sz1;
+      params[i].nOps = nOps;
+    }
+    // run threads
+    double t = runCPUThreads(params,CopyThread);
+    // Check result
+    if (nOps == 1)
+    {
+      if (memcmp(in,out,sz) != 0) { fprintf(stderr,"CPU copy error\n"); mbps = -1; break; }
+    }
+
+    if (t < MIN_RUNNING_TIME) continue; // Too short
+    t /= (double)nOps;
+
+    mbps = (double)sz1*(double)nThreads*1.0e-6/t;
+    break;
+  }
+
+  _aligned_free(in);
+  _aligned_free(out);
+  return mbps;
+}
+
+struct ZeroThreadParam
+{
+  int nOps; // number of loops
+  size_t sz; // size to process in thread
+  unsigned char * out;
+};
+
+thread_return_t ZeroThread(void * x)
+{
+  CopyThreadParam * p = (CopyThreadParam *)x;
+#ifdef WIN32
+  for (int i=0;i<p->nOps;i++) ZeroMemory(p->out,p->sz);
+#else
+  for (int i=0;i<p->nOps;i++) memset(p->out,0,p->sz);
+#endif
+  return 0;
+}
+
+double ZeroCPUTask::run(int nThreads,size_t sz)
+{
+  if (nThreads <= 0 || sz <= 0) return -1; // invalid
+  size_t sz1 = sz / nThreads;
+  if (sz1 <= 0) return -1; // SZ too small
+
+  unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN);
+  double mbps = -1;
+
+  std::vector<CopyThreadParam> params(nThreads);
+  for (int nOps=1; ;nOps<<=1)
+  {
+    // Initialize memory
+    for (size_t i=0;i<sz;i++)
+    {
+      out[i] = (unsigned char)(i & 0xFF);
+    }
+    // setup params
+    for (int i=0;i<nThreads;i++)
+    {
+      params[i].out = out + i*sz1;
+      params[i].sz = sz1;
+      params[i].nOps = nOps;
+    }
+    // run threads
+    double t = runCPUThreads(params,ZeroThread);
+    // Check result
+    if (nOps == 1)
+    {
+      bool ok = true;
+      for (size_t i=0;i<sz;i++) if ( out[i] != 0 ) { ok = false; break; }
+      if (!ok) { fprintf(stderr,"CPU zero error\n"); mbps = -1; break; }
+    }
+
+    if (t < MIN_RUNNING_TIME) continue; // Too short
+    t /= (double)nOps;
+
+    mbps = (double)sz1*(double)nThreads*1.0e-6/t;
+    break;
+  }
+
+  _aligned_free(out);
+  return mbps;
+}
diff --git a/MPBenchmarks/MemoryTasks.h b/MPBenchmarks/MemoryTasks.h
new file mode 100644
index 0000000..92e35f6
--- /dev/null
+++ b/MPBenchmarks/MemoryTasks.h
@@ -0,0 +1,83 @@
+// Memory tasks
+// (c) EB Nov 2009
+
+#ifndef MemoryTasks_h
+#define MemoryTasks_h
+
+#include "Config.h"
+#include "GPUTask.h"
+#include "CPUTask.h"
+
+// Copy two buffers (GPU)
+class CopyGPUTask : public GPUTask
+{
+
+public:
+
+  enum CopyType
+  {
+    HOST_TO_DEVICE_COPY,
+    DEVICE_TO_HOST_COPY,
+    DEVICE_TO_DEVICE_COPY
+  };
+
+  CopyGPUTask(int copyType,Logger * log) : GPUTask(0,log)
+  {
+    mCT = copyType;
+  }
+
+  ~CopyGPUTask()
+  {
+  }
+
+  double run(int workgroupSize,size_t sz);
+
+private:
+
+  // Copy type
+  int mCT;
+
+};
+
+// Zero one buffer (GPU)
+class ZeroGPUTask : public GPUTask
+{
+
+public:
+
+  ZeroGPUTask(int wordSize,Logger * log) : GPUTask("gpu_zero.cl",log)
+  {
+    mWS = wordSize;
+    char options[200];
+    _snprintf(options,200,"-DWORD_SIZE=%d",mWS);
+    buildProgram(options,log);
+  }
+
+  ~ZeroGPUTask()
+  {
+  }
+
+  double run(int workgroupSize,size_t sz);
+
+private:
+
+  // Word size
+  int mWS;
+
+};
+
+// CPU
+
+class CopyCPUTask : public CPUTask
+{
+public:
+  double run(int nThreads,size_t sz);
+};
+
+class ZeroCPUTask : public CPUTask
+{
+public:
+  double run(int nThreads,size_t sz);
+};
+
+#endif // MemoryTasks_h
diff --git a/MPBenchmarks/appMain.cpp b/MPBenchmarks/appMain.cpp
new file mode 100644
index 0000000..f6f82bf
--- /dev/null
+++ b/MPBenchmarks/appMain.cpp
@@ -0,0 +1,284 @@
+// (c) EB Nov 2009
+
+#include "Config.h"
+
+#include "MemoryTasks.h"
+#include "ArithmeticTasks.h"
+
+#if CONFIG_USE_QT
+#include <QtGui/QApplication>
+#include <QtGui/QMainWindow>
+#include <QtGui/QTextEdit>
+#include <QtGui/QClipboard>
+#endif
+
+struct TestResult
+{
+  std::string title;
+  std::vector<int> mbps; // MB/s by logSize
+  void update(int ls,int x) { if (x > mbps[ls]) mbps[ls] = x; } // keep max
+};
+
+void print(FILE * f,const TestResult & r,int minLogSize,int maxLogSize)
+{
+  fprintf(f,"%-25s",r.title.c_str());
+  for (int i=minLogSize;i<=maxLogSize;i++)
+    {
+      int x = r.mbps[i];
+      fprintf(f,"%8d",x);
+    }
+  fprintf(f,"\n");
+}
+
+void print(Logger * log,const TestResult & r,int minLogSize,int maxLogSize)
+{
+  char aux[200];
+  std::string s;
+  _snprintf(aux,200,"%-25s",r.title.c_str()); s.append(aux);
+  for (int i=minLogSize;i<=maxLogSize;i++)
+    {
+      int x = r.mbps[i];
+      _snprintf(aux,200,"%8d",x); s.append(aux);
+    }
+  log->append(s);
+}
+
+int main(int argc,char ** argv)
+{
+#if CONFIG_USE_QT
+  // Qt windows
+  QApplication app(argc,argv);
+  QMainWindow * w = new QMainWindow();
+  QTextEdit * te = new QTextEdit();
+  te->setReadOnly(true);
+  w->setCentralWidget(te);
+  w->setMinimumSize(QSize(800,600));
+  w->show();
+  app.processEvents();
+  Logger * log = new QtLogger(te);
+#else
+  // Command line
+  Logger * log = new Logger();
+#endif
+
+#if 1
+  // Display device info
+  cl::Context * c = cl::Context::create();
+  if (c != 0)
+    {
+      std::string s;
+      if (c->getAllDeviceInfo(0,s)) log->append(s.c_str());
+      delete c;
+      c = 0;
+    }
+#endif
+
+  int minLogSize = 10;
+  int maxLogSize = 29;
+  std::vector<TestResult> allTests;
+
+  // Run all CPU tests
+  for (int task=0;task<100;task++)
+    {
+      // Select tests here
+      // if (task != 30) continue;
+      // continue;
+
+      CPUTask * g = 0;
+      // Default values
+      std::string title;
+      int minNThreads = 1;
+      int maxNThreads = 32;
+
+      switch (task)
+	{
+	case 0:
+	  title.assign("CPU copyN");
+	  g = new CopyCPUTask();
+	  break;
+	case 1:
+	  title.assign("CPU zeroN");
+	  g = new ZeroCPUTask();
+	  break;
+
+	case 10:
+	  title.assign("CPU addN v1 16-bit");
+	  g = new AddNCPUTask(ADDN_V1,16);
+ 	  break;
+	case 11:
+	  title.assign("CPU addN v1 32-bit");
+	  g = new AddNCPUTask(ADDN_V1,32);
+ 	  break;
+	case 12:
+	  title.assign("CPU addN v1 64-bit");
+	  g = new AddNCPUTask(ADDN_V1,64);
+	  break;
+
+	case 20:
+	  title.assign("CPU addN v2 16-bit");
+	  g = new AddNCPUTask(ADDN_V2,16);
+	  break;
+	case 21:
+	  title.assign("CPU addN v2 32-bit");
+	  g = new AddNCPUTask(ADDN_V2,32);
+	  break;
+	case 22:
+	  title.assign("CPU addN v2 64-bit");
+	  g = new AddNCPUTask(ADDN_V2,64);
+	  break;
+
+#ifndef Linux
+	case 30:
+	  title.assign("CPU addN v3 32-bit");
+	  g = new AddNCPUTask(ADDN_V3,32);
+	  break;
+#endif
+	}
+      if (g == 0) continue; // nothing to do
+
+      TestResult tr;
+      tr.title.assign(title);
+      tr.mbps.resize(1+maxLogSize,0);
+      log->appendTitle(title.c_str());
+      for (int nt=minNThreads;nt<=maxNThreads;nt<<=1)
+	for (int ls=minLogSize;ls<=maxLogSize;ls++)
+	  {
+	    size_t sz = 1<<ls;
+	    double mbps = g->run(nt,sz);
+	    char aux[200];
+	    std::string usz;
+	    getUserSize(ls,usz);
+	    _snprintf(aux,200,"NT=%3d  SZ=%7s   %6.0f MB/s",nt,usz.c_str(),mbps);
+	    log->append(aux);
+	    tr.update(ls,mbps);
+	  }
+
+      print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+      print(log,tr,minLogSize,maxLogSize);
+#endif
+      allTests.push_back(tr);
+      delete g;
+    }
+
+  // Run all GPU tests
+  for (int task=0;task<100;task++)
+    {
+      // Select tests here
+      //continue;
+      //if (task != 30) continue;
+
+      GPUTask * g = 0;
+      // Default values
+      std::string title;
+      int minWorkgroupSize = 16;
+      int maxWorkgroupSize = 512;
+
+      switch (task)
+	{
+	case 0:
+	  title.assign("Host to device copy");
+	  g = new CopyGPUTask(CopyGPUTask::HOST_TO_DEVICE_COPY,log);
+	  minWorkgroupSize = maxWorkgroupSize;
+	  break;
+	case 1:
+	  title.assign("Device to host copy");
+	  g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_HOST_COPY,log);
+	  minWorkgroupSize = maxWorkgroupSize;
+	  break;
+	case 2:
+	  title.assign("copyN");
+	  g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_DEVICE_COPY,log);
+	  minWorkgroupSize = maxWorkgroupSize;
+	  break;
+	case 3:
+	  title.assign("zeroN 32-bit");
+	  g = new ZeroGPUTask(32,log);
+	  break;
+	case 4:
+	  title.assign("zeroN 64-bit");
+	  g = new ZeroGPUTask(64,log);
+	  break;
+
+	case 10:
+	  title.assign("AddN v1 16-bit");
+	  g = new AddNGPUTask(ADDN_V1,16,log);
+	  break;
+	case 11:
+	  title.assign("AddN v1 32-bit");
+	  g = new AddNGPUTask(ADDN_V1,32,log);
+	  break;
+	case 12:
+	  title.assign("AddN v1 64-bit");
+	  g = new AddNGPUTask(ADDN_V1,64,log);
+	  break;
+
+	case 20:
+	  title.assign("AddN v2 16-bit");
+	  g = new AddNGPUTask(ADDN_V2,16,log);
+	  break;
+	case 21:
+	  title.assign("AddN v2 32-bit");
+	  g = new AddNGPUTask(ADDN_V2,32,log);
+	  break;
+	case 22:
+	  title.assign("AddN v2 64-bit");
+	  g = new AddNGPUTask(ADDN_V2,64,log);
+	  break;
+
+	case 40:
+	  title.assign("Mul1 v1 32-bit");
+	  g = new Mul1GPUTask(MUL1_V1,1,log);
+	  break;
+
+	}
+      if (g == 0) continue; // nothing to do
+
+      TestResult tr;
+      tr.title.assign(title);
+      tr.mbps.resize(1+maxLogSize,0);
+      log->appendTitle(title.c_str());
+      for (int wg=minWorkgroupSize;wg<=maxWorkgroupSize;wg<<=1)
+	{
+	  char aux[1000];
+	  _snprintf(aux,1000,"%s - workgroup_size=%d",title.c_str(),wg);
+	  log->append(aux);
+	  for (int ls=minLogSize;ls<=maxLogSize;ls++)
+	    {
+	      size_t sz = 1<<ls;
+	      double mbps = g->run(wg,sz);
+	      std::string usz;
+	      getUserSize(ls,usz);
+	      _snprintf(aux,1000,"WG=%3d  SZ=%7s   %6.0f MB/s",wg,usz.c_str(),mbps);
+	      log->append(aux);
+	      tr.update(ls,mbps);
+	    }
+	}
+      
+      print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+      print(log,tr,minLogSize,maxLogSize);
+#endif
+      allTests.push_back(tr);
+      delete g;
+    }
+
+  // Dump all results
+  int nt = (int)allTests.size();
+  log->appendTitle("RESULTS");
+  printf("\n----\n");
+  for (int i=0;i<nt;i++)
+    {
+      TestResult & tr = allTests[i];
+      print(stdout,tr,minLogSize,maxLogSize);
+#if CONFIG_USE_QT
+      print(log,tr,minLogSize,maxLogSize);
+#endif
+    }
+
+#if CONFIG_USE_QT
+  // Enter event loop to terminate
+  log->appendTitle("DONE");
+  app.exec();
+#endif
+}
diff --git a/MPBenchmarks/gpu_add.cl b/MPBenchmarks/gpu_add.cl
new file mode 100644
index 0000000..7237395
--- /dev/null
+++ b/MPBenchmarks/gpu_add.cl
@@ -0,0 +1,86 @@
+// GPU addition
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+
+typedef long Word;
+const Word LOG_BASE = 62;
+const Word BASE = (Word)1<<(Word)62;
+const Word BASE_MINUS1 = (Word)1<<(Word)62 - 1;
+
+#elif (WORD_SIZE == 32)
+
+typedef int Word;
+const Word LOG_BASE = 30;
+const Word BASE = (Word)1<<(Word)30;
+const Word BASE_MINUS1 = (Word)1<<(Word)30 - 1;
+
+#elif (WORD_SIZE == 16)
+
+typedef short Word;
+const Word LOG_BASE = 14;
+const Word BASE = (Word)1<<(Word)14;
+const Word BASE_MINUS1 = (Word)1<<(Word)14 - 1;
+
+#endif
+
+__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z)
+{
+  int i = get_global_id(0);
+
+  // Carry from I-1
+  Word t = (Word)0;
+  if (i>0)
+  {
+    t = (x[i-1]+y[i-1]) >> LOG_BASE;
+  }
+
+  // Sum I
+  Word s = (x[i]+y[i]) & BASE_MINUS1;
+
+  // Final result
+  z[i] = t + s;
+}
+
+__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z)
+{
+  int i = get_global_id(0);
+  z[i] = x[i] + y[i];
+}
+
+#if 0
+__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z,
+		   __local Word * sLoc)
+{
+  int i = get_global_id(0);
+  int ii = get_local_id(0);
+  sLoc[ii] = x[i] + y[i];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Carry from I-1
+  Word t = (Word)0;
+  Word s = 0;
+#if 0
+  if (ii == 0)
+  {
+    s = (i>0)?(x[i-1] + y[i-1]):0;
+  }
+  else
+  {
+    // s = sLoc[ii-1];
+  }
+#endif
+  // s = sLoc[ii-1];
+  if (s >= MAX_WORD) t = 1;
+  else if (s <= -MAX_WORD) t = -1;
+
+  // Sum I
+  s = sLoc[ii-1];
+  if (s >= MAX_WORD) s -= BASE;
+  else if (s <= -MAX_WORD) s += BASE;
+
+  // Final result
+  z[i] = t + s;
+}
+#endif
diff --git a/MPBenchmarks/gpu_mp.cl b/MPBenchmarks/gpu_mp.cl
new file mode 100644
index 0000000..703f5fa
--- /dev/null
+++ b/MPBenchmarks/gpu_mp.cl
@@ -0,0 +1,153 @@
+// GPU multi-precision (old file)
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+
+typedef long Word;
+const Word BASE = (Word)1<<(Word)60;
+const Word MAX_WORD = (Word)1<<(Word)60 - (Word)1;
+
+#elif (WORD_SIZE == 32)
+
+typedef int Word;
+const Word BASE = (Word)1<<(Word)30;
+const Word MAX_WORD = (Word)1<<(Word)30 - (Word)1;
+
+#elif (WORD_SIZE == 16)
+
+typedef short Word;
+const Word BASE = (Word)1<<(Word)14;
+const Word MAX_WORD = (Word)1<<(Word)14 - (Word)1;
+
+#elif (WORD_SIZE == 128)
+
+typedef float4 Word;
+
+#endif
+
+__kernel void copy(__global Word * x,__global Word * z)
+{
+  int i = get_global_id(0);
+  z[i] = x[i];
+}
+
+__kernel void zero(__global Word * a)
+{
+  int i = get_global_id(0);
+  a[i] = 0.0f;
+}
+
+__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z)
+{
+  int i = get_global_id(0);
+
+  // Carry from I-1
+  Word t = (Word)0;
+  if (i>0)
+  {
+    Word s1 = x[i-1]+y[i-1];
+    if (s1 >= MAX_WORD) t = 1;
+    else if (s1 <= -MAX_WORD) t = -1;
+  }
+
+  // Sum I
+  Word s = x[i]+y[i];
+  if (s >= MAX_WORD) s -= BASE;
+  else if (s <= -MAX_WORD) s += BASE;
+
+  // Final result
+  z[i] = t + s;
+}
+
+__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z,
+		   __local Word * sLoc)
+{
+  int i = get_global_id(0);
+  int ii = get_local_id(0);
+  sLoc[ii] = x[i] + y[i];
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Carry from I-1
+  Word t = (Word)0;
+  Word s = 0;
+#if 0
+  if (ii == 0)
+  {
+    s = (i>0)?(x[i-1] + y[i-1]):0;
+  }
+  else
+  {
+    // s = sLoc[ii-1];
+  }
+#endif
+  // s = sLoc[ii-1];
+  if (s >= MAX_WORD) t = 1;
+  else if (s <= -MAX_WORD) t = -1;
+
+  // Sum I
+  s = sLoc[ii-1];
+  if (s >= MAX_WORD) s -= BASE;
+  else if (s <= -MAX_WORD) s += BASE;
+
+  // Final result
+  z[i] = t + s;
+}
+
+__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z)
+{
+  int i = get_global_id(0);
+  z[i] = x[i] + y[i];
+}
+
+__kernel void propagate(__global const Word * x,__global Word * z)
+{
+  int i = get_global_id(0);
+  Word a = x[i];
+  Word b = 0;
+}
+
+// ______________________________________________________________________
+//
+
+#if (VECTOR_LENGTH == 1)
+typedef float vector_t;
+#elif (VECTOR_LENGTH == 2)
+typedef float2 vector_t;
+#elif (VECTOR_LENGTH == 4)
+typedef float4 vector_t;
+#endif
+
+__kernel void crunch(__global float * out)
+{
+  vector_t x,y,cs,sn,xx,yy;
+  x = 1.0f;
+  y = 0.0f;
+  cs = cos(2.0f); // random angle
+  sn = sin(2.0f);
+  for (int i=0;i<N_ROTATIONS;i++)
+  {
+    xx = x*cs - y*sn;
+    yy = y*cs + x*sn;
+    x = xx;
+    y = yy;
+  }
+  out[get_global_id(0)] = dot(x,y);
+}
+
+__kernel void crunch2(__global float * out)
+{
+  vector_t x,y,cs,sn,xx,yy;
+  x = 1.0f;
+  y = 0.0f;
+  cs = cos(2.0f); // random angle
+  sn = sin(2.0f);
+  for (int i=0;i<N_ROTATIONS;i++)
+  {
+    xx = mad(x,cs,-y*sn);
+    yy = mad(y,cs,x*sn);
+    x = xx;
+    y = yy;
+  }
+  out[get_global_id(0)] = dot(x,y);
+}
diff --git a/MPBenchmarks/gpu_mul1.cl b/MPBenchmarks/gpu_mul1.cl
new file mode 100644
index 0000000..be2875b
--- /dev/null
+++ b/MPBenchmarks/gpu_mul1.cl
@@ -0,0 +1,116 @@
+// GPU multi-precision
+// (c) EB Nov 2009
+
+__kernel void mul1_v0(int k,__global const int * x,__global int * z)
+{
+  int i = get_global_id(0);
+  z[i] = (int)(((long)k*(long)x[i]) & (long)BASE_MINUS1);
+}
+
+__kernel void mul1_v1(int k,__global const int * x,__global int * z)
+{
+  int i = get_global_id(0);
+  long z0 = ((long)k*(long)x[i]) & (long)BASE_MINUS1;
+  long z1 = (((long)k*(long)((i>0)?x[i-1]:0))>>LOG_BASE) & (long)BASE_MINUS1;
+  long z2 = (((long)k*(long)((i>1)?x[i-2]:0))>>(2*LOG_BASE)) & (long)BASE_MINUS1;
+  z[i] = (int)(z0+z1+z2);
+}
+
+#if 0
+__kernel void mul1_v2(int k,__global const int * x,__global int * z)
+{
+  __local long aux[WORKGROUP_SIZE+2];
+  int i = get_global_id(0); // in X
+  int ii = get_local_id(0); // in workgroup
+  long s;
+
+  // Load X[i]*K into AUX for all threads of the workgroup + 2
+  s = (long)x[i];
+  aux[ii+2] = s * (long)k;
+  if (ii < 2) // 2 work items load the previous 2 values
+  {
+    s = (long)((i>2)?x[i-2]:0);
+    aux[ii] = s * (long)k;
+  }
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  // Shift, mask and sum the 3 consecutive words in each cell
+  s  = (aux[ii]>>(2*LOG_BASE)) & (long)BASE_MINUS1;
+  s += (aux[ii+1]>>(LOG_BASE)) & (long)BASE_MINUS1;
+  s +=  aux[ii+2]              & (long)BASE_MINUS1;
+
+  // Store the result
+  z[i] = (int)s;
+}
+#endif
+
+#if 0
+__kernel void mul1_v3(int k,__global const int * x,__global int * z)
+{
+  // Get index of block to compute
+  int i = get_global_id(0) * BLOCK_SIZE;
+
+  // Load the previous two values
+  long p0,p1,p2;
+  if (i>0)
+  {
+    p2 = (long)x[i-2]*(long)k; p2 >>= (2*LOG_BASE);
+    p1 = (long)x[i-1]*(long)k; p1 >>= LOG_BASE;
+  }
+  else p2 = p1 = 0;
+
+  // Compute the block (sequentially)
+  for (int j=0;j<BLOCK_SIZE;j++)
+  {
+    // Load one value
+    p0 = (long)x[i]*(long)k;
+    // Store one result computed from the last 3 values
+    z[i] = (int)((p0&(long)BASE_MINUS1)+(p1&(long)BASE_MINUS1)+p2);
+    // Shift
+    i++;
+    p2 = p1 >> LOG_BASE;
+    p1 = p0 >> LOG_BASE;
+  }
+}
+#endif
+
+__kernel void mul1_v3bis(int k,__global const int * x,__global int * z)
+{
+  int i = get_global_id(0) << 1;
+  long m1,m2;
+  if (i>0)
+  {
+    m2 = ((long)k*(long)x[i-2]) >> (2*LOG_BASE);
+    m1 = ((long)k*(long)x[i-1]) >> LOG_BASE;
+  } else m1 = m2 = 0;
+  long x0 = (long)k*(long)x[i];
+  long x1 = (long)k*(long)x[i+1];
+  z[i]   = (int)( ( x0 & (long)BASE_MINUS1 )
+                + ( m1 & (long)BASE_MINUS1 )
+                + ( m2 & (long)BASE_MINUS1 ) );
+  x0 >>= LOG_BASE;
+  m1 >>= LOG_BASE;
+  z[i+1] = (int)( ( x1 & (long)BASE_MINUS1 )
+                + ( x0 & (long)BASE_MINUS1 )
+                + ( m1 & (long)BASE_MINUS1 ) );
+}
+
+__kernel void mul1_v4a(int k,__global const int * x,__global int * z0,__global int * z1,__global int * z2)
+{
+  int i = get_global_id(0);
+  long u = (long)k*(long)x[i];
+  z0[i] = (int) (u & (long)BASE_MINUS1);
+  u >>= LOG_BASE;
+  z1[i] = (int) (u & (long)BASE_MINUS1);
+  u >>= LOG_BASE;
+  z2[i] = (int)u;
+}
+
+__kernel void mul1_v4b(__global const int * z0,__global const int * z1,__global const int * z2,__global int * z)
+{
+  int i = get_global_id(0);
+  int s = z0[i];
+  if (i>0) s += z1[i-1];
+  if (i>1) s += z2[i-2];
+  z[i] = s;
+}
diff --git a/MPBenchmarks/gpu_zero.cl b/MPBenchmarks/gpu_zero.cl
new file mode 100644
index 0000000..0f276e0
--- /dev/null
+++ b/MPBenchmarks/gpu_zero.cl
@@ -0,0 +1,18 @@
+// GPU memory 0
+// (c) EB Sep 2009
+
+#if (WORD_SIZE == 64)
+typedef long Word;
+#elif (WORD_SIZE == 32)
+typedef int Word;
+#elif (WORD_SIZE == 16)
+typedef short Word;
+#elif (WORD_SIZE == 128)
+typedef float4 Word;
+#endif
+
+__kernel void zero(__global Word * a)
+{
+  int i = get_global_id(0);
+  a[i] = 0.0f;
+}
diff --git a/MPBenchmarks/t1.cpp b/MPBenchmarks/t1.cpp
new file mode 100644
index 0000000..2398a1b
--- /dev/null
+++ b/MPBenchmarks/t1.cpp
@@ -0,0 +1,18 @@
+#include <stdio.h>
+
+typedef long long int int64;
+
+int main()
+{
+  const int64 Blog = 8;
+  const int64 B = (int64)1<<Blog;
+
+  for (int64 m=B;m<3*B ;m++)
+    {
+      int64 x = B-1+2*(m/B);
+      if (x>m) continue;
+
+      printf("Blog=%lld  m=%llX\n",Blog,m);
+      // break;
+    }
+}
diff --git a/MPBenchmarks/vs2008/MPBenchmarks.sln b/MPBenchmarks/vs2008/MPBenchmarks.sln
new file mode 100644
index 0000000..a3d9660
--- /dev/null
+++ b/MPBenchmarks/vs2008/MPBenchmarks.sln
@@ -0,0 +1,39 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MPBenchmarks", "MPBenchmarks.vcproj", "{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+	ProjectSection(ProjectDependencies) = postProject
+		{86930221-164E-4E80-B8C0-50DCB5740B37} = {86930221-164E-4E80-B8C0-50DCB5740B37}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "..\..\BealtoOpenCL\vs2008\BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.ActiveCfg = Debug|Win32
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.Build.0 = Debug|Win32
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.ActiveCfg = Debug|x64
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.Build.0 = Debug|x64
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.ActiveCfg = Release|Win32
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.Build.0 = Release|Win32
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.ActiveCfg = Release|x64
+		{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.Build.0 = Release|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+		{86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/MPBenchmarks/vs2008/MPBenchmarks.vcproj b/MPBenchmarks/vs2008/MPBenchmarks.vcproj
new file mode 100644
index 0000000..5fe65c4
--- /dev/null
+++ b/MPBenchmarks/vs2008/MPBenchmarks.vcproj
@@ -0,0 +1,428 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9,00"
+	Name="MPBenchmarks"
+	ProjectGUID="{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+	RootNamespace="TestsOpenCL"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+		<Platform
+			Name="x64"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+			IntermediateDirectory="$(ConfigurationName)-32"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+				LinkIncremental="2"
+				AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				Description="Installing DLL&apos;s"
+				CommandLine="copy &quot;$(QTDIR_BASE)-32\bin\QtCored4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-32\bin\QtGuid4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll&quot; $(OutDir) ;&#x0D;&#x0A;"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Debug|x64"
+			OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+			IntermediateDirectory="$(ConfigurationName)-64"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+				LinkIncremental="2"
+				AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				Description="Installing Qt DLL&apos;s"
+				CommandLine="copy &quot;$(QTDIR_BASE)-64\bin\QtCored4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-64\bin\QtGuid4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+			IntermediateDirectory="$(ConfigurationName)-32"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="OpenCL.lib qtmain.lib QtCore4.lib QtGui4.lib BealtoOpenCL.lib"
+				LinkIncremental="1"
+				AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				Description="Installing DLL&apos;s"
+				CommandLine="copy &quot;$(QTDIR_BASE)-32\bin\QtCore4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-32\bin\QtGui4.dll&quot; $(OutDir) ;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll&quot; $(OutDir) ;&#x0D;&#x0A;"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|x64"
+			OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+			IntermediateDirectory="$(ConfigurationName)-64"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+				TargetEnvironment="3"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="OpenCL.lib  QtCore4.lib QtGui4.lib qtmain.lib BealtoOpenCL.lib"
+				LinkIncremental="1"
+				AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="17"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+				Description="Installing Qt DLL&apos;s"
+				CommandLine="copy &quot;$(QTDIR_BASE)-64\bin\QtCore4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(QTDIR_BASE)-64\bin\QtGui4.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;copy &quot;$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll&quot; &quot;$(OutDir)&quot;;&#x0D;&#x0A;"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="Source Files"
+			Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+			UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+			>
+			<File
+				RelativePath="..\appMain.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\ArithmeticTasks.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\GPUTask.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\MemoryTasks.cpp"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Header Files"
+			Filter="h;hpp;hxx;hm;inl;inc;xsd"
+			UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+			>
+			<File
+				RelativePath="..\ArithmeticTasks.h"
+				>
+			</File>
+			<File
+				RelativePath="..\Config.h"
+				>
+			</File>
+			<File
+				RelativePath="..\Conversions.h"
+				>
+			</File>
+			<File
+				RelativePath="..\CPUTask.h"
+				>
+			</File>
+			<File
+				RelativePath="..\GPUTask.h"
+				>
+			</File>
+			<File
+				RelativePath="..\MemoryTasks.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="OpenCL Files"
+			>
+			<File
+				RelativePath="..\gpu_add.cl"
+				>
+			</File>
+			<File
+				RelativePath="..\gpu_mp.cl"
+				>
+			</File>
+			<File
+				RelativePath="..\gpu_mul1.cl"
+				>
+			</File>
+			<File
+				RelativePath="..\gpu_zero.cl"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="Data Files"
+			>
+			<File
+				RelativePath="..\LICENSE.txt"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
author	Zhenyu Wang <zhenyuw@linux.intel.com>	2014-10-16 14:10:11 +0800
committer	Zhenyu Wang <zhenyuw@linux.intel.com>	2014-10-16 14:10:11 +0800
commit	bf8d34506d36df0c2028b97c25abd61c8659d326 (patch)
tree	bf78890927d593c29144d6cf078a3828e286cd29