diff options
author | Zhenyu Wang <zhenyuw@linux.intel.com> | 2014-10-16 14:10:11 +0800 |
---|---|---|
committer | Zhenyu Wang <zhenyuw@linux.intel.com> | 2014-10-16 14:10:11 +0800 |
commit | bf8d34506d36df0c2028b97c25abd61c8659d326 (patch) | |
tree | bf78890927d593c29144d6cf078a3828e286cd29 |
Import MPBenchmark 20091214
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
37 files changed, 4277 insertions, 0 deletions
diff --git a/BealtoOpenCL/LICENSE.txt b/BealtoOpenCL/LICENSE.txt new file mode 100644 index 0000000..f5fe116 --- /dev/null +++ b/BealtoOpenCL/LICENSE.txt @@ -0,0 +1,27 @@ +This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Eric Bainville nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/BealtoOpenCL/Makefile b/BealtoOpenCL/Makefile new file mode 100644 index 0000000..f8e725f --- /dev/null +++ b/BealtoOpenCL/Makefile @@ -0,0 +1,68 @@ +# OpenCL multiprecision CPU+GPU benchmarks +# EB Nov 2009 + +# archive data +CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M) +CURRENT_DIR := $(notdir $(shell /bin/pwd)) +CURRENT_MACHINE := $(shell uname -m) + +# output directories +OBJDIR := objs +DEPDIR := deps +ifeq ($(CURRENT_MACHINE),x86_64) +LIBDIR := lib-64 +else +LIBDIR := lib-32 +endif + +# targets +OBJS = $(patsubst %,$(OBJDIR)/%.o, CLContext CLError CLProgram ) +DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS)) + +LIB = $(LIBDIR)/libBealtoOpenCL.a + +VPATH = src + +# OpenCL SDK directory +# OPENCL_DIR = /opt/cuda/include +OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64 + +# flags +CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I./include -I$(OPENCL_DIR)/include +LDFLAGS = -L$(OPENCL_DIR)/lib/x86_64 + +lib: $(LIB) + +$(LIB): $(OBJS) + @[ -d $(LIBDIR) ] || mkdir -p $(LIBDIR) + ar rvs $(LIB) $(OBJS) + +dos2unix: + dos2unix Makefile src/*.cpp include/*.h + +clean: + /bin/rm -f *.o *~ $(LIB) + /bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR) lib-32 lib-64 + +archive: clean + @echo "ARCHIVE $(CURRENT_DATE)" + tar czf "../BealtoOpenCL-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR) + +##### Dependencies +$(DEPDIR)/%.d: %.cpp + @[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR) + @/bin/echo -e "DEPS \033[32m$*\033[0m" + @$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $< + +##### Compilation +$(OBJDIR)/%.o: %.cpp + @[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR) + @/bin/echo -e "C++ \033[34m$*\033[0m" + @$(CXX) $(CXXFLAGS) -c -o $@ $< + +##### Qt +moc_%.cpp: %.h + @/bin/echo -e "MOC \033[34m$*\033[0m" + @moc -o $@ $< + +-include $(DEPS) diff --git a/BealtoOpenCL/include/BealtoOpenCL.h b/BealtoOpenCL/include/BealtoOpenCL.h new file mode 100644 index 0000000..b35e9f1 --- /dev/null +++ b/BealtoOpenCL/include/BealtoOpenCL.h @@ -0,0 +1,13 @@ +// Main CL C++ bindings include file +// (c) EB Oct 2009 + +#pragma once + +#include "CLError.h" +#include "CLContext.h" +#include "CLCommandQueue.h" +#include "CLBuffer.h" +#include "CLEvent.h" +#include "CLEventList.h" +#include "CLProgram.h" +#include "CLKernel.h" diff --git a/BealtoOpenCL/include/CLBuffer.h b/BealtoOpenCL/include/CLBuffer.h new file mode 100644 index 0000000..be0f640 --- /dev/null +++ b/BealtoOpenCL/include/CLBuffer.h @@ -0,0 +1,34 @@ +// OpenCL buffer object +// (c) EB Sep 2009 + +#ifndef CLBuffer_h +#define CLBuffer_h + +#include <CL/cl.h> +#include "CLMemoryObject.h" +#include "CLError.h" + +namespace cl { + +class Buffer : public MemoryObject +{ +public: + + // Instances of this class are created from a Context. + + // Destructor + virtual ~Buffer() { } + +private: + + Buffer(); // not implemented + Buffer(cl_mem x) : MemoryObject(x) { } + + friend class Context; + friend class CommandQueue; + friend class Kernel; +}; + +} // namespace + +#endif // CLBuffer_h diff --git a/BealtoOpenCL/include/CLCommandQueue.h b/BealtoOpenCL/include/CLCommandQueue.h new file mode 100644 index 0000000..6fde44c --- /dev/null +++ b/BealtoOpenCL/include/CLCommandQueue.h @@ -0,0 +1,229 @@ +// OpenCL command queue object +// (c) EB Sep 2009 + +#ifndef CLCommandQueue_h +#define CLCommandQueue_h + +#include <CL/cl.h> +#include "CLBuffer.h" +#include "CLEvent.h" +#include "CLEventList.h" +#include "CLKernel.h" + +namespace cl { + +class CommandQueue +{ +public: + + // Instances of this class are created from a Context. + + // Copy constructor + CommandQueue(const CommandQueue & a) : mX(a.mX) + { clRetainCommandQueue(mX); } + + // = operator + CommandQueue & operator = (const CommandQueue & a) + { + if (a.mX != mX) + { + clReleaseCommandQueue(mX); + mX = a.mX; + clRetainCommandQueue(mX); + } + return *this; + } + + // Destructor + virtual ~CommandQueue() + { clReleaseCommandQueue(mX); } + + // Commands + // All commands return an Event instance. + // If the call fails, the returned event is invalid. + + // Read buffer to host memory + Event readBuffer(Buffer * b,cl_bool blocking_read,size_t offset,size_t cb,void * ptr,const EventList & wait_list = EventList()) + { + if (b == 0) return Event(0); // Invalid + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + cl_int status = clEnqueueReadBuffer(mX,b->mX,blocking_read,offset,cb,ptr,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Write buffer from host memory + Event writeBuffer(Buffer * b,cl_bool blocking_write,size_t offset,size_t cb,const void * ptr,const EventList & wait_list = EventList()) + { + if (b == 0) return Event(0); // Invalid + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + cl_int status = clEnqueueWriteBuffer(mX,b->mX,blocking_write,offset,cb,ptr,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Copy buffers + Event copyBuffer(Buffer * src,Buffer * dst,size_t src_offset,size_t dst_offset,size_t cb,const EventList & wait_list = EventList()) + { + if (src == 0 || dst == 0) return Event(0); // Invalid + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + cl_int status = clEnqueueCopyBuffer(mX,src->mX,dst->mX,src_offset,dst_offset,cb,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Map buffer. The mapped address is put in ADDRESS. + template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,size_t offset,size_t cb,const EventList & wait_list = EventList()) + { + if (b == 0) return Event(0); + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + cl_int status; + address = (T *)clEnqueueMapBuffer(mX,b->mX,blocking_map,map_flags,offset,cb,num_events,events,&e,&status); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Map full buffer. Queries its size, and maps all its contents. The mapped address is put in ADDRESS. + template <typename T> Event mapBuffer(Buffer * b,T * & address,cl_bool blocking_map,cl_map_flags map_flags,const EventList & wait_list = EventList()) + { + if (b == 0) return Event(0); // Invalid buffer + size_t sz = b->getSize(); + if (sz == 0) return Event(0); // Invalid size + return mapBuffer<T>(b,address,blocking_map,map_flags,0,sz,wait_list); + } + + // Unmap memory object. (Buffer, Image2D, etc.) + Event unmapMemoryObject(MemoryObject * m,void * address,const EventList & wait_list = EventList()) + { + if (m == 0) return Event(0); + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + cl_int status = clEnqueueUnmapMemObject(mX,m->mX,address,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Enqueue 1D kernel execution + // K is the kernel to run. + // N is the total number of work items (global work size). + // G is the number of work items inside a work group. G must divide N. + // G can be 0, in which case the OpenCL implementation will choose the best value. + Event execKernel1(Kernel * k,size_t n,size_t g,const EventList & wait_list = EventList()) + { + if (k == 0) return Event(0); // Invalid + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + size_t * pgw = &n; + size_t * plw = (g>0)?(&g):0; + cl_int status = clEnqueueNDRangeKernel(mX,k->mX,1,0,pgw,plw,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Enqueue 2D kernel execution + // K is the kernel to run. + // NX*NY is the total number of work items (global work size). + // GX*GY is the number of work items inside a work group. G<d> must divide N<d>. + Event execKernel2(Kernel * k,size_t nx,size_t ny,size_t gx,size_t gy,const EventList & wait_list = EventList()) + { + if (k == 0) return Event(0); // Invalid + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + cl_event e = 0; + size_t pgw[2]; pgw[0] = nx; pgw[1] = ny; + size_t plw[2]; plw[0] = gx; plw[1] = gy; + cl_int status = clEnqueueNDRangeKernel(mX,k->mX,2,0,pgw,plw,num_events,events,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // + // Execution control + // + + // Enqueue a marker + Event mark() + { + cl_event e = 0; + cl_int status = clEnqueueMarker(mX,&e); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) e = 0; + return Event(e); + } + + // Insert a wait point for a specific list of events + bool wait(const EventList & wait_list) + { + cl_uint num_events = 0; + const cl_event * events = 0; + wait_list.getParams(num_events,events); + if (num_events == 0) return true; // Nop + cl_int status = clEnqueueWaitForEvents(mX,num_events,events); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + + // Insert a wait point for all events. (barrier) + bool wait() + { + cl_int status = clEnqueueBarrier(mX); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + + // Blocks until all queued commands have been submitted to the device. + bool flush() + { + cl_int status = clFlush(mX); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + + // Blocks until all queued commands have been executed. + bool finish() + { + cl_int status = clFinish(mX); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + +private: + + // Private constructors + CommandQueue(); // not implemented + CommandQueue(cl_command_queue x) : mX(x) { } + + // OpenCL handle (always non 0) + cl_command_queue mX; + + friend class Context; + +}; // class CommandQueue + +} // namespace + +#endif // CLCommandQueue_h diff --git a/BealtoOpenCL/include/CLContext.h b/BealtoOpenCL/include/CLContext.h new file mode 100644 index 0000000..2f8fcd4 --- /dev/null +++ b/BealtoOpenCL/include/CLContext.h @@ -0,0 +1,149 @@ +// OpenCL context +// (c) EB Sep 2009 + +#ifndef CLContext_h +#define CLContext_h + +#include <CL/cl.h> +#include <vector> +#include <string> + +namespace cl { + +class Buffer; +class Image2D; +class CommandQueue; +class Program; + +class Context +{ +public: + + // Copy constructor + Context(const Context & a); + + // = operator + Context & operator = (const Context & a); + + // Destructor + virtual ~Context(); + + // Create default context (including all devices with the given type). + // Return a new instance if OK, and 0 otherwise. + static Context * create(cl_device_type deviceType = CL_DEVICE_TYPE_GPU); + + // Get number of associated devices + int getNDevices() const; + + // Create a new command queue object. + // D is the index of the device ine the context (0..NDevices-1) + // See clCreateCommandQueue for the arguments. + // Return a new instance if OK, and 0 otherwise. + CommandQueue * createCommandQueue(int d,cl_command_queue_properties properties); + + // Create a new buffer object. + // See clCreateBuffer for the arguments. + // Return a new instance if OK, and 0 otherwise. + Buffer * createBuffer(cl_mem_flags flags,size_t size,void * host_ptr = 0); + + // Create a new image2D object. + // See clCreateImage2D for the arguments. + // Return a new instance if OK, and 0 otherwise. + Image2D * createImage2D(cl_mem_flags flags,const cl_image_format * image_format, + size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0); + + // Create a R / RG / RGBA image2D object. + // channel_type is passed in a cl_image_format structure. + Image2D * createRImage2D(cl_mem_flags flags,cl_channel_type data_type, + size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0) + { + cl_image_format f; + f.image_channel_order = CL_R; + f.image_channel_data_type = data_type; + return createImage2D(flags,&f,width,height,pitch,host_ptr); + } + Image2D * createRGImage2D(cl_mem_flags flags,cl_channel_type data_type, + size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0) + { + cl_image_format f; + f.image_channel_order = CL_RG; + f.image_channel_data_type = data_type; + return createImage2D(flags,&f,width,height,pitch,host_ptr); + } + Image2D * createRGBAImage2D(cl_mem_flags flags,cl_channel_type data_type, + size_t width,size_t height,size_t pitch = 0,void * host_ptr = 0) + { + cl_image_format f; + f.image_channel_order = CL_RGBA; + f.image_channel_data_type = data_type; + return createImage2D(flags,&f,width,height,pitch,host_ptr); + } + + // Create a new program from source strings. + // See clCreateProgramWithSource for the arguments. + // Return a new instance if OK, and 0 otherwise. + Program * createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths = 0); + + // Create a new program from source files. + // See clCreateProgramWithSource for the arguments. + // Return a new instance if OK, and 0 otherwise. + Program * createProgramWithFiles(cl_uint count,const char ** filenames); + Program * createProgramWithFile(const char * filename) + { return createProgramWithFiles(1,&filename); } + Program * createProgramWithFiles(const std::vector<std::string> & filenames); + + // Get device info for device D (0..NDevices-1). + // See clGetDeviceInfo for the arguments. + bool getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret); + + // Template version suitable for scalar types + template <class T> bool getDeviceInfo(int d,cl_device_info param_name,T & x) + { return getDeviceInfo(d,param_name,sizeof(T),&x,0); } + + // Get all info for device D. + // INFO receives the result (one attribute per line), cleared by the call. + bool getAllDeviceInfo(int d,std::string & info); + + // Specific info entries. + // D is the device index in context, 0..NDevices-1. + size_t getDeviceMaxMemAllocSize(int d = 0) + { + cl_ulong x; + if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,x)) return 0; // Failed + return (size_t)x; + } + bool getDeviceImageSupport(int d = 0) + { + cl_bool x; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,x)) return false; // Failed + return (x)?true:false; + } + size_t getDeviceImage2DMaxWidth(int d = 0) + { + size_t x; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,x)) return 0; // Failed + return x; + } + size_t getDeviceImage2DMaxHeight(int d = 0) + { + size_t x; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,x)) return 0; // Failed + return x; + } + +private: + + // This constructor is private, use create() to instanciate this class + Context(); + + // OpenCL handle (always non 0) + cl_context mX; + + // Devices associated with this context + std::vector<cl_device_id> mDevices; + +}; // class Context + +} // namespace + +#endif // CLContext_h diff --git a/BealtoOpenCL/include/CLError.h b/BealtoOpenCL/include/CLError.h new file mode 100644 index 0000000..d929695 --- /dev/null +++ b/BealtoOpenCL/include/CLError.h @@ -0,0 +1,19 @@ +// OpenCL error reporting +// (c) EB Sep 2009 + +#ifndef CLError_h +#define CLError_h + +namespace cl { + +// Report an OpenCL call status. +// FILE,LINE,FUNCTION shall be the source file/line/name of the function where the call failed. +// STATUS is the returned value (CL_SUCCESS,...). +void reportStatus(const char * file,int line,const char * function,int status); + +// Use this macro to report errors. +#define REPORT_OPENCL_STATUS(status) { cl::reportStatus(__FILE__,__LINE__,__FUNCTION__,status); } + +} // namespace + +#endif // CLError_h diff --git a/BealtoOpenCL/include/CLEvent.h b/BealtoOpenCL/include/CLEvent.h new file mode 100644 index 0000000..b7ac9b4 --- /dev/null +++ b/BealtoOpenCL/include/CLEvent.h @@ -0,0 +1,67 @@ +// OpenCL event object +// (c) EB Sep 2009 + +#ifndef CLEvent_h +#define CLEvent_h + +#include <CL/cl.h> +#include "CLError.h" + +namespace cl { + +class Event +{ +public: + + // Instances of this class are created by calls in a CommandQueue. + + // Copy constructor + Event(const Event & a) : mX(a.mX) + { if (mX != 0) clRetainEvent(mX); } + + // = operator + Event & operator = (const Event & a) + { + if (a.mX != mX) + { + if (mX != 0) clReleaseEvent(mX); + mX = a.mX; + if (mX != 0) clRetainEvent(mX); + } + return *this; + } + + // Destructor + virtual ~Event() + { if (mX != 0) clReleaseEvent(mX); } + + // Check if the event is valid + bool isValid() const { return (mX != 0); } + + // Wait for this event to complete. + // Return TRUE if OK, and FALSE otherwise. + // If the event is invalid, return TRUE. + bool wait() + { + if (mX == 0) return true; // OK, event is invalid + cl_int status = clWaitForEvents(1,&mX); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + +private: + + Event(); // not implemented + Event(cl_event x) : mX(x) { } + + // OpenCL handle (MAY BE 0) + cl_event mX; + + friend class CommandQueue; + friend class EventList; + +}; // class Event + +} // namespace + +#endif // CLEvent_h diff --git a/BealtoOpenCL/include/CLEventList.h b/BealtoOpenCL/include/CLEventList.h new file mode 100644 index 0000000..a841d15 --- /dev/null +++ b/BealtoOpenCL/include/CLEventList.h @@ -0,0 +1,96 @@ +// OpenCL event list +// (c) EB Sep 2009 + +#ifndef CLEventList_h +#define CLEventList_h + +#include <CL/cl.h> +#include <vector> +#include "CLEvent.h" + +namespace cl { + +class EventList +{ +public: + + // Constructor from a list of events + EventList() { } + EventList(const Event & e1) { insert(e1); } + EventList(const Event & e1,const Event & e2) { insert(e1); insert(e2); } + EventList(const Event & e1,const Event & e2,const Event & e3) { insert(e1); insert(e2); insert(e3); } + + // Copy constructor + EventList(const EventList & e) + { + for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it); + } + + // = operator + EventList & operator = (const EventList & e) + { + if (&e == this) return *this; + clear(); + for (event_list_t::const_iterator it = e.mX.begin(); it != e.mX.end(); it++) insert(*it); + return *this; + } + + // Destructor + ~EventList() { clear(); } + + // Clear the list + void clear() + { + for (event_list_t::iterator it = mX.begin(); it != mX.end(); it++) clReleaseEvent(*it); + mX.clear(); + } + + // Insert an event to the list. Ignore if E is invalid. + void insert(const Event & e) + { + cl_event x = e.mX; + if (x == 0) return; + clRetainEvent(x); + mX.push_back(x); + } + + // Get the number of events in the list + int size() const { return (int)mX.size(); } + + // Is the list empty? + bool empty() const { return mX.empty(); } + + // Wait for all the events to complete. + // Return TRUE if OK, and FALSE otherwise. + // If the list is empty, return TRUE. + bool wait() + { + if (mX.empty()) return true; // OK, empty + cl_uint num_events; + const cl_event * events; + getParams(num_events,events); + cl_int status = clWaitForEvents(num_events,events); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + + // Get CL parameters for the calls + void getParams(cl_uint & num_events,const cl_event * & events) const + { + num_events = (cl_uint)mX.size(); + if (num_events == 0) events = 0; + else events = &(mX[0]); + } + +private: + + typedef std::vector<cl_event> event_list_t; + + // OpenCL handles + event_list_t mX; + +}; // class EventList + +} // namespace + +#endif // CLEventList_h diff --git a/BealtoOpenCL/include/CLImage2D.h b/BealtoOpenCL/include/CLImage2D.h new file mode 100644 index 0000000..baf996f --- /dev/null +++ b/BealtoOpenCL/include/CLImage2D.h @@ -0,0 +1,35 @@ +// OpenCL image2D object +// (c) EB Nov 2009 + +#ifndef CLImage2D_h +#define CLImage2D_h + +#include <CL/cl.h> +#include "CLError.h" +#include "CLMemoryObject.h" + +namespace cl { + + class Image2D : public MemoryObject +{ +public: + + // Instances of this class are created from a Context. + + // Destructor + virtual ~Image2D() { } + +private: + + Image2D(); // not implemented + Image2D(cl_mem x) : MemoryObject(x) { } + + friend class Context; + friend class CommandQueue; + friend class Kernel; + +}; // class Image2D + +} // namespace + +#endif // CLImage2D_h diff --git a/BealtoOpenCL/include/CLKernel.h b/BealtoOpenCL/include/CLKernel.h new file mode 100644 index 0000000..7ded751 --- /dev/null +++ b/BealtoOpenCL/include/CLKernel.h @@ -0,0 +1,78 @@ +// OpenCL kernel object +// (c) EB Sep 2009 + +#ifndef CLKernel_h +#define CLKernel_h + +#include <CL/cl.h> +#include "CLError.h" +#include "CLBuffer.h" + +namespace cl { + +class Kernel +{ +public: + + // Instances of this class are created from a Program. + + // Copy constructor + Kernel(const Kernel & a) : mX(a.mX) + { clRetainKernel(mX); } + + // = operator + Kernel & operator = (const Kernel & a) + { + if (a.mX != mX) + { + clReleaseKernel(mX); + mX = a.mX; + clRetainKernel(mX); + } + return *this; + } + + // Destructor + virtual ~Kernel() + { clReleaseKernel(mX); } + + // Set kernel argument + // See clSetKernelArg for arguments. + // Return TRUE if OK, and FALSE otherwise. + bool setArg(cl_uint arg_index,size_t arg_size,const void * arg_value) + { + cl_int status = clSetKernelArg(mX,arg_index,arg_size,arg_value); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + // Set kernel argument (Buffer) + // Return TRUE if OK, and FALSE otherwise. + bool setArg(cl_uint arg_index,Buffer * b) + { + if (b == 0) return false; // Invalid + cl_mem m = b->mX; + return setArg(arg_index,sizeof(m),(const void *)&m); + } + // Set kernel argument (other types) + // Return TRUE if OK, and FALSE otherwise. + bool setArg(cl_int arg_index,cl_int x) + { return setArg(arg_index,sizeof(x),(const void *)&x); } + bool setArg(cl_int arg_index,cl_float x) + { return setArg(arg_index,sizeof(x),(const void *)&x); } + +private: + + Kernel(); // not implemented + Kernel(cl_kernel x) : mX(x) { } + + // OpenCL handle (always non 0) + cl_kernel mX; + + friend class Program; + friend class CommandQueue; + +}; // class Kernel + +} // namespace + +#endif // CLKernel_h diff --git a/BealtoOpenCL/include/CLMemoryObject.h b/BealtoOpenCL/include/CLMemoryObject.h new file mode 100644 index 0000000..433bbdc --- /dev/null +++ b/BealtoOpenCL/include/CLMemoryObject.h @@ -0,0 +1,77 @@ +// OpenCL memory object +// (c) EB Dec 2009 + +#ifndef CLMemoryObject_h +#define CLMemoryObject_h + +#include <CL/cl.h> +#include "CLError.h" + +namespace cl { + +class MemoryObject +{ +public: + + // Instances of this class are created from a Context. + + // Copy constructor + MemoryObject(const MemoryObject & a) : mX(a.mX) + { clRetainMemObject(mX); } + + // = operator + MemoryObject & operator = (const MemoryObject & a) + { + if (a.mX != mX) + { + clReleaseMemObject(mX); + mX = a.mX; + clRetainMemObject(mX); + } + return *this; + } + + // Destructor + virtual ~MemoryObject() + { clReleaseMemObject(mX); } + + // Get memory object info. + // See clGetMemObjectInfo for the arguments. + bool getMemObjectInfo(cl_mem_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret) + { + cl_int status = clGetMemObjectInfo(mX,param_name,param_value_size,param_value,param_value_size_ret); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); + } + + // Template version suitable for scalar types + template <class T> bool getMemObjectInfo(cl_mem_info param_name,T & x) + { return getMemObjectInfo(param_name,sizeof(T),&x,0); } + + // Specific info entries + size_t getSize() + { + size_t x; + bool ok = getMemObjectInfo(CL_MEM_SIZE,x); + return (ok)?x:0; + } + +private: + + MemoryObject(); // not implemented + MemoryObject(cl_mem x) : mX(x) { } + + // OpenCL handle (always non 0) + cl_mem mX; + + friend class Context; + friend class CommandQueue; + friend class Kernel; + friend class Buffer; + friend class Image2D; + +}; // class MemoryObject + +} // namespace + +#endif // CLMemoryObject_h diff --git a/BealtoOpenCL/include/CLProgram.h b/BealtoOpenCL/include/CLProgram.h new file mode 100644 index 0000000..07a85a7 --- /dev/null +++ b/BealtoOpenCL/include/CLProgram.h @@ -0,0 +1,74 @@ +// OpenCL program object +// (c) EB Sep 2009 + +#ifndef CLProgram_h +#define CLProgram_h + +#include <CL/cl.h> +#include <string> +#include <vector> +#include "CLError.h" +#include "CLKernel.h" + +namespace cl { + +class Program +{ +public: + + // Instances of this class are created from a Context. + + // Copy constructor + Program(const Program & a) : mX(a.mX) + { clRetainProgram(mX); } + + // = operator + Program & operator = (const Program & a) + { + if (a.mX != mX) + { + clReleaseProgram(mX); + mX = a.mX; + clRetainProgram(mX); + } + return *this; + } + + // Destructor + virtual ~Program() + { clReleaseProgram(mX); } + + // Build the program (for all devices, and blocking) + // See clBuildProgram for the arguments. + // Return TRUE if OK, and FALSE otherwise. + bool build(const char * options,std::string & buildErrors); + + // Get program binary for first device + bool getBinary(std::vector<unsigned char> & binary); + + // Get a kernel for this program + // Return a new instance if OK, and 0 otherwise. + Kernel * createKernel(const char * kernel_name) + { + cl_int status; + cl_kernel k = clCreateKernel(mX,kernel_name,&status); + REPORT_OPENCL_STATUS(status); + if (k == 0) return 0; // Failed + return new Kernel(k); + } + +private: + + Program(); // not implemented + Program(cl_program x) : mX(x) { } + + // OpenCL handle (always non 0) + cl_program mX; + + friend class Context; + +}; // class Buffer + +} // namespace + +#endif // CLProgram_h diff --git a/BealtoOpenCL/src/CLContext.cpp b/BealtoOpenCL/src/CLContext.cpp new file mode 100644 index 0000000..7ca7848 --- /dev/null +++ b/BealtoOpenCL/src/CLContext.cpp @@ -0,0 +1,351 @@ +// OpenCL context +// (c) EB Sep 2009 + +#define _CRT_SECURE_NO_WARNINGS +#include <sys/stat.h> +#include <vector> +#include <sstream> +#include <stdlib.h> +#include "CLError.h" +#include "CLContext.h" +#include "CLCommandQueue.h" +#include "CLBuffer.h" +#include "CLImage2D.h" +#include "CLProgram.h" + +#ifdef Linux +#define _stat stat +#endif + +using namespace cl; + +Context::Context() : mX(0) +{ +} + +Context::Context(const Context & a) : mX(a.mX) +{ + clRetainContext(mX); +} + +Context & Context::operator = (const Context & a) +{ + if (a.mX != mX) + { + clReleaseContext(mX); + mX = a.mX; + clRetainContext(mX); + } + return *this; +} + +Context::~Context() +{ + clReleaseContext(mX); +} + +Context * Context::create(cl_device_type deviceType) +{ + cl_platform_id bestPlatform = 0; + std::vector<cl_device_id> devices; + int status; + + // Platforms + cl_uint nPlatforms = 0; + std::vector<cl_platform_id> pIDs; + status = clGetPlatformIDs(0,0,&nPlatforms); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS || nPlatforms == 0) return 0; // Failed + pIDs.resize(nPlatforms,0); + status = clGetPlatformIDs(nPlatforms,&(pIDs[0]),&nPlatforms); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) return 0; // Failed + + // printf("NPlatforms: %u\n",nPlatforms); + + // Devices for each platform + for (unsigned int i=0;i<nPlatforms;i++) + { + cl_platform_id p = pIDs[i]; + + // Get all GPU devices for this platform + cl_uint nDevices = 0; + status = clGetDeviceIDs(p,deviceType,0,0,&nDevices); + // printf("NDevices: %u\n",nDevices); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS || nDevices == 0) continue; + devices.resize(nDevices,0); + status = clGetDeviceIDs(p,deviceType,nDevices,&(devices[0]),&nDevices); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) { devices.clear(); continue; } + + bestPlatform = p; + break; // OK + } + if (devices.empty()) + { + printf("No device found\n"); + return 0; + } + + cl_context_properties cp[4]; + cp[0] = CL_CONTEXT_PLATFORM; + cp[1] = (cl_context_properties)bestPlatform; + cp[2] = cp[3] = 0; + cl_context c = clCreateContext(cp,(int)devices.size(),&(devices[0]),0,0,0); + if (c == 0) return 0; + + Context * result = new Context(); + result->mX = c; + result->mDevices = devices; + + return result; +} + +int Context::getNDevices() const +{ + return (int)mDevices.size(); +} + +CommandQueue * Context::createCommandQueue(int d,cl_command_queue_properties properties) +{ + cl_int status; + if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D + cl_command_queue q = clCreateCommandQueue(mX,mDevices[d],properties,&status); + REPORT_OPENCL_STATUS(status); + if (q == 0) return 0; // Failed + return new CommandQueue(q); +} + +Buffer * Context::createBuffer(cl_mem_flags flags,size_t size,void * host_ptr) +{ + cl_int status; + cl_mem m = clCreateBuffer(mX,flags,size,host_ptr,&status); + REPORT_OPENCL_STATUS(status); + if (m == 0) return 0; // Failed + return new Buffer(m); +} + +Image2D * Context::createImage2D(cl_mem_flags flags,const cl_image_format * image_format, + size_t width,size_t height,size_t pitch,void * host_ptr) +{ + cl_int status; + cl_mem m = clCreateImage2D(mX,flags,image_format,width,height,pitch,host_ptr,&status); + REPORT_OPENCL_STATUS(status); + if (m == 0) return 0; // Failed + return new Image2D(m); +} + +Program * Context::createProgramWithSource(cl_uint count,const char ** strings,const size_t * lengths) +{ + cl_int status; + cl_program p = clCreateProgramWithSource(mX,count,strings,lengths,&status); + REPORT_OPENCL_STATUS(status); + if (p == 0) return 0; // Failed + return new Program(p); +} + +Program * Context::createProgramWithFiles(cl_uint count,const char ** filenames) +{ + if (count == 0 || filenames == 0) return 0; // Invalid + + std::vector<char *> strings; + std::vector<size_t> lengths; + strings.resize(count,0); + lengths.resize(count,0); + bool ok = true; + for (unsigned int i=0;i<count;i++) + { + // Get file size and check it exists + struct _stat s; + int status = ::_stat(filenames[i],&s); + if (status != 0) { ok = false; break; } + size_t sz = s.st_size; + if (sz == 0) { ok = false; break; } + + // Read the file + FILE * f = fopen(filenames[i],"rb"); + if (f == 0) { ok = false; break; } + char * buffer = (char *)malloc(sz); + strings[i] = buffer; + lengths[i] = sz; + + size_t readBlocks = fread(buffer,sz,1,f); + fclose(f); + if (readBlocks != 1) { ok = false; break; } // read failed + } + Program * result = 0; + if (ok) result = createProgramWithSource(count,(const char **)&(strings[0]),&(lengths[0])); + + // Cleanup + for (unsigned int i=0;i<count;i++) if (strings[i] != 0) + { free(strings[i]); } + + return result; +} + +Program * Context::createProgramWithFiles(const std::vector<std::string> & filenames) +{ + cl_uint n = (int)filenames.size(); + if (n == 0) return 0; // Empty + std::vector<const char *> files(n,0); + for (cl_uint i=0;i<n;i++) + { + if (filenames[i].empty()) continue; // 0 if empty + files[i] = filenames[i].c_str(); + } + return createProgramWithFiles(n,&(files[0])); +} + +bool Context::getDeviceInfo(int d,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret) +{ + if (d<0 || d>=(int)mDevices.size()) return 0; // Invalid D + cl_int status = clGetDeviceInfo(mDevices[d],param_name,param_value_size,param_value,param_value_size_ret); + REPORT_OPENCL_STATUS(status); + return (status == CL_SUCCESS); +} + +bool Context::getAllDeviceInfo(int d,std::string & info) +{ + cl_uint ui; + cl_ulong ul; + cl_bool b; + info.clear(); + std::ostringstream o; + o << "Device info (ID=0x" << mDevices[d] << ")\n"; + + cl_device_type dType; + if (!getDeviceInfo(d,CL_DEVICE_TYPE,dType)) return false; + o << "- type:"; + if (dType & CL_DEVICE_TYPE_CPU) o << " cpu"; + if (dType & CL_DEVICE_TYPE_GPU) o << " gpu"; + if (dType & CL_DEVICE_TYPE_ACCELERATOR) o << " accelerator"; + o << "\n"; + + if (!getDeviceInfo(d,CL_DEVICE_VENDOR_ID,ui)) return false; + o << "- vendor id: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_COMPUTE_UNITS,ui)) return false; + o << "- max compute units: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,ui)) return false; + o << "- max work item dimensions: " << ui << "\n"; + size_t sz[24]; + if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_ITEM_SIZES,24*sizeof(size_t),sz,0)) return false; + o << "- max work item sizes:"; + for (cl_uint i=0;i<ui;i++) o << " " << sz[i]; + o << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_WORK_GROUP_SIZE,sz[0])) return false; + o << "- max work group size: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR,ui)) return false; + o << "- preferred vector width char: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT,ui)) return false; + o << "- preferred vector width short: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT,ui)) return false; + o << "- preferred vector width int: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG,ui)) return false; + o << "- preferred vector width long: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,ui)) return false; + o << "- preferred vector width float: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,ui)) return false; + o << "- preferred vector width double: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_CLOCK_FREQUENCY,ui)) return false; + o << "- max clock frequency: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_ADDRESS_BITS,ui)) return false; + o << "- address bits: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_MEM_ALLOC_SIZE,ul)) return false; + o << "- max mem alloc size: " << ul << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE_SUPPORT,b)) return false; + o << "- image support: " << (b?"yes":"no") << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_READ_IMAGE_ARGS,ui)) return false; + o << "- max read image args: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_WRITE_IMAGE_ARGS,ui)) return false; + o << "- max write image args: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_WIDTH,sz[0])) return false; + o << "- image2d max width: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE2D_MAX_HEIGHT,sz[0])) return false; + o << "- image2d max height: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_WIDTH,sz[0])) return false; + o << "- image3d max width: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_HEIGHT,sz[0])) return false; + o << "- image3d max height: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_IMAGE3D_MAX_DEPTH,sz[0])) return false; + o << "- image3d max depth: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_SAMPLERS,ui)) return false; + o << "- max samplers: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_PARAMETER_SIZE,sz[0])) return false; + o << "- max parameter size: " << sz[0] << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MEM_BASE_ADDR_ALIGN,ui)) return false; + o << "- mem base addr align: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE,ui)) return false; + o << "- min data type align size: " << ui << "\n"; + + cl_device_fp_config fpc; + if (!getDeviceInfo(d,CL_DEVICE_SINGLE_FP_CONFIG,fpc)) return false; + o << "- single fp config:"; + if (fpc & CL_FP_DENORM) o << " denorm"; + if (fpc & CL_FP_INF_NAN) o << " inf_nan"; + if (fpc & CL_FP_ROUND_TO_NEAREST) o << " round_to_nearest"; + if (fpc & CL_FP_ROUND_TO_ZERO) o << " round_to_zero"; + if (fpc & CL_FP_ROUND_TO_INF) o << " round_to_inf"; + if (fpc & CL_FP_FMA) o << " fma"; + o << "\n"; + + cl_device_mem_cache_type mct; + if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_TYPE,mct)) return false; + o << "- global mem cache type:"; + if (mct == CL_NONE) o << " none"; + else if (mct == CL_READ_ONLY_CACHE) o << " read_only_cache"; + else if (mct == CL_READ_WRITE_CACHE) o << " read_write_cache"; + o << "\n"; + + if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE,ui)) return false; + o << "- global mem cacheline size: " << ui << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,ul)) return false; + o << "- global mem cache size: " << ul << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_GLOBAL_MEM_SIZE,ul)) return false; + o << "- global mem size: " << ul << "\n"; + + if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,ul)) return false; + o << "- max constant buffer size: " << ul << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_MAX_CONSTANT_ARGS,ui)) return false; + o << "- max constant args: " << ui << "\n"; + + cl_device_local_mem_type lmt; + if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_TYPE,lmt)) return false; + o << "- local mem type:"; + if (lmt == CL_LOCAL) o << " local"; + else o << " global"; + o << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_LOCAL_MEM_SIZE,ul)) return false; + o << "- local mem size: " << ul << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_ERROR_CORRECTION_SUPPORT,b)) return false; + o << "- error correction support: " << (b?"yes":"no") << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_PROFILING_TIMER_RESOLUTION,sz[0])) return false; + o << "- profiling timer resolution: " << sz[0] << " ns\n"; + if (!getDeviceInfo(d,CL_DEVICE_ENDIAN_LITTLE,b)) return false; + o << "- endian little: " << (b?"yes":"no") << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_AVAILABLE,b)) return false; + o << "- available: " << (b?"yes":"no") << "\n"; + if (!getDeviceInfo(d,CL_DEVICE_COMPILER_AVAILABLE,b)) return false; + o << "- compiler available: " << (b?"yes":"no") << "\n"; + + cl_device_exec_capabilities ec; + if (!getDeviceInfo(d,CL_DEVICE_EXECUTION_CAPABILITIES,ec)) return false; + o << "- execution capabilities:"; + if (ec & CL_EXEC_KERNEL) o << " exec_kernel"; + if (ec & CL_EXEC_NATIVE_KERNEL) o << " exec_native_kernel"; + o << "\n"; + + cl_command_queue_properties qp; + if (!getDeviceInfo(d,CL_DEVICE_QUEUE_PROPERTIES,qp)) return false; + o << "- queue properties:"; + if (ec & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) o << " out_of_order_exec_mode"; + if (ec & CL_QUEUE_PROFILING_ENABLE) o << " profiling"; + o << "\n"; + + char exts[8192]; + if (!getDeviceInfo(d,CL_DEVICE_EXTENSIONS,8192,exts,0)) return false; + o << "- extensions: " << exts << "\n"; + + info.assign(o.str()); + return true; +} diff --git a/BealtoOpenCL/src/CLError.cpp b/BealtoOpenCL/src/CLError.cpp new file mode 100644 index 0000000..1fcd12b --- /dev/null +++ b/BealtoOpenCL/src/CLError.cpp @@ -0,0 +1,82 @@ +// OpenCL error reporting +// (c) EB Sep 2009 + +#if CONFIG_USE_QT +#include <QtCore/QDebug> +#include <QtCore/QString> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <CL/cl.h> +#include "CLError.h" + +const int NErrorCodes = 63; +static const char * ErrorCodes[NErrorCodes] = { +"CL_SUCCESS", // 0 +"CL_DEVICE_NOT_FOUND", // -1 +"CL_DEVICE_NOT_AVAILABLE", // -2 +"CL_COMPILER_NOT_AVAILABLE", // -3 +"CL_MEM_OBJECT_ALLOCATION_FAILURE", // -4 +"CL_OUT_OF_RESOURCES", // -5 +"CL_OUT_OF_HOST_MEMORY", // -6 +"CL_PROFILING_INFO_NOT_AVAILABLE", // -7 +"CL_MEM_COPY_OVERLAP", // -8 +"CL_IMAGE_FORMAT_MISMATCH", // -9 +"CL_IMAGE_FORMAT_NOT_SUPPORTED", // -10 +"CL_BUILD_PROGRAM_FAILURE", // -11 +"CL_MAP_FAILURE", // -12 +0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // -13..-29 +"CL_INVALID_VALUE", // -30 +"CL_INVALID_DEVICE_TYPE", // -31 +"CL_INVALID_PLATFORM", // -32 +"CL_INVALID_DEVICE", // -33 +"CL_INVALID_CONTEXT", // -34 +"CL_INVALID_QUEUE_PROPERTIES", // -35 +"CL_INVALID_COMMAND_QUEUE", // -36 +"CL_INVALID_HOST_PTR", // -37 +"CL_INVALID_MEM_OBJECT", // -38 +"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", // -39 +"CL_INVALID_IMAGE_SIZE", // -40 +"CL_INVALID_SAMPLER", // -41 +"CL_INVALID_BINARY", // -42 +"CL_INVALID_BUILD_OPTIONS", // -43 +"CL_INVALID_PROGRAM", // -44 +"CL_INVALID_PROGRAM_EXECUTABLE", // -45 +"CL_INVALID_KERNEL_NAME", // -46 +"CL_INVALID_KERNEL_DEFINITION", // -47 +"CL_INVALID_KERNEL", // -48 +"CL_INVALID_ARG_INDEX", // -49 +"CL_INVALID_ARG_VALUE", // -50 +"CL_INVALID_ARG_SIZE", // -51 +"CL_INVALID_KERNEL_ARGS", // -52 +"CL_INVALID_WORK_DIMENSION", // -53 +"CL_INVALID_WORK_GROUP_SIZE", // -54 +"CL_INVALID_WORK_ITEM_SIZE", // -55 +"CL_INVALID_GLOBAL_OFFSET", // -56 +"CL_INVALID_EVENT_WAIT_LIST", // -57 +"CL_INVALID_EVENT", // -58 +"CL_INVALID_OPERATION", // -59 +"CL_INVALID_GL_OBJECT", // -60 +"CL_INVALID_BUFFER_SIZE", // -61 +"CL_INVALID_MIP_LEVEL" // -62 +}; + +void cl::reportStatus(const char * file,int line,const char * function,int status) +{ + if (status >= 0) return; + status = -status; +#if CONFIG_USE_QT + QString e; + if (status >= NErrorCodes || ErrorCodes[status] == 0) e = QString("%1").arg(status); + else e = ErrorCodes[status]; + qDebug() << QString("OpenCL ERROR %1(%2),%3: %4").arg(file).arg(line).arg(function).arg(e); +#else + const char * e = 0; + if (status >= NErrorCodes || ErrorCodes[status] == 0) e = "???"; + else e = ErrorCodes[status]; + printf("Error %s(%d),%s,%s\n",file,line,function,e); +#endif +#ifdef Linux + // exit(1); +#endif +} diff --git a/BealtoOpenCL/src/CLProgram.cpp b/BealtoOpenCL/src/CLProgram.cpp new file mode 100644 index 0000000..731c385 --- /dev/null +++ b/BealtoOpenCL/src/CLProgram.cpp @@ -0,0 +1,84 @@ +// OpenCL program object +// (c) EB Sep 2009 + +#include <stdlib.h> +#include <string.h> +#include "CLProgram.h" + +using namespace cl; + +bool Program::build(const char * options,std::string & buildErrors) +{ + buildErrors.clear(); + cl_int status; + + // Get devices + cl_uint nDevices = 0; + status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS || nDevices == 0) { buildErrors.append("clGetProgramInfo NUM_DEVICES failed\n"); return false; } + std::vector<cl_device_id> devices; + devices.resize(nDevices,0); + status = clGetProgramInfo(mX,CL_PROGRAM_DEVICES,sizeof(cl_device_id)*nDevices,&(devices[0]),0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) { buildErrors.append("clGetProgramInfo DEVICES failed\n"); return false; } + + // Build program + status = clBuildProgram(mX,nDevices,&(devices[0]),options,0,0); + REPORT_OPENCL_STATUS(status); + if (status == CL_SUCCESS) return true; // OK + + // Get build info for each device + for (unsigned int i=0;i<nDevices;i++) + { + cl_device_id d = devices[i]; + char aux[4000]; + status = clGetProgramBuildInfo(mX,d,CL_PROGRAM_BUILD_LOG,4000,aux,0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) { buildErrors.append("clGetProgramBuildInfo failed\n"); continue; } + buildErrors.append(aux); + } + + buildErrors.append("Build failed\n"); + return false; +} + +// Get program binary for first device +bool Program::getBinary(std::vector<unsigned char> & binary) +{ + cl_int status; + binary.clear(); + + // Get devices + cl_uint nDevices = 0; + status = clGetProgramInfo(mX,CL_PROGRAM_NUM_DEVICES,sizeof(nDevices),&nDevices,0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS || nDevices == 0) return false; + // Get binary sizes for all devices + std::vector<size_t> binarySize(nDevices,0); + status = clGetProgramInfo(mX,CL_PROGRAM_BINARY_SIZES,nDevices*sizeof(size_t),&(binarySize[0]),0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) return false; + // Alloc buffers + std::vector<unsigned char *> buf(nDevices,0); + for (cl_uint i=0;i<nDevices;i++) + { + if (binarySize[i]>0) buf[i] = (unsigned char *)malloc(binarySize[i]); + } + status = clGetProgramInfo(mX,CL_PROGRAM_BINARIES,nDevices*sizeof(unsigned char *),&(buf[0]),0); + REPORT_OPENCL_STATUS(status); + if (status != CL_SUCCESS) return false; + // Copy result for first device + if (binarySize[0]>0) + { + binary.resize(binarySize[0],0); + memcpy(&(binary[0]),buf[0],binarySize[0]); + } + // Free buffers + for (cl_uint i=0;i<nDevices;i++) + { + if (buf[i] != 0) free(buf[i]); + } + return true; // OK +} + diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.sln b/BealtoOpenCL/vs2008/BealtoOpenCL.sln new file mode 100644 index 0000000..8ccd961 --- /dev/null +++ b/BealtoOpenCL/vs2008/BealtoOpenCL.sln @@ -0,0 +1,26 @@ +
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj new file mode 100644 index 0000000..ce9faa9 --- /dev/null +++ b/BealtoOpenCL/vs2008/BealtoOpenCL.vcproj @@ -0,0 +1,369 @@ +<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9,00"
+ Name="BealtoOpenCL"
+ ProjectGUID="{86930221-164E-4E80-B8C0-50DCB5740B37}"
+ RootNamespace="BealtoOpenCL"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="4"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_LIB"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-32\$(ProjectName)d.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug|x64"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="4"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_LIB;CONFIG_USE_QT=1"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ EnableEnhancedInstructionSet="0"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-64\$(ProjectName)d.lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="4"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-32\$(ProjectName).lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="4"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="..\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_LIB;CONFIG_USE_QT=1"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="0"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLibrarianTool"
+ AdditionalDependencies="OpenCL.lib"
+ OutputFile="$(ProjectDir)..\lib-64\$(ProjectName).lib"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\src\CLContext.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\src\CLError.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\src\CLProgram.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\include\BealtoOpenCL.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLBuffer.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLCommandQueue.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLContext.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLError.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLEvent.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLEventList.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLImage2D.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLKernel.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLMemoryObject.h"
+ >
+ </File>
+ <File
+ RelativePath="..\include\CLProgram.h"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
diff --git a/MPBenchmarks/ArithmeticTasks.cpp b/MPBenchmarks/ArithmeticTasks.cpp new file mode 100644 index 0000000..670f1d6 --- /dev/null +++ b/MPBenchmarks/ArithmeticTasks.cpp @@ -0,0 +1,294 @@ +// Arithmetic tasks +// (c) EB Nov 2009 + +#include <stdlib.h> +#include "ArithmeticTasks.h" + +#ifdef Linux +typedef short __int16; +typedef int __int32; +typedef long long int __int64; +#endif + +// Get the ratio of user bits per stored bits for a given word size +// (constants defined in OpenCL code) +inline double useRatio(int ws) +{ + switch (ws) + { + case 16: return 14.0/16.0; + case 32: return 30.0/32.0; + case 64: return 62.0/64.0; + } + return 1.0; +} + +double AddNGPUTask::run(int workgroupSize,size_t sz) +{ + if (!mBuildOK) return -1; + + cl::Context * c = getContext(); + cl::CommandQueue * q = getQueue(); + cl::Program * p = getProgram(); + if (c == 0 || q == 0 || p == 0) return -1; + + // Check allocation size + size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size + if (3*sz > max_sz) return -1; + + cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz); + cl::Buffer * b = c->createBuffer(CL_MEM_READ_WRITE,sz); + cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz); + cl::Kernel * kernel = 0; + if (mVariant == ADDN_V1) kernel = p->createKernel("add_v1"); + else if (mVariant == ADDN_V2) kernel = p->createKernel("add_v2"); + unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16); + int n = (int)sz / (mWS>>3); // N words in SZ + double mbps = -1; + if (kernel == 0 || a == 0 || b == 0 || y == 0 || buf == 0) goto END; + + // Initialize A and B + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF); + if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END; + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF); + if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END; + + // Run tests, double nOps until min time is reached + kernel->setArg(0,a); + kernel->setArg(1,b); + kernel->setArg(2,y); + if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END; + for (int nOps = 1; ; nOps <<= 1) + { + double t0 = getT(); + for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize); + q->finish(); + double t = (getT() - t0); + if (t < MIN_RUNNING_TIME) continue; + // OK, t is large enough + t /= (double)nOps; + mbps = (double)sz*useRatio(mWS)*1.0e-6/t; // MB/s + break; + } + +END: // Cleanup + if (buf != 0) _aligned_free(buf); + if (a != 0) delete a; + if (b != 0) delete b; + if (y != 0) delete y; + if (kernel != 0) delete kernel; + + return mbps; +} + +double Mul1GPUTask::run(int workgroupSize,size_t sz) +{ + if (!mBuildOK) return -1; + + cl::Context * c = getContext(); + cl::CommandQueue * q = getQueue(); + cl::Program * p = getProgram(); + if (c == 0 || q == 0 || p == 0) return -1; + + // Check allocation size + size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size + if (2*sz > max_sz) return -1; + + cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz); + cl::Buffer * y = c->createBuffer(CL_MEM_READ_WRITE,sz); + cl::Kernel * kernel = 0; + if (mVariant == MUL1_V1) kernel = p->createKernel("mul1_v1"); + + unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16); + int n = (int)sz>>2; // 4 bytes/word + const int kk = 0x2FEFEFEF; + double mbps = -1; + if (kernel == 0 || a == 0 || y == 0 || buf == 0) goto END; + + // Initialize A + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(rand() & 0xFF); + if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END; + + // Run tests, double nOps until min time is reached + kernel->setArg(0,kk); + kernel->setArg(1,a); + kernel->setArg(2,y); + if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END; + for (int nOps = 1; ; nOps <<= 1) + { + double t0 = getT(); + for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize); + q->finish(); + double t = (getT() - t0); + if (t < MIN_RUNNING_TIME) continue; + // OK, t is large enough + t /= (double)nOps; + mbps = (double)sz*useRatio(32)*1.0e-6/t; // MB/s + break; + } + +END: // Cleanup + if (buf != 0) _aligned_free(buf); + if (a != 0) delete a; + if (y != 0) delete y; + if (kernel != 0) delete kernel; + + return mbps; +} + +// CPU + +struct AddNThreadParam +{ + int nOps; // number of loops + int index; // Block index + size_t sz; // size to process in thread + int wordSize; // 32 or 64 + int variant; // ADD_V1, ADD_V2 + const unsigned char * a; + const unsigned char * b; + unsigned char * out; +}; + +template <typename T,int LOG_BASE> void addNv1(AddNThreadParam * p) +{ + size_t n = p->sz / sizeof(T); // Words in block + const T * a = (const T *)(p->a); + const T * b = (const T *)(p->b); + T * out = (T *)(p->out); + T BASE = (T)1<<(T)LOG_BASE; + T BASE_MINUS1 = BASE - (T)1; + + for (int it=0;it<p->nOps;it++) + { + T t = 0; // "carry" from previous word + if (p->index > 0) + { + // Get "carry" for last values of previous block + t = (a[-1] + b[-1]) >> (T)LOG_BASE; + } + + for (size_t i=0;i<n;i++) + { + T s = a[i] + b[i]; + out[i] = (s & BASE_MINUS1) + t; + t = s >> (T)LOG_BASE; + } + } +} + +template <typename T,int LOG_BASE> void addNv2(AddNThreadParam * p) +{ + size_t n = p->sz / sizeof(T); // Words in block + const T * a = (const T *)(p->a); + const T * b = (const T *)(p->b); + T * out = (T *)(p->out); + + for (int it=0;it<p->nOps;it++) + { + for (size_t i=0;i<n;i++) + { + out[i] = a[i] + b[i]; + } + } +} + +#ifndef Linux // Uses Win32 intrinsics +void addNv3(AddNThreadParam * p) +{ + const int LOG_BASE = 30; + const int BASE = 1<<LOG_BASE; + const int BASE_MINUS1 = BASE - 1; + + size_t n = p->sz >> 4; // We process blocks of 16 bytes + const __m128i * a = (const __m128i *)(p->a); + const __m128i * b = (const __m128i *)(p->b); + __m128i * out = (__m128i *)(p->out); + + __m128i t; + __m128i mask = _mm_set1_epi32(BASE_MINUS1); + for (int it=0;it<p->nOps;it++) + { + // TODO: initialize T with the carry from the end of previous block + t = _mm_setzero_si128(); + for (size_t i=0;i<n;i++) + { + __m128i s = _mm_add_epi32(_mm_load_si128(a+i),_mm_load_si128(b+i)); + __m128i o = _mm_add_epi32(_mm_and_si128(s,mask),t); + _mm_store_si128(out+i,o); + t = _mm_srli_epi32(s,LOG_BASE); + } + } +} +#endif + +thread_return_t AddNThread(void * x) +{ + AddNThreadParam * p = (AddNThreadParam *)x; + + if (p->wordSize == 16 && p->variant == ADDN_V1) addNv1<__int16,14>(p); + else if (p->wordSize == 32 && p->variant == ADDN_V1) addNv1<__int32,30>(p); + else if (p->wordSize == 64 && p->variant == ADDN_V1) addNv1<__int64,62>(p); + else if (p->wordSize == 16 && p->variant == ADDN_V2) addNv2<__int16,14>(p); + else if (p->wordSize == 32 && p->variant == ADDN_V2) addNv2<__int32,30>(p); + else if (p->wordSize == 64 && p->variant == ADDN_V2) addNv2<__int64,62>(p); +#ifndef Linux + else if (p->wordSize == 32 && p->variant == ADDN_V3) addNv3(p); +#endif + + return 0; +} + +double AddNCPUTask::run(int nThreads,size_t sz) +{ + if (nThreads <= 0 || sz <= 0) return -1; // invalid + size_t sz1 = sz / nThreads; + if (sz1 <= 0) return -1; // SZ too small + + unsigned char * in_a = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + unsigned char * in_b = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + double mbps = -1; + + std::vector<AddNThreadParam> params(nThreads); + for (int nOps=1; ;nOps<<=1) + { + // Initialize memory + for (size_t i=0;i<sz;i++) + { + in_a[i] = (unsigned char)(i & 0xFF); + in_b[i] = (unsigned char)((i+99) & 0xFF); + out[i] = 0; + } + // setup params + for (int i=0;i<nThreads;i++) + { + params[i].a = in_a + i*sz1; + params[i].b = in_b + i*sz1; + params[i].out = out + i*sz1; + params[i].sz = sz1; + params[i].nOps = nOps; + params[i].wordSize = mWS; + params[i].variant = mVariant; + params[i].index = i; + } + // run threads + double t = runCPUThreads(params,AddNThread); + // Check result + if (nOps == 1) + { + // ZZZ: do it! + } + + if (t < MIN_RUNNING_TIME) continue; // Too short + t /= (double)nOps; + + mbps = (double)sz1*(double)nThreads*useRatio(mWS)*1.0e-6/t; + break; + } // nOps loop + + _aligned_free(in_a); + _aligned_free(in_b); + _aligned_free(out); + return mbps; +} diff --git a/MPBenchmarks/ArithmeticTasks.h b/MPBenchmarks/ArithmeticTasks.h new file mode 100644 index 0000000..fd322a5 --- /dev/null +++ b/MPBenchmarks/ArithmeticTasks.h @@ -0,0 +1,103 @@ +// Arithmetic tasks +// (c) EB Nov 2009 + +#ifndef ArithmeticTasks_h +#define ArithmeticTasks_h + +#include "Config.h" +#include "GPUTask.h" +#include "CPUTask.h" + +enum AddNVariants +{ + ADDN_V1 = 0, + ADDN_V2, + ADDN_V3, + NB_ADDN_VARIANTS +}; + +enum Mul1Variants +{ + MUL1_V1 = 0, + NB_MUL1_VARIANTS +}; + + +// Sum two buffers +class AddNGPUTask : public GPUTask +{ + +public: + + AddNGPUTask(int variant,int wordSize,Logger * log) : GPUTask("gpu_add.cl",log) + { + mWS = wordSize; + mVariant = variant; + char options[200]; + _snprintf(options,200,"-DWORD_SIZE=%d",mWS); + mBuildOK = buildProgram(options,log); + if (!mBuildOK) fprintf(stderr,"Build failed\n"); + } + + double run(int workgroupSize,size_t sz); + +private: + + // Word size + int mWS; + // Variant + int mVariant; + // Build OK? + bool mBuildOK; + +}; + +// Multiply buffer by 1 digit +class Mul1GPUTask : public GPUTask +{ + +public: + + Mul1GPUTask(int variant,int blockSize,Logger * log) : GPUTask("gpu_mul1.cl",log) + { + mBS = blockSize; + mVariant = variant; + char options[200]; + const int logBase = 30; + const int base = 1<<logBase; + const int baseMinus1 = base-1; + _snprintf(options,200,"-DLOG_BASE=%d -DBASE=%d -DBASE_MINUS1=%d -DBLOCK_SIZE=%d",logBase,base,baseMinus1,blockSize); + mBuildOK = buildProgram(options,log); + } + + double run(int workgroupSize,size_t sz); + +private: + + // Build OK? + bool mBuildOK; + // Block size + int mBS; + // Variant + int mVariant; + +}; + +// CPU + +class AddNCPUTask : public CPUTask +{ +public: + AddNCPUTask(int variant,int wordSize) : mWS(wordSize), mVariant(variant) { } + + double run(int nThreads,size_t sz); + +private: + + // Word size (32 or 64) + int mWS; + // Variant + int mVariant; +}; + +#endif // ArithmeticTasks_h diff --git a/MPBenchmarks/CPUTask.h b/MPBenchmarks/CPUTask.h new file mode 100644 index 0000000..62fe768 --- /dev/null +++ b/MPBenchmarks/CPUTask.h @@ -0,0 +1,22 @@ +// CPU task base class +// (c) EB Nov 2009 + +#ifndef CPUTask_h +#define CPUTask_h + +class CPUTask +{ +public: + + // Destructor + virtual ~CPUTask() { } + + // Run task on NTHREADS threads and output size SZ. + // If SZ is not a multiple of NTHREADS, SZ is rounded down to + // the closest multiple. + // Return throughput in MB/s of output, or -1 on error. + virtual double run(int nThreads,size_t sz) = 0; + +}; + +#endif // CPUTask_h diff --git a/MPBenchmarks/Config.h b/MPBenchmarks/Config.h new file mode 100644 index 0000000..c9a72ff --- /dev/null +++ b/MPBenchmarks/Config.h @@ -0,0 +1,172 @@ +// Config flags, common includes, and utils +// (c) EB Nov 2009 + +#ifndef Config_h +#define Config_h + +// Configuration. The flags may be defined on the compiler command line, +// and in this case we don't change them here. + +// Use Qt interface? +#ifndef CONFIG_USE_QT +#define CONFIG_USE_QT 1 +#endif + +// Use MPIR library to cross-check results? +#ifndef CONFIG_USE_MPIR +#define CONFIG_USE_MPIR 0 +#endif + +#ifdef WIN32 +#define _CRT_SECURE_NO_WARNINGS +#define WIN32_LEAN_AND_MEAN +#endif + +// Minimal run time required to have a valid measurement (s) +const double MIN_RUNNING_TIME = 0.2; + +// CPU memory alloc alignment (bytes) +const int ALLOC_ALIGN = 4096; + +#include <malloc.h> +#include <math.h> +#include <string> + +// Windows specific +#ifdef WIN32 +#include <emmintrin.h> +#include <windows.h> +inline double getT() +{ + LARGE_INTEGER c,f; + QueryPerformanceFrequency(&f); + QueryPerformanceCounter(&c); + return (double)c.QuadPart/(double)f.QuadPart; +} +#endif + +// Linux specific +#ifdef Linux +#include <sys/time.h> +#include <pthread.h> +inline void * _aligned_malloc(size_t sz,size_t a) { return memalign(a,sz); } +inline void _aligned_free(void * x) { free(x); } +#define _snprintf snprintf +inline double getT() +{ + struct timeval tv; + gettimeofday(&tv,0); + return (double)tv.tv_sec+1.0e-6*(double)tv.tv_usec; +} +#endif + +// Common +#if CONFIG_USE_MPIR +#include <mpir.h> +#endif +#include "BealtoOpenCL.h" + +// Text logger +class Logger +{ +public: + virtual ~Logger() { } + virtual void append(const char * s) { printf("%s\n",s); fflush(stdout); } + void append(const std::string & s) { append(s.c_str()); } + virtual void appendTitle(const char * s) { printf("\n*** %s\n\n",s); fflush(stdout); } + void appendTitle(const std::string & s) { appendTitle(s.c_str()); } +}; + +#if CONFIG_USE_QT +#include <QtGui/QTextEdit> +#include <QtGui/QApplication> +// Qt QTextEdit logger +class QtLogger : public Logger +{ +public: + QtLogger(QTextEdit * e) : mE(e) { } + void append(const char * s) + { + QTextCharFormat f; + f.setFontFamily("fixed"); + mE->setCurrentCharFormat(f); + mE->append(s); + qApp->processEvents(); + } + void appendTitle(const char * s) + { + QTextCharFormat f; + f.setFontWeight(QFont::Bold); + mE->setCurrentCharFormat(f); + mE->append(s); + qApp->processEvents(); + } +private: + QTextEdit * mE; +}; +#endif + +// Return user friendly size from LOG (size = 1<<LOG) in S. +inline void getUserSize(int log,std::string & s) +{ + if (log < 0 || log >= 40) { s.assign("???"); return; } + char aux[200]; + if (log < 10) _snprintf(aux,200,"%d B",1<<log); + else if (log < 20) _snprintf(aux,200,"%d KiB",1<<(log-10)); + else if (log < 30) _snprintf(aux,200,"%d MiB",1<<(log-20)); + else if (log < 40) _snprintf(aux,200,"%d GiB",1<<(log-30)); + s.assign(aux); +} + +#ifdef Linux +typedef void * (*ThreadProc)(void *); +#define thread_return_t void * +#endif +#ifdef WIN32 +typedef LPTHREAD_START_ROUTINE ThreadProc; +#define thread_return_t DWORD WINAPI +#endif + +// Run one CPU thread calling F for each parameter in the PARAMS array. +// Return the real time of execution, or -1 on error. +template <class P> double runCPUThreads(std::vector<P> & params,ThreadProc f) +{ + int nt = (int)params.size(); // thread count + if (nt <= 0) return -1; // invalid + double t0 = getT(); + bool ok = true; +#ifdef Linux + std::vector<pthread_t> threads(nt,0); + // start all threads + for (int i=0;i<nt;i++) + { + int s = pthread_create(&(threads[i]),0,f,&(params[i])); + if (s != 0) { ok = false; threads[i] = 0; } + } + // join all threads + for (int i=0;i<nt;i++) + { + if (threads[i] == 0) continue; // creation failed + int s = pthread_join(threads[i],0); + if (s != 0) ok = false; + } +#endif +#ifdef WIN32 + std::vector<HANDLE> threads(nt,0); + // start all threads + for (int i=0;i<nt;i++) + { + threads[i] = CreateThread(0,0,f,&(params[i]),0,0); + if (threads[i] == 0) ok = false; + } + // wait for all threads to terminate + WaitForMultipleObjects(nt,&(threads[0]),TRUE,INFINITE); + // delete threads + for (int i=0;i<nt;i++) CloseHandle(threads[i]); +#endif + if (!ok) return -1; // error + double t = getT() - t0; + return t; +} + +#endif // Config_h diff --git a/MPBenchmarks/Conversions.h b/MPBenchmarks/Conversions.h new file mode 100644 index 0000000..1ec9eb9 --- /dev/null +++ b/MPBenchmarks/Conversions.h @@ -0,0 +1,74 @@ +// Convert 30-bit and MPZ numbers to 32-bit hexa strings for comparisons +// (c) EB Nov 2009 + +#ifndef Conversions_h +#define Conversions_h + +// Convert X[N] from redundand representation in base 2^30 +// to an hex string. +void convert30(int n,const int * x,std::string & out) +{ + int nAux = 0; // Valid bits in aux + __int64 aux = 0; + std::vector<unsigned int> y(n,0); + int j = 0; // Next value in Y receiving bits + + for (int i=0;i<n;i++) + { + // Insert X[i] into AUX + aux += ((__int64)x[i]<<nAux); + nAux += 30; + // Extract 32 bits if available + if (nAux>=32) + { + y[j++] = (unsigned int)(aux&0xFFFFFFFF); + aux >>= 32; + nAux -= 32; + } + } + // Extract last bits + while (nAux > 0) + { + y[j++] = (unsigned int)(aux&0xFFFFFFFF); + aux >>= 32; + nAux -= 32; + } + out.clear(); + out.reserve(9*n+20); + char s[32]; + bool first = true; + for (int i=n-1;i>=0;i--) + { + if (first) + { + if (y[i] == 0) continue; + first = false; + } + else out.push_back('.'); + _snprintf(s,32,"%08X",y[i]); + out.append(s); + } + if (first) out.append("0"); +} + +#if CONFIG_USE_MPIR +// Convert mpz_t integer to a string +void convertMPZ(mpz_t x,std::string & out) +{ + char * s = mpz_get_str(0,16,x); + if (s == 0) return; // Failed + int n = (int)strlen(s); + out.clear(); + out.reserve(n+n/8+8); + int digits = 0; + while ( (digits+n)&7 ) { out.push_back('0'); digits++; } // Initial 0 to get a multiple of 8 + for (int i=0;i<n;i++) + { + if ( (digits&7) == 0 && i>0 ) out.push_back('.'); + out.push_back(toupper(s[i])); digits++; + } + free(s); +} +#endif + +#endif // Conversions_h diff --git a/MPBenchmarks/GPUTask.cpp b/MPBenchmarks/GPUTask.cpp new file mode 100644 index 0000000..dbfa65c --- /dev/null +++ b/MPBenchmarks/GPUTask.cpp @@ -0,0 +1,74 @@ +// GPU task base class +// (c) EB Nov 2009 + +#include "GPUTask.h" + +GPUTask::GPUTask(const char * programFile,Logger * log) +{ + cl::Context * c = 0; + cl::CommandQueue * q = 0; + cl::Program * p = 0; + bool ok = true; + std::string s; + std::vector<unsigned char> binary; + + if (log == 0) { ok = false; goto END; } + c = cl::Context::create(); + if (c == 0) { log->append("Context creation failed"); ok = false; goto END; } + q = c->createCommandQueue(0,0); + if (q == 0) { log->append("Command queue creation failed"); ok = false; goto END; } + if (programFile == 0) p = 0; + else + { +#ifdef WIN32 + // We run from the vs2008 solution directory, so we must + // change the path... (quick & dirty patch) + char programFileWin[1000]; + _snprintf(programFileWin,1000,"../%s",programFile); + p = c->createProgramWithFile(programFileWin); +#else + p = c->createProgramWithFile(programFile); +#endif + if (p == 0) { log->append("Program creation failed"); ok = false; goto END; } + } + +END: + if (!ok) + { + if (p != 0) { delete p; p = 0; } + if (q != 0) { delete q; q = 0; } + if (c != 0) { delete c; c = 0; } + } + mProgram = p; + mQueue = q; + mContext = c; +} + +GPUTask::~GPUTask() +{ + delete mProgram; + delete mQueue; + delete mContext; +} + +bool GPUTask::buildProgram(const char * options,Logger * log) +{ + if (mProgram == 0) return false; // Invalid + if (options == 0 || log == 0) return false; // Invalid + + std::string s; + bool ok = mProgram->build(options,s); + if (!ok) { log->append("Build failed, errors:"); log->append(s.c_str()); } +#if 0 // Show PTX code (nvidia only) + ok &= p->getBinary(binary); + if (!ok) { log->append("Get binary failed"); goto END; } + log->append((char *)&(binary[0])); +#endif + return ok; +} + +cl::Kernel * GPUTask::createKernel(const char * name) +{ + if (mProgram == 0) return 0; // Invalid + return mProgram->createKernel(name); +} diff --git a/MPBenchmarks/GPUTask.h b/MPBenchmarks/GPUTask.h new file mode 100644 index 0000000..d9f8155 --- /dev/null +++ b/MPBenchmarks/GPUTask.h @@ -0,0 +1,56 @@ +// GPU task base class +// (c) EB Nov 2009 + +#ifndef GPUTask_h +#define GPUTask_h + +#include "Config.h" + +class GPUTask +{ +public: + + // Create a new context, with a command queue, and the given program. + // PROGRAMFILE may be 0 if no program is needed. + // Check if the class is valid after construction with isValid(). + GPUTask(const char * programFile,Logger * log); + + // Destructor. Release CL objects + virtual ~GPUTask(); + + // Check if the task is well-defined. May be redefined in derived classes + // to add more checks (call base class if redefined). + virtual bool isValid() { return (mContext != 0) && (mQueue != 0); } + + // Functions to redefine in derived classes + + // Run test producing SZ bytes of output. + // Return throughput in MB/s of generated output, + // or -1 on error. + virtual double run(int workgroupSize,size_t sz) = 0; + +protected: + + // Build the program with OPTIONS. Error messages are put in LOG. + // Return TRUE if OK, FALSE on error. + bool buildProgram(const char * options,Logger * log); + + // Create a kernel for the current program, unless undefined + cl::Kernel * createKernel(const char * name); + + inline cl::Context * getContext() { return mContext; } + inline cl::CommandQueue * getQueue() { return mQueue; } + inline cl::Program * getProgram() { return mProgram; } + +private: + + // No copy + GPUTask(const GPUTask &); + GPUTask & operator = (const GPUTask &); + + cl::Context * mContext; + cl::CommandQueue * mQueue; + cl::Program * mProgram; +}; + +#endif // GPUTask_h diff --git a/MPBenchmarks/LICENSE.txt b/MPBenchmarks/LICENSE.txt new file mode 100644 index 0000000..f5fe116 --- /dev/null +++ b/MPBenchmarks/LICENSE.txt @@ -0,0 +1,27 @@ +This code is released under the following license (BSD-style).
+--
+
+Copyright (c) 2009, Eric Bainville
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Eric Bainville nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY ERIC BAINVILLE ''AS IS'' AND ANY
+EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ERIC BAINVILLE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/MPBenchmarks/Makefile b/MPBenchmarks/Makefile new file mode 100644 index 0000000..ebd701b --- /dev/null +++ b/MPBenchmarks/Makefile @@ -0,0 +1,72 @@ +# OpenCL multiprecision CPU+GPU benchmarks +# EB Nov 2009 + +# archive data +CURRENT_DATE := $(shell date +%Y%m%d_%Hh%M) +CURRENT_DIR := $(notdir $(shell /bin/pwd)) +CURRENT_MACHINE := $(shell uname -m) + +# output directories +OBJDIR := objs +DEPDIR := deps +ifeq ($(CURRENT_MACHINE),x86_64) +LIBDIR := lib-64 +else +LIBDIR := lib-32 +endif + +# targets +OBJS = $(patsubst %,$(OBJDIR)/%.o, GPUTask MemoryTasks ArithmeticTasks appMain ) +DEPS = $(patsubst $(OBJDIR)/%.o, $(DEPDIR)/%.d, $(OBJS)) + +# OpenCL SDK directory +# OPENCL_DIR = /opt/cuda/include +OPENCL_DIR = /opt/ati-stream-sdk-v2.0-beta4-lnx64 +# Bealto OpenCL library path +BEALTO_OPENCL_DIR = ../BealtoOpenCL +BEALTO_OPENCL_LIB = $(BEALTO_OPENCL_DIR)/$(LIBDIR)/libBealtoOpenCL.a + +# flags +CXXFLAGS = -DLinux -DCONFIG_USE_QT=0 -O2 -mtune=nocona -msse3 -Wall -I/usr/include/qt4 -I$(OPENCL_DIR)/include -I$(BEALTO_OPENCL_DIR)/include +LDFLAGS = -L/usr/lib/qt4 -L$(OPENCL_DIR)/lib/x86_64 + +all: tocl + +tocl: $(OBJS) $(BEALTO_OPENCL_LIB) + g++ $(LDFLAGS) -o $@ $^ $(BEALTO_OPENCL_LIB) -lpthread -lOpenCL # -lQtCore -lQtGui + +$(BEALTO_OPENCL_LIB): + make -C $(BEALTO_OPENCL_DIR) lib + +t1: t1.o + g++ -o $@ $^ + +dos2unix: + dos2unix *.cpp *.h *.cl Makefile + +clean: + /bin/rm -f *.o *~ tocl t1 + /bin/rm -rf vs2008/Release-* vs2008/Debug-* $(OBJDIR) $(DEPDIR) + +archive: clean + @echo "ARCHIVE $(CURRENT_DATE)" + tar czf "../MPBenchmarks-$(CURRENT_DATE).tar.gz" -C.. --exclude=".svn" $(CURRENT_DIR) + +##### Dependencies +$(DEPDIR)/%.d: %.cpp + @[ -d $(DEPDIR) ] || mkdir -p $(DEPDIR) + @/bin/echo -e "DEPS \033[32m$*\033[0m" + @$(CXX) $(CXXFLAGS) -o $@ -MM -MT '$(OBJDIR)/$*.o $@' $< + +##### Compilation +$(OBJDIR)/%.o: %.cpp + @[ -d $(OBJDIR) ] || mkdir -p $(OBJDIR) + @/bin/echo -e "C++ \033[34m$*\033[0m" + @$(CXX) $(CXXFLAGS) -c -o $@ $< + +##### Qt +moc_%.cpp: %.h + @/bin/echo -e "MOC \033[34m$*\033[0m" + @moc -o $@ $< + +-include $(DEPS) diff --git a/MPBenchmarks/MemoryTasks.cpp b/MPBenchmarks/MemoryTasks.cpp new file mode 100644 index 0000000..a7780bc --- /dev/null +++ b/MPBenchmarks/MemoryTasks.cpp @@ -0,0 +1,280 @@ +// Memory tasks +// (c) EB Nov 2009 + +#include "MemoryTasks.h" + +#include <string.h> + + +// GPU + +double CopyGPUTask::run(int workgroupSize,size_t sz) +{ + cl::Context * c = getContext(); + cl::CommandQueue * q = getQueue(); + if (c == 0 || q == 0) return -1; + + // Check allocation size + size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size + if (mCT == DEVICE_TO_DEVICE_COPY) + { + if (2*sz > max_sz) return -1; + } + if (sz > max_sz) return -1; + + bool ok = true; + cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz); + if (a == 0) ok = false; + cl::Buffer * b = 0; + if (mCT == DEVICE_TO_DEVICE_COPY) + { + b = c->createBuffer(CL_MEM_READ_WRITE,sz); + if (b == 0) ok = false; + } + unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16); + if (buf == 0) ok = false; + double mbps = -1; + + if (!ok) goto END; // Alloc failed + + // Initialize A and check errors + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF); + if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END; + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0; + if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END; + ok = true; + + // check write+read loop + for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false; + if (!ok) { fprintf(stderr,"write+read failed\n"); goto END; } + + // check write+copy+read loop + if (b != 0) + { + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF); + if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END; + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)0; + if (!q->writeBuffer(b,true,0,sz,buf).isValid()) goto END; + if (!q->copyBuffer(a,b,0,0,sz).isValid()) goto END; + if (!q->readBuffer(b,true,0,sz,buf).isValid()) goto END; + for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)(i & 0xFF)) ok = false; + if (!ok) { fprintf(stderr,"write+copy+read failed\n"); goto END; } + } + + // Run tests, double nOps until the min time is reached + for (int nOps = 5; ; nOps <<= 1) + { + double t0 = getT(); + switch (mCT) + { + case HOST_TO_DEVICE_COPY: + for (int i=0;i<nOps;i++) q->writeBuffer(a,false,0,sz,buf); + break; + case DEVICE_TO_HOST_COPY: + for (int i=0;i<nOps;i++) q->readBuffer(a,false,0,sz,buf); + break; + case DEVICE_TO_DEVICE_COPY: + for (int i=0;i<nOps;i++) q->copyBuffer(a,b,0,0,sz); + break; + } + q->finish(); + double t = (getT() - t0); + if (t < MIN_RUNNING_TIME) continue; + // OK, t is large enough + t /= (double)nOps; + mbps = (double)sz*1.0e-6/t; // MB/s + break; + } + +END: + if (buf != 0) _aligned_free(buf); + if (a != 0) delete a; + if (b != 0) delete b; + + return mbps; +} + +double ZeroGPUTask::run(int workgroupSize,size_t sz) +{ + cl::Context * c = getContext(); + cl::CommandQueue * q = getQueue(); + cl::Program * p = getProgram(); + if (c == 0 || q == 0 || p == 0) return -1; + + // Check allocation size + size_t max_sz = c->getDeviceMaxMemAllocSize(); // Max global mem size + if (sz > max_sz) return -1; + + cl::Buffer * a = c->createBuffer(CL_MEM_READ_WRITE,sz); + unsigned char * buf = (unsigned char *)_aligned_malloc(sz,16); + cl::Kernel * kernel = p->createKernel("zero"); + double mbps = -1; + int n = (int)sz / (mWS>>3); // N words in SZ + bool ok; + if (kernel == 0 || a == 0 || buf == 0) goto END; + + // Initialize A + for (size_t i=0;i<sz;i++) buf[i] = (unsigned char)(i & 0xFF); + if (!q->writeBuffer(a,true,0,sz,buf).isValid()) goto END; + + // Run tests, double nOps until min time is reached + kernel->setArg(0,a); + if (!q->execKernel1(kernel,n,workgroupSize).isValid()) goto END; + + // check write+zero+read + if (!q->readBuffer(a,true,0,sz,buf).isValid()) goto END; + ok = true; + for (size_t i=0;i<sz;i++) if (buf[i] != (unsigned char)0) ok = false; + if (!ok) { fprintf(stderr,"write+zero+read failed\n"); goto END; } + + for (int nOps = 5; ; nOps <<= 1) + { + double t0 = getT(); + for (int i=0;i<nOps;i++) q->execKernel1(kernel,n,workgroupSize); + q->finish(); + double t = (getT() - t0); + if (t < MIN_RUNNING_TIME) continue; + // OK, t is large enough + t /= (double)nOps; + mbps = (double)sz*1.0e-6/t; // MB/s + break; + } + +END: + if (buf != 0) _aligned_free(buf); + if (a != 0) delete a; + if (kernel != 0) delete kernel; + + return mbps; +} + +// CPU + +struct CopyThreadParam +{ + int nOps; // number of loops + size_t sz; // size to process in thread + const unsigned char * in; + unsigned char * out; +}; + +thread_return_t CopyThread(void * x) +{ + CopyThreadParam * p = (CopyThreadParam *)x; +#ifdef WIN32 + for (int i=0;i<p->nOps;i++) CopyMemory(p->out,p->in,p->sz); +#else + for (int i=0;i<p->nOps;i++) memcpy(p->out,p->in,p->sz); +#endif + return 0; +} + +double CopyCPUTask::run(int nThreads,size_t sz) +{ + if (nThreads <= 0 || sz <= 0) return -1; // invalid + size_t sz1 = sz / nThreads; + if (sz1 <= 0) return -1; // SZ too small + + unsigned char * in = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + double mbps = -1; + + std::vector<CopyThreadParam> params(nThreads); + for (int nOps=1; ;nOps<<=1) + { + // Initialize memory + for (size_t i=0;i<sz;i++) + { + in[i] = (unsigned char)(i & 0xFF); + out[i] = 0; + } + // setup params + for (int i=0;i<nThreads;i++) + { + params[i].in = in + i*sz1; + params[i].out = out + i*sz1; + params[i].sz = sz1; + params[i].nOps = nOps; + } + // run threads + double t = runCPUThreads(params,CopyThread); + // Check result + if (nOps == 1) + { + if (memcmp(in,out,sz) != 0) { fprintf(stderr,"CPU copy error\n"); mbps = -1; break; } + } + + if (t < MIN_RUNNING_TIME) continue; // Too short + t /= (double)nOps; + + mbps = (double)sz1*(double)nThreads*1.0e-6/t; + break; + } + + _aligned_free(in); + _aligned_free(out); + return mbps; +} + +struct ZeroThreadParam +{ + int nOps; // number of loops + size_t sz; // size to process in thread + unsigned char * out; +}; + +thread_return_t ZeroThread(void * x) +{ + CopyThreadParam * p = (CopyThreadParam *)x; +#ifdef WIN32 + for (int i=0;i<p->nOps;i++) ZeroMemory(p->out,p->sz); +#else + for (int i=0;i<p->nOps;i++) memset(p->out,0,p->sz); +#endif + return 0; +} + +double ZeroCPUTask::run(int nThreads,size_t sz) +{ + if (nThreads <= 0 || sz <= 0) return -1; // invalid + size_t sz1 = sz / nThreads; + if (sz1 <= 0) return -1; // SZ too small + + unsigned char * out = (unsigned char *)_aligned_malloc(sz,ALLOC_ALIGN); + double mbps = -1; + + std::vector<CopyThreadParam> params(nThreads); + for (int nOps=1; ;nOps<<=1) + { + // Initialize memory + for (size_t i=0;i<sz;i++) + { + out[i] = (unsigned char)(i & 0xFF); + } + // setup params + for (int i=0;i<nThreads;i++) + { + params[i].out = out + i*sz1; + params[i].sz = sz1; + params[i].nOps = nOps; + } + // run threads + double t = runCPUThreads(params,ZeroThread); + // Check result + if (nOps == 1) + { + bool ok = true; + for (size_t i=0;i<sz;i++) if ( out[i] != 0 ) { ok = false; break; } + if (!ok) { fprintf(stderr,"CPU zero error\n"); mbps = -1; break; } + } + + if (t < MIN_RUNNING_TIME) continue; // Too short + t /= (double)nOps; + + mbps = (double)sz1*(double)nThreads*1.0e-6/t; + break; + } + + _aligned_free(out); + return mbps; +} diff --git a/MPBenchmarks/MemoryTasks.h b/MPBenchmarks/MemoryTasks.h new file mode 100644 index 0000000..92e35f6 --- /dev/null +++ b/MPBenchmarks/MemoryTasks.h @@ -0,0 +1,83 @@ +// Memory tasks +// (c) EB Nov 2009 + +#ifndef MemoryTasks_h +#define MemoryTasks_h + +#include "Config.h" +#include "GPUTask.h" +#include "CPUTask.h" + +// Copy two buffers (GPU) +class CopyGPUTask : public GPUTask +{ + +public: + + enum CopyType + { + HOST_TO_DEVICE_COPY, + DEVICE_TO_HOST_COPY, + DEVICE_TO_DEVICE_COPY + }; + + CopyGPUTask(int copyType,Logger * log) : GPUTask(0,log) + { + mCT = copyType; + } + + ~CopyGPUTask() + { + } + + double run(int workgroupSize,size_t sz); + +private: + + // Copy type + int mCT; + +}; + +// Zero one buffer (GPU) +class ZeroGPUTask : public GPUTask +{ + +public: + + ZeroGPUTask(int wordSize,Logger * log) : GPUTask("gpu_zero.cl",log) + { + mWS = wordSize; + char options[200]; + _snprintf(options,200,"-DWORD_SIZE=%d",mWS); + buildProgram(options,log); + } + + ~ZeroGPUTask() + { + } + + double run(int workgroupSize,size_t sz); + +private: + + // Word size + int mWS; + +}; + +// CPU + +class CopyCPUTask : public CPUTask +{ +public: + double run(int nThreads,size_t sz); +}; + +class ZeroCPUTask : public CPUTask +{ +public: + double run(int nThreads,size_t sz); +}; + +#endif // MemoryTasks_h diff --git a/MPBenchmarks/appMain.cpp b/MPBenchmarks/appMain.cpp new file mode 100644 index 0000000..f6f82bf --- /dev/null +++ b/MPBenchmarks/appMain.cpp @@ -0,0 +1,284 @@ +// (c) EB Nov 2009 + +#include "Config.h" + +#include "MemoryTasks.h" +#include "ArithmeticTasks.h" + +#if CONFIG_USE_QT +#include <QtGui/QApplication> +#include <QtGui/QMainWindow> +#include <QtGui/QTextEdit> +#include <QtGui/QClipboard> +#endif + +struct TestResult +{ + std::string title; + std::vector<int> mbps; // MB/s by logSize + void update(int ls,int x) { if (x > mbps[ls]) mbps[ls] = x; } // keep max +}; + +void print(FILE * f,const TestResult & r,int minLogSize,int maxLogSize) +{ + fprintf(f,"%-25s",r.title.c_str()); + for (int i=minLogSize;i<=maxLogSize;i++) + { + int x = r.mbps[i]; + fprintf(f,"%8d",x); + } + fprintf(f,"\n"); +} + +void print(Logger * log,const TestResult & r,int minLogSize,int maxLogSize) +{ + char aux[200]; + std::string s; + _snprintf(aux,200,"%-25s",r.title.c_str()); s.append(aux); + for (int i=minLogSize;i<=maxLogSize;i++) + { + int x = r.mbps[i]; + _snprintf(aux,200,"%8d",x); s.append(aux); + } + log->append(s); +} + +int main(int argc,char ** argv) +{ +#if CONFIG_USE_QT + // Qt windows + QApplication app(argc,argv); + QMainWindow * w = new QMainWindow(); + QTextEdit * te = new QTextEdit(); + te->setReadOnly(true); + w->setCentralWidget(te); + w->setMinimumSize(QSize(800,600)); + w->show(); + app.processEvents(); + Logger * log = new QtLogger(te); +#else + // Command line + Logger * log = new Logger(); +#endif + +#if 1 + // Display device info + cl::Context * c = cl::Context::create(); + if (c != 0) + { + std::string s; + if (c->getAllDeviceInfo(0,s)) log->append(s.c_str()); + delete c; + c = 0; + } +#endif + + int minLogSize = 10; + int maxLogSize = 29; + std::vector<TestResult> allTests; + + // Run all CPU tests + for (int task=0;task<100;task++) + { + // Select tests here + // if (task != 30) continue; + // continue; + + CPUTask * g = 0; + // Default values + std::string title; + int minNThreads = 1; + int maxNThreads = 32; + + switch (task) + { + case 0: + title.assign("CPU copyN"); + g = new CopyCPUTask(); + break; + case 1: + title.assign("CPU zeroN"); + g = new ZeroCPUTask(); + break; + + case 10: + title.assign("CPU addN v1 16-bit"); + g = new AddNCPUTask(ADDN_V1,16); + break; + case 11: + title.assign("CPU addN v1 32-bit"); + g = new AddNCPUTask(ADDN_V1,32); + break; + case 12: + title.assign("CPU addN v1 64-bit"); + g = new AddNCPUTask(ADDN_V1,64); + break; + + case 20: + title.assign("CPU addN v2 16-bit"); + g = new AddNCPUTask(ADDN_V2,16); + break; + case 21: + title.assign("CPU addN v2 32-bit"); + g = new AddNCPUTask(ADDN_V2,32); + break; + case 22: + title.assign("CPU addN v2 64-bit"); + g = new AddNCPUTask(ADDN_V2,64); + break; + +#ifndef Linux + case 30: + title.assign("CPU addN v3 32-bit"); + g = new AddNCPUTask(ADDN_V3,32); + break; +#endif + } + if (g == 0) continue; // nothing to do + + TestResult tr; + tr.title.assign(title); + tr.mbps.resize(1+maxLogSize,0); + log->appendTitle(title.c_str()); + for (int nt=minNThreads;nt<=maxNThreads;nt<<=1) + for (int ls=minLogSize;ls<=maxLogSize;ls++) + { + size_t sz = 1<<ls; + double mbps = g->run(nt,sz); + char aux[200]; + std::string usz; + getUserSize(ls,usz); + _snprintf(aux,200,"NT=%3d SZ=%7s %6.0f MB/s",nt,usz.c_str(),mbps); + log->append(aux); + tr.update(ls,mbps); + } + + print(stdout,tr,minLogSize,maxLogSize); +#if CONFIG_USE_QT + print(log,tr,minLogSize,maxLogSize); +#endif + allTests.push_back(tr); + delete g; + } + + // Run all GPU tests + for (int task=0;task<100;task++) + { + // Select tests here + //continue; + //if (task != 30) continue; + + GPUTask * g = 0; + // Default values + std::string title; + int minWorkgroupSize = 16; + int maxWorkgroupSize = 512; + + switch (task) + { + case 0: + title.assign("Host to device copy"); + g = new CopyGPUTask(CopyGPUTask::HOST_TO_DEVICE_COPY,log); + minWorkgroupSize = maxWorkgroupSize; + break; + case 1: + title.assign("Device to host copy"); + g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_HOST_COPY,log); + minWorkgroupSize = maxWorkgroupSize; + break; + case 2: + title.assign("copyN"); + g = new CopyGPUTask(CopyGPUTask::DEVICE_TO_DEVICE_COPY,log); + minWorkgroupSize = maxWorkgroupSize; + break; + case 3: + title.assign("zeroN 32-bit"); + g = new ZeroGPUTask(32,log); + break; + case 4: + title.assign("zeroN 64-bit"); + g = new ZeroGPUTask(64,log); + break; + + case 10: + title.assign("AddN v1 16-bit"); + g = new AddNGPUTask(ADDN_V1,16,log); + break; + case 11: + title.assign("AddN v1 32-bit"); + g = new AddNGPUTask(ADDN_V1,32,log); + break; + case 12: + title.assign("AddN v1 64-bit"); + g = new AddNGPUTask(ADDN_V1,64,log); + break; + + case 20: + title.assign("AddN v2 16-bit"); + g = new AddNGPUTask(ADDN_V2,16,log); + break; + case 21: + title.assign("AddN v2 32-bit"); + g = new AddNGPUTask(ADDN_V2,32,log); + break; + case 22: + title.assign("AddN v2 64-bit"); + g = new AddNGPUTask(ADDN_V2,64,log); + break; + + case 40: + title.assign("Mul1 v1 32-bit"); + g = new Mul1GPUTask(MUL1_V1,1,log); + break; + + } + if (g == 0) continue; // nothing to do + + TestResult tr; + tr.title.assign(title); + tr.mbps.resize(1+maxLogSize,0); + log->appendTitle(title.c_str()); + for (int wg=minWorkgroupSize;wg<=maxWorkgroupSize;wg<<=1) + { + char aux[1000]; + _snprintf(aux,1000,"%s - workgroup_size=%d",title.c_str(),wg); + log->append(aux); + for (int ls=minLogSize;ls<=maxLogSize;ls++) + { + size_t sz = 1<<ls; + double mbps = g->run(wg,sz); + std::string usz; + getUserSize(ls,usz); + _snprintf(aux,1000,"WG=%3d SZ=%7s %6.0f MB/s",wg,usz.c_str(),mbps); + log->append(aux); + tr.update(ls,mbps); + } + } + + print(stdout,tr,minLogSize,maxLogSize); +#if CONFIG_USE_QT + print(log,tr,minLogSize,maxLogSize); +#endif + allTests.push_back(tr); + delete g; + } + + // Dump all results + int nt = (int)allTests.size(); + log->appendTitle("RESULTS"); + printf("\n----\n"); + for (int i=0;i<nt;i++) + { + TestResult & tr = allTests[i]; + print(stdout,tr,minLogSize,maxLogSize); +#if CONFIG_USE_QT + print(log,tr,minLogSize,maxLogSize); +#endif + } + +#if CONFIG_USE_QT + // Enter event loop to terminate + log->appendTitle("DONE"); + app.exec(); +#endif +} diff --git a/MPBenchmarks/gpu_add.cl b/MPBenchmarks/gpu_add.cl new file mode 100644 index 0000000..7237395 --- /dev/null +++ b/MPBenchmarks/gpu_add.cl @@ -0,0 +1,86 @@ +// GPU addition +// (c) EB Sep 2009 + +#if (WORD_SIZE == 64) + +typedef long Word; +const Word LOG_BASE = 62; +const Word BASE = (Word)1<<(Word)62; +const Word BASE_MINUS1 = (Word)1<<(Word)62 - 1; + +#elif (WORD_SIZE == 32) + +typedef int Word; +const Word LOG_BASE = 30; +const Word BASE = (Word)1<<(Word)30; +const Word BASE_MINUS1 = (Word)1<<(Word)30 - 1; + +#elif (WORD_SIZE == 16) + +typedef short Word; +const Word LOG_BASE = 14; +const Word BASE = (Word)1<<(Word)14; +const Word BASE_MINUS1 = (Word)1<<(Word)14 - 1; + +#endif + +__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z) +{ + int i = get_global_id(0); + + // Carry from I-1 + Word t = (Word)0; + if (i>0) + { + t = (x[i-1]+y[i-1]) >> LOG_BASE; + } + + // Sum I + Word s = (x[i]+y[i]) & BASE_MINUS1; + + // Final result + z[i] = t + s; +} + +__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z) +{ + int i = get_global_id(0); + z[i] = x[i] + y[i]; +} + +#if 0 +__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z, + __local Word * sLoc) +{ + int i = get_global_id(0); + int ii = get_local_id(0); + sLoc[ii] = x[i] + y[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + // Carry from I-1 + Word t = (Word)0; + Word s = 0; +#if 0 + if (ii == 0) + { + s = (i>0)?(x[i-1] + y[i-1]):0; + } + else + { + // s = sLoc[ii-1]; + } +#endif + // s = sLoc[ii-1]; + if (s >= MAX_WORD) t = 1; + else if (s <= -MAX_WORD) t = -1; + + // Sum I + s = sLoc[ii-1]; + if (s >= MAX_WORD) s -= BASE; + else if (s <= -MAX_WORD) s += BASE; + + // Final result + z[i] = t + s; +} +#endif diff --git a/MPBenchmarks/gpu_mp.cl b/MPBenchmarks/gpu_mp.cl new file mode 100644 index 0000000..703f5fa --- /dev/null +++ b/MPBenchmarks/gpu_mp.cl @@ -0,0 +1,153 @@ +// GPU multi-precision (old file) +// (c) EB Sep 2009 + +#if (WORD_SIZE == 64) + +typedef long Word; +const Word BASE = (Word)1<<(Word)60; +const Word MAX_WORD = (Word)1<<(Word)60 - (Word)1; + +#elif (WORD_SIZE == 32) + +typedef int Word; +const Word BASE = (Word)1<<(Word)30; +const Word MAX_WORD = (Word)1<<(Word)30 - (Word)1; + +#elif (WORD_SIZE == 16) + +typedef short Word; +const Word BASE = (Word)1<<(Word)14; +const Word MAX_WORD = (Word)1<<(Word)14 - (Word)1; + +#elif (WORD_SIZE == 128) + +typedef float4 Word; + +#endif + +__kernel void copy(__global Word * x,__global Word * z) +{ + int i = get_global_id(0); + z[i] = x[i]; +} + +__kernel void zero(__global Word * a) +{ + int i = get_global_id(0); + a[i] = 0.0f; +} + +__kernel void add_v1(__global const Word * x,__global const Word * y,__global Word * z) +{ + int i = get_global_id(0); + + // Carry from I-1 + Word t = (Word)0; + if (i>0) + { + Word s1 = x[i-1]+y[i-1]; + if (s1 >= MAX_WORD) t = 1; + else if (s1 <= -MAX_WORD) t = -1; + } + + // Sum I + Word s = x[i]+y[i]; + if (s >= MAX_WORD) s -= BASE; + else if (s <= -MAX_WORD) s += BASE; + + // Final result + z[i] = t + s; +} + +__kernel void add_v3(__global const Word * x,__global const Word * y,__global Word * z, + __local Word * sLoc) +{ + int i = get_global_id(0); + int ii = get_local_id(0); + sLoc[ii] = x[i] + y[i]; + + barrier(CLK_LOCAL_MEM_FENCE); + + // Carry from I-1 + Word t = (Word)0; + Word s = 0; +#if 0 + if (ii == 0) + { + s = (i>0)?(x[i-1] + y[i-1]):0; + } + else + { + // s = sLoc[ii-1]; + } +#endif + // s = sLoc[ii-1]; + if (s >= MAX_WORD) t = 1; + else if (s <= -MAX_WORD) t = -1; + + // Sum I + s = sLoc[ii-1]; + if (s >= MAX_WORD) s -= BASE; + else if (s <= -MAX_WORD) s += BASE; + + // Final result + z[i] = t + s; +} + +__kernel void add_v2(__global const Word * x,__global const Word * y,__global Word * z) +{ + int i = get_global_id(0); + z[i] = x[i] + y[i]; +} + +__kernel void propagate(__global const Word * x,__global Word * z) +{ + int i = get_global_id(0); + Word a = x[i]; + Word b = 0; +} + +// ______________________________________________________________________ +// + +#if (VECTOR_LENGTH == 1) +typedef float vector_t; +#elif (VECTOR_LENGTH == 2) +typedef float2 vector_t; +#elif (VECTOR_LENGTH == 4) +typedef float4 vector_t; +#endif + +__kernel void crunch(__global float * out) +{ + vector_t x,y,cs,sn,xx,yy; + x = 1.0f; + y = 0.0f; + cs = cos(2.0f); // random angle + sn = sin(2.0f); + for (int i=0;i<N_ROTATIONS;i++) + { + xx = x*cs - y*sn; + yy = y*cs + x*sn; + x = xx; + y = yy; + } + out[get_global_id(0)] = dot(x,y); +} + +__kernel void crunch2(__global float * out) +{ + vector_t x,y,cs,sn,xx,yy; + x = 1.0f; + y = 0.0f; + cs = cos(2.0f); // random angle + sn = sin(2.0f); + for (int i=0;i<N_ROTATIONS;i++) + { + xx = mad(x,cs,-y*sn); + yy = mad(y,cs,x*sn); + x = xx; + y = yy; + } + out[get_global_id(0)] = dot(x,y); +} diff --git a/MPBenchmarks/gpu_mul1.cl b/MPBenchmarks/gpu_mul1.cl new file mode 100644 index 0000000..be2875b --- /dev/null +++ b/MPBenchmarks/gpu_mul1.cl @@ -0,0 +1,116 @@ +// GPU multi-precision +// (c) EB Nov 2009 + +__kernel void mul1_v0(int k,__global const int * x,__global int * z) +{ + int i = get_global_id(0); + z[i] = (int)(((long)k*(long)x[i]) & (long)BASE_MINUS1); +} + +__kernel void mul1_v1(int k,__global const int * x,__global int * z) +{ + int i = get_global_id(0); + long z0 = ((long)k*(long)x[i]) & (long)BASE_MINUS1; + long z1 = (((long)k*(long)((i>0)?x[i-1]:0))>>LOG_BASE) & (long)BASE_MINUS1; + long z2 = (((long)k*(long)((i>1)?x[i-2]:0))>>(2*LOG_BASE)) & (long)BASE_MINUS1; + z[i] = (int)(z0+z1+z2); +} + +#if 0 +__kernel void mul1_v2(int k,__global const int * x,__global int * z) +{ + __local long aux[WORKGROUP_SIZE+2]; + int i = get_global_id(0); // in X + int ii = get_local_id(0); // in workgroup + long s; + + // Load X[i]*K into AUX for all threads of the workgroup + 2 + s = (long)x[i]; + aux[ii+2] = s * (long)k; + if (ii < 2) // 2 work items load the previous 2 values + { + s = (long)((i>2)?x[i-2]:0); + aux[ii] = s * (long)k; + } + barrier(CLK_LOCAL_MEM_FENCE); + + // Shift, mask and sum the 3 consecutive words in each cell + s = (aux[ii]>>(2*LOG_BASE)) & (long)BASE_MINUS1; + s += (aux[ii+1]>>(LOG_BASE)) & (long)BASE_MINUS1; + s += aux[ii+2] & (long)BASE_MINUS1; + + // Store the result + z[i] = (int)s; +} +#endif + +#if 0 +__kernel void mul1_v3(int k,__global const int * x,__global int * z) +{ + // Get index of block to compute + int i = get_global_id(0) * BLOCK_SIZE; + + // Load the previous two values + long p0,p1,p2; + if (i>0) + { + p2 = (long)x[i-2]*(long)k; p2 >>= (2*LOG_BASE); + p1 = (long)x[i-1]*(long)k; p1 >>= LOG_BASE; + } + else p2 = p1 = 0; + + // Compute the block (sequentially) + for (int j=0;j<BLOCK_SIZE;j++) + { + // Load one value + p0 = (long)x[i]*(long)k; + // Store one result computed from the last 3 values + z[i] = (int)((p0&(long)BASE_MINUS1)+(p1&(long)BASE_MINUS1)+p2); + // Shift + i++; + p2 = p1 >> LOG_BASE; + p1 = p0 >> LOG_BASE; + } +} +#endif + +__kernel void mul1_v3bis(int k,__global const int * x,__global int * z) +{ + int i = get_global_id(0) << 1; + long m1,m2; + if (i>0) + { + m2 = ((long)k*(long)x[i-2]) >> (2*LOG_BASE); + m1 = ((long)k*(long)x[i-1]) >> LOG_BASE; + } else m1 = m2 = 0; + long x0 = (long)k*(long)x[i]; + long x1 = (long)k*(long)x[i+1]; + z[i] = (int)( ( x0 & (long)BASE_MINUS1 ) + + ( m1 & (long)BASE_MINUS1 ) + + ( m2 & (long)BASE_MINUS1 ) ); + x0 >>= LOG_BASE; + m1 >>= LOG_BASE; + z[i+1] = (int)( ( x1 & (long)BASE_MINUS1 ) + + ( x0 & (long)BASE_MINUS1 ) + + ( m1 & (long)BASE_MINUS1 ) ); +} + +__kernel void mul1_v4a(int k,__global const int * x,__global int * z0,__global int * z1,__global int * z2) +{ + int i = get_global_id(0); + long u = (long)k*(long)x[i]; + z0[i] = (int) (u & (long)BASE_MINUS1); + u >>= LOG_BASE; + z1[i] = (int) (u & (long)BASE_MINUS1); + u >>= LOG_BASE; + z2[i] = (int)u; +} + +__kernel void mul1_v4b(__global const int * z0,__global const int * z1,__global const int * z2,__global int * z) +{ + int i = get_global_id(0); + int s = z0[i]; + if (i>0) s += z1[i-1]; + if (i>1) s += z2[i-2]; + z[i] = s; +} diff --git a/MPBenchmarks/gpu_zero.cl b/MPBenchmarks/gpu_zero.cl new file mode 100644 index 0000000..0f276e0 --- /dev/null +++ b/MPBenchmarks/gpu_zero.cl @@ -0,0 +1,18 @@ +// GPU memory 0 +// (c) EB Sep 2009 + +#if (WORD_SIZE == 64) +typedef long Word; +#elif (WORD_SIZE == 32) +typedef int Word; +#elif (WORD_SIZE == 16) +typedef short Word; +#elif (WORD_SIZE == 128) +typedef float4 Word; +#endif + +__kernel void zero(__global Word * a) +{ + int i = get_global_id(0); + a[i] = 0.0f; +} diff --git a/MPBenchmarks/t1.cpp b/MPBenchmarks/t1.cpp new file mode 100644 index 0000000..2398a1b --- /dev/null +++ b/MPBenchmarks/t1.cpp @@ -0,0 +1,18 @@ +#include <stdio.h> + +typedef long long int int64; + +int main() +{ + const int64 Blog = 8; + const int64 B = (int64)1<<Blog; + + for (int64 m=B;m<3*B ;m++) + { + int64 x = B-1+2*(m/B); + if (x>m) continue; + + printf("Blog=%lld m=%llX\n",Blog,m); + // break; + } +} diff --git a/MPBenchmarks/vs2008/MPBenchmarks.sln b/MPBenchmarks/vs2008/MPBenchmarks.sln new file mode 100644 index 0000000..a3d9660 --- /dev/null +++ b/MPBenchmarks/vs2008/MPBenchmarks.sln @@ -0,0 +1,39 @@ +
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual Studio 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MPBenchmarks", "MPBenchmarks.vcproj", "{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+ ProjectSection(ProjectDependencies) = postProject
+ {86930221-164E-4E80-B8C0-50DCB5740B37} = {86930221-164E-4E80-B8C0-50DCB5740B37}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BealtoOpenCL", "..\..\BealtoOpenCL\vs2008\BealtoOpenCL.vcproj", "{86930221-164E-4E80-B8C0-50DCB5740B37}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Debug|x64 = Debug|x64
+ Release|Win32 = Release|Win32
+ Release|x64 = Release|x64
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.ActiveCfg = Debug|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|Win32.Build.0 = Debug|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.ActiveCfg = Debug|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Debug|x64.Build.0 = Debug|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.ActiveCfg = Release|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|Win32.Build.0 = Release|Win32
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.ActiveCfg = Release|x64
+ {FDA95523-9ED9-4C13-9855-C349F5AEBB2D}.Release|x64.Build.0 = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.ActiveCfg = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|Win32.Build.0 = Debug|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.ActiveCfg = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Debug|x64.Build.0 = Debug|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.ActiveCfg = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|Win32.Build.0 = Release|Win32
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.ActiveCfg = Release|x64
+ {86930221-164E-4E80-B8C0-50DCB5740B37}.Release|x64.Build.0 = Release|x64
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/MPBenchmarks/vs2008/MPBenchmarks.vcproj b/MPBenchmarks/vs2008/MPBenchmarks.vcproj new file mode 100644 index 0000000..5fe65c4 --- /dev/null +++ b/MPBenchmarks/vs2008/MPBenchmarks.vcproj @@ -0,0 +1,428 @@ +<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+ ProjectType="Visual C++"
+ Version="9,00"
+ Name="MPBenchmarks"
+ ProjectGUID="{FDA95523-9ED9-4C13-9855-C349F5AEBB2D}"
+ RootNamespace="TestsOpenCL"
+ Keyword="Win32Proj"
+ TargetFrameworkVersion="196613"
+ >
+ <Platforms>
+ <Platform
+ Name="Win32"
+ />
+ <Platform
+ Name="x64"
+ />
+ </Platforms>
+ <ToolFiles>
+ </ToolFiles>
+ <Configurations>
+ <Configuration
+ Name="Debug|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing DLL's"
+ CommandLine="copy "$(QTDIR_BASE)-32\bin\QtCored4.dll" $(OutDir) ;
copy "$(QTDIR_BASE)-32\bin\QtGuid4.dll" $(OutDir) ;
copy "$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll" $(OutDir) ;
"
+ />
+ </Configuration>
+ <Configuration
+ Name="Debug|x64"
+ OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="1"
+ CharacterSet="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="0"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;_DEBUG;_WINDOWS"
+ MinimalRebuild="true"
+ BasicRuntimeChecks="3"
+ RuntimeLibrary="3"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="qtmaind.lib QtCored4.lib QtGuid4.lib OpenCL.lib BealtoOpenCLd.lib"
+ LinkIncremental="2"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing Qt DLL's"
+ CommandLine="copy "$(QTDIR_BASE)-64\bin\QtCored4.dll" "$(OutDir)";
copy "$(QTDIR_BASE)-64\bin\QtGuid4.dll" "$(OutDir)";
copy "$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll" "$(OutDir)";
"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|Win32"
+ OutputDirectory="$(SolutionDir)$(ConfigurationName)-32"
+ IntermediateDirectory="$(ConfigurationName)-32"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ EnableEnhancedInstructionSet="2"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-32\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="OpenCL.lib qtmain.lib QtCore4.lib QtGui4.lib BealtoOpenCL.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-32"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="1"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing DLL's"
+ CommandLine="copy "$(QTDIR_BASE)-32\bin\QtCore4.dll" $(OutDir) ;
copy "$(QTDIR_BASE)-32\bin\QtGui4.dll" $(OutDir) ;
copy "$(ATISTREAMSDKROOT)\bin\x86\OpenCL.dll" $(OutDir) ;
"
+ />
+ </Configuration>
+ <Configuration
+ Name="Release|x64"
+ OutputDirectory="$(SolutionDir)\$(ConfigurationName)-64"
+ IntermediateDirectory="$(ConfigurationName)-64"
+ ConfigurationType="1"
+ CharacterSet="1"
+ WholeProgramOptimization="1"
+ >
+ <Tool
+ Name="VCPreBuildEventTool"
+ />
+ <Tool
+ Name="VCCustomBuildTool"
+ />
+ <Tool
+ Name="VCXMLDataGeneratorTool"
+ />
+ <Tool
+ Name="VCWebServiceProxyGeneratorTool"
+ />
+ <Tool
+ Name="VCMIDLTool"
+ TargetEnvironment="3"
+ />
+ <Tool
+ Name="VCCLCompilerTool"
+ Optimization="2"
+ EnableIntrinsicFunctions="true"
+ AdditionalIncludeDirectories="$(ProjectDir)..\..\BealtoOpenCL\include"
+ PreprocessorDefinitions="WIN32;NDEBUG;_WINDOWS"
+ RuntimeLibrary="2"
+ EnableFunctionLevelLinking="true"
+ FloatingPointModel="2"
+ UsePrecompiledHeader="0"
+ ProgramDataBaseFileName="$(SolutionDir)\$(ConfigurationName)-64\$(ProjectName)"
+ WarningLevel="3"
+ DebugInformationFormat="3"
+ />
+ <Tool
+ Name="VCManagedResourceCompilerTool"
+ />
+ <Tool
+ Name="VCResourceCompilerTool"
+ />
+ <Tool
+ Name="VCPreLinkEventTool"
+ />
+ <Tool
+ Name="VCLinkerTool"
+ AdditionalDependencies="OpenCL.lib QtCore4.lib QtGui4.lib qtmain.lib BealtoOpenCL.lib"
+ LinkIncremental="1"
+ AdditionalLibraryDirectories="$(ProjectDir)..\..\BealtoOpenCL\lib-64"
+ GenerateDebugInformation="true"
+ SubSystem="2"
+ OptimizeReferences="2"
+ EnableCOMDATFolding="2"
+ TargetMachine="17"
+ />
+ <Tool
+ Name="VCALinkTool"
+ />
+ <Tool
+ Name="VCManifestTool"
+ />
+ <Tool
+ Name="VCXDCMakeTool"
+ />
+ <Tool
+ Name="VCBscMakeTool"
+ />
+ <Tool
+ Name="VCFxCopTool"
+ />
+ <Tool
+ Name="VCAppVerifierTool"
+ />
+ <Tool
+ Name="VCPostBuildEventTool"
+ Description="Installing Qt DLL's"
+ CommandLine="copy "$(QTDIR_BASE)-64\bin\QtCore4.dll" "$(OutDir)";
copy "$(QTDIR_BASE)-64\bin\QtGui4.dll" "$(OutDir)";
copy "$(ATISTREAMSDKROOT)\bin\x86_64\OpenCL.dll" "$(OutDir)";
"
+ />
+ </Configuration>
+ </Configurations>
+ <References>
+ </References>
+ <Files>
+ <Filter
+ Name="Source Files"
+ Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
+ UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
+ >
+ <File
+ RelativePath="..\appMain.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\ArithmeticTasks.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\GPUTask.cpp"
+ >
+ </File>
+ <File
+ RelativePath="..\MemoryTasks.cpp"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Header Files"
+ Filter="h;hpp;hxx;hm;inl;inc;xsd"
+ UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
+ >
+ <File
+ RelativePath="..\ArithmeticTasks.h"
+ >
+ </File>
+ <File
+ RelativePath="..\Config.h"
+ >
+ </File>
+ <File
+ RelativePath="..\Conversions.h"
+ >
+ </File>
+ <File
+ RelativePath="..\CPUTask.h"
+ >
+ </File>
+ <File
+ RelativePath="..\GPUTask.h"
+ >
+ </File>
+ <File
+ RelativePath="..\MemoryTasks.h"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="OpenCL Files"
+ >
+ <File
+ RelativePath="..\gpu_add.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_mp.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_mul1.cl"
+ >
+ </File>
+ <File
+ RelativePath="..\gpu_zero.cl"
+ >
+ </File>
+ </Filter>
+ <Filter
+ Name="Data Files"
+ >
+ <File
+ RelativePath="..\LICENSE.txt"
+ >
+ </File>
+ </Filter>
+ </Files>
+ <Globals>
+ </Globals>
+</VisualStudioProject>
|