author     Tom Stellard <tstellar@redhat.com>    2017-04-05 20:40:41 +0000
committer  Tom Stellard <tstellar@redhat.com>    2017-04-06 03:27:02 +0000
commit     158af35e1458d196f3a93d5ee651b74fafdb6e7f (patch)
tree       1fcd6163a58cd178fb6024eaebec93c4b7ceab39
parent     7ceb1a4fa826910508ef6cb1d1b27529cd999340 (diff)
Add guda state tracker
This is an experimental prototype. It has only been build-tested and has not
been run on any GPU hardware. The implementation passes PTX to the drivers,
so drivers will need to implement a PTX front end for this to actually work.
So far it contains only the minimum required to run the simplest possible
CUDA program. There are several shortcuts, and some things like the global
state handling aren't well thought out, but the goal is just to get one
program working.

Instructions for testing (these instructions are for Fedora, but are probably
similar for other distros):

0. This is how I configured Mesa when testing:

$ ./configure --with-llvm-prefix=/usr/local --enable-cuda --disable-glx --with-gallium-drivers=nouveau --disable-egl --disable-llvm --with-dri-drivers= --disable-dri3

1. Install the NVIDIA CUDA development tools/headers:

$ curl -O http://developer.download.nvidia.com/compute/cuda/repos/fedora23/x86_64/cuda-repo-fedora23-8.0.61-1.x86_64.rpm
$ rpm -i cuda-repo-fedora23-8.0.61-1.x86_64.rpm
$ dnf install cuda-driver-dev-8-0 cuda-command-line-tools-8-0 cuda-misc-headers-8-0 cuda-curand-dev-8-0

2. Install clang (I've been testing with clang 4.0):

$ dnf install fedora-repos-rawhide
$ sudo dnf install clang --enablerepo=rawhide

3. Compile this test program:

$ cat simple.cu
#include <stdio.h>

__global__ void pi(float* a) {
  *a = 3.14159;
}

int main(int argc, char* argv[]) {
  const int kDataLen = 1;

  float a = 0.0f;

  // Allocate device memory for the result.
  float* device_a;
  cudaMalloc(&device_a, sizeof(float));

  // Launch the kernel.
  pi<<<1, kDataLen>>>(device_a);

  // Copy output data to host.
  cudaDeviceSynchronize();
  cudaMemcpy(&a, device_a, sizeof(float), cudaMemcpyDeviceToHost);

  // Print the results.
  for (int i = 0; i < kDataLen; ++i) {
    printf("pi = %f\n", a);
  }

  cudaDeviceReset();
  return 0;
}

$ clang++ simple.cu -O2 -o simple --cuda-gpu-arch=sm_30 \
    -Xcuda-fatbinary -compress -Xcuda-fatbinary false \
    -L/path/to/mesa/lib/gallium -lcuda -ldl -lrt -pthread -nocudalib

4. Run the program:

$ ./simple
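For context, the <<<...>>> launch in the test program is lowered by clang into
the legacy launch API that this state tracker implements. Roughly (a sketch
only; the exact host stub clang emits depends on the compiler version):

  // Approximate lowering of: pi<<<1, kDataLen>>>(device_a);
  dim3 grid  = {1, 1, 1};
  dim3 block = {kDataLen, 1, 1};
  if (cudaConfigureCall(grid, block, 0, 0) == cudaSuccess) {
    cudaSetupArgument(&device_a, sizeof(device_a), 0);
    cudaLaunch((const void *)pi);  // pi here is the host-side stub for the kernel
  }

This is why guda.c provides cudaConfigureCall, cudaSetupArgument and
cudaLaunch in addition to the user-facing API.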
-rw-r--r--  configure.ac                                        9
-rw-r--r--  src/gallium/Makefile.am                             4
-rw-r--r--  src/gallium/include/pipe/p_defines.h                1
-rw-r--r--  src/gallium/state_trackers/guda/Makefile.am        17
-rw-r--r--  src/gallium/state_trackers/guda/Makefile.sources    2
-rw-r--r--  src/gallium/state_trackers/guda/guda.c            575
-rw-r--r--  src/gallium/targets/cuda/Makefile.am               32
7 files changed, 640 insertions, 0 deletions
diff --git a/configure.ac b/configure.ac
index ab9a91ed17b..b37f6e3a340 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1167,6 +1167,11 @@ AC_ARG_ENABLE([opencl_icd],
@<:@default=disabled@:>@])],
[enable_opencl_icd="$enableval"],
[enable_opencl_icd=no])
+AC_ARG_ENABLE([cuda],
+ [AS_HELP_STRING([--enable-cuda],
+ [enable CUDA library @<:@default=disabled@:>@])],
+ [enable_cuda="$enableval"],
+ [enable_cuda=no])
AC_ARG_ENABLE([gallium-tests],
[AS_HELP_STRING([--enable-gallium-tests],
@@ -1203,6 +1208,7 @@ if test "x$enable_opengl" = xno -a \
"x$enable_vdpau" = xno -a \
"x$enable_omx" = xno -a \
"x$enable_va" = xno -a \
+ "x$enable_cuda" = xno -a \
"x$enable_opencl" = xno; then
AC_MSG_ERROR([at least one API should be enabled])
fi
@@ -2074,6 +2080,7 @@ AM_CONDITIONAL(HAVE_CLOVER_ICD, test "x$enable_opencl_icd" = xyes)
AC_SUBST([OPENCL_LIBNAME])
AC_SUBST([CLANG_RESOURCE_DIR])
+AM_CONDITIONAL(HAVE_GUDA, test "x$enable_cuda" = xyes)
dnl
dnl Gallium configuration
dnl
@@ -2637,6 +2644,7 @@ AC_CONFIG_FILES([Makefile
src/gallium/state_trackers/clover/Makefile
src/gallium/state_trackers/dri/Makefile
src/gallium/state_trackers/glx/xlib/Makefile
+ src/gallium/state_trackers/guda/Makefile
src/gallium/state_trackers/nine/Makefile
src/gallium/state_trackers/omx/Makefile
src/gallium/state_trackers/osmesa/Makefile
@@ -2644,6 +2652,7 @@ AC_CONFIG_FILES([Makefile
src/gallium/state_trackers/vdpau/Makefile
src/gallium/state_trackers/xa/Makefile
src/gallium/state_trackers/xvmc/Makefile
+ src/gallium/targets/cuda/Makefile
src/gallium/targets/d3dadapter9/Makefile
src/gallium/targets/d3dadapter9/d3d.pc
src/gallium/targets/dri/Makefile
diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am
index 38da63b0690..d583337d97e 100644
--- a/src/gallium/Makefile.am
+++ b/src/gallium/Makefile.am
@@ -137,6 +137,10 @@ if HAVE_CLOVER
SUBDIRS += state_trackers/clover targets/opencl
endif
+if HAVE_GUDA
+SUBDIRS += state_trackers/guda targets/cuda
+endif
+
if HAVE_DRICOMMON
SUBDIRS += state_trackers/dri targets/dri
endif
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 924711d4dd3..29c883aeb09 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -843,6 +843,7 @@ enum pipe_shader_ir
PIPE_SHADER_IR_LLVM,
PIPE_SHADER_IR_NATIVE,
PIPE_SHADER_IR_NIR,
+ PIPE_SHADER_IR_PTX
};
/**
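A gallium driver that wants to consume this will need to accept the new IR
type in its create_compute_state hook. A minimal, hypothetical sketch (not
part of this patch), assuming the [32-bit size][PTX text] packing that guda.c
below uses for pipe_compute_state::prog:

  static void *example_create_compute_state(struct pipe_context *pipe,
                                             const struct pipe_compute_state *cso)
  {
     if (cso->ir_type == PIPE_SHADER_IR_PTX) {
        /* guda packs the program as a 32-bit byte count followed by PTX text. */
        uint32_t ptx_size = *(const uint32_t *)cso->prog;
        const char *ptx_text = (const char *)cso->prog + sizeof(uint32_t);
        /* ... feed ptx_text[0..ptx_size) to the driver's PTX front end ... */
        (void)ptx_size;
        (void)ptx_text;
     }
     /* ... existing TGSI/NATIVE paths ... */
     return NULL;
  }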
diff --git a/src/gallium/state_trackers/guda/Makefile.am b/src/gallium/state_trackers/guda/Makefile.am
new file mode 100644
index 00000000000..53a207096fa
--- /dev/null
+++ b/src/gallium/state_trackers/guda/Makefile.am
@@ -0,0 +1,17 @@
+include Makefile.sources
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/include \
+ -I$(top_builddir)/src \
+ -I$(top_srcdir)/src \
+ -I$(top_srcdir)/src/gallium/include \
+ -I$(top_srcdir)/src/gallium/drivers \
+ -I$(top_srcdir)/src/gallium/auxiliary \
+ -I$(top_srcdir)/src/gallium/winsys \
+ -I/usr/local/cuda-8.0/targets/x86_64-linux/include
+
+noinst_LTLIBRARIES = libguda.la
+
+libguda_la_SOURCES = $(C_SOURCES)
+
+EXTRA_DIST = Doxyfile
diff --git a/src/gallium/state_trackers/guda/Makefile.sources b/src/gallium/state_trackers/guda/Makefile.sources
new file mode 100644
index 00000000000..5314c8f67d3
--- /dev/null
+++ b/src/gallium/state_trackers/guda/Makefile.sources
@@ -0,0 +1,2 @@
+C_SOURCES := \
+ guda.c
diff --git a/src/gallium/state_trackers/guda/guda.c b/src/gallium/state_trackers/guda/guda.c
new file mode 100644
index 00000000000..92dc662bcb9
--- /dev/null
+++ b/src/gallium/state_trackers/guda/guda.c
@@ -0,0 +1,575 @@
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <libelf.h>
+#include <gelf.h>
+
+#include <crt/host_runtime.h>
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
+#include "pipe/p_state.h"
+#include "pipe-loader/pipe_loader.h"
+#include "util/list.h"
+
+/******************************************************************************/
+/* State tracker internal functions / global state */
+/******************************************************************************/
+
+#define USE_FAKE_CONTEXT 0
+
+#define UNUSED_BUFFER_OFFSET ~0
+struct cuda_ctx;
+
+struct cuda_dev {
+ struct pipe_loader_device *pipe_dev;
+ struct pipe_screen *screen;
+};
+
+struct cuda_buffer {
+ struct list_head list;
+ unsigned offset;
+ struct pipe_resource *res;
+};
+
+enum cuda_arg_type {
+ CUDA_ARG_SCALAR,
+ CUDA_ARG_PTR
+};
+
+struct cuda_arg_info {
+ struct list_head list;
+ enum cuda_arg_type type;
+ unsigned argno;
+ unsigned offset;
+ unsigned size;
+};
+
+struct cuda_exec_state {
+ struct list_head list;
+ dim3 grid_dim;
+ dim3 block_dim;
+ size_t shared_mem;
+ cudaStream_t stream;
+ char *args;
+ unsigned arg_size;
+ struct cuda_arg_info arg_info;
+ void *compute_state;
+};
+
+struct cuda_ctx {
+ struct pipe_context *pipe_ctx;
+ struct cuda_dev *dev;
+ struct cuda_buffer buffers;
+ struct cuda_exec_state exec_state;
+};
+
+// FIXME: Support more than one device function.
+struct cuda_fatbin {
+ char *ptr;
+ char *dev_func_name;
+} global_fatbin;
+
+struct cuda_dev **cuda_dev_list = NULL;
+
+// FIXME primary_ctx should be shared across all CPU threads.
+struct cuda_ctx **primary_ctx_list = NULL;
+struct cuda_ctx *current_ctx = NULL;
+size_t current_dev = 0;
+
+#define CUDA_TRACE(fmt, ...) printf("%s: "fmt, __PRETTY_FUNCTION__, __VA_ARGS__)
+#define CUDA_TRACE_NOARG(fmt) printf("%s: "fmt, __PRETTY_FUNCTION__)
+
+static int init_cuda_dev_list() {
+ struct pipe_loader_device **pipe_dev_list;
+ int n = pipe_loader_probe(NULL, 0);
+ int i,j;
+
+ if (n == 0) {
+ return 0;
+ }
+
+ pipe_dev_list = malloc(n * sizeof(struct pipe_loader_device*));
+ cuda_dev_list = malloc(n * sizeof(struct cuda_dev*));
+
+ n = pipe_loader_probe(pipe_dev_list, n);
+
+ for (i = 0, j = 0; i < n; i++) {
+ struct pipe_loader_device *pipe_dev = pipe_dev_list[i];
+ struct pipe_screen *screen = pipe_loader_create_screen(pipe_dev);
+ struct cuda_dev *dev;
+ if (!screen || !screen->get_param(screen, PIPE_CAP_COMPUTE)) {
+ if (screen) {
+ screen->destroy(screen);
+ }
+ pipe_loader_release(&pipe_dev, 1);
+ continue;
+ }
+
+ dev = malloc(sizeof(struct cuda_dev));
+ dev->pipe_dev = pipe_dev;
+ dev->screen = screen;
+ cuda_dev_list[j++] = dev;
+ }
+
+ if (j == 0) {
+ free(pipe_dev_list);
+ free(cuda_dev_list);
+ pipe_dev_list = NULL;
+ cuda_dev_list = NULL;
+ }
+ return j;
+}
+
+static void init_primary_ctx_list(int n) {
+ primary_ctx_list = calloc(n, sizeof(struct cuda_ctx*));
+}
+
+static struct cuda_ctx *create_cuda_ctx(struct cuda_dev *dev) {
+ struct cuda_ctx *ctx = malloc(sizeof(struct cuda_ctx));
+ if (dev)
+ ctx->pipe_ctx = dev->screen->context_create(dev->screen, NULL,
+ PIPE_CONTEXT_COMPUTE_ONLY);
+ ctx->dev = dev;
+ memset(&ctx->exec_state, 0, sizeof(ctx->exec_state));
+ list_inithead(&ctx->exec_state.list);
+ memset(&ctx->buffers, 0, sizeof(ctx->buffers));
+ list_inithead(&ctx->buffers.list);
+ return ctx;
+}
+
+static struct cuda_ctx *get_ctx() {
+#if USE_FAKE_CONTEXT
+ if (!current_ctx) {
+ current_ctx = create_cuda_ctx(NULL);
+ }
+#endif
+ if (current_ctx) {
+ return current_ctx;
+ }
+
+ // There is no current context, so we need to use the primary context.
+ if (!cuda_dev_list) {
+ // Is this the best place to do this?
+ int n = init_cuda_dev_list();
+ if (n == 0) {
+ return NULL;
+ }
+ init_primary_ctx_list(n);
+ }
+
+ if (!primary_ctx_list[current_dev]) {
+ primary_ctx_list[current_dev] = create_cuda_ctx(cuda_dev_list[current_dev]);
+ }
+
+ return primary_ctx_list[current_dev];
+}
+
+static struct cuda_exec_state *get_exec_state() {
+ struct cuda_ctx *ctx = get_ctx();
+ if (!ctx) {
+ return NULL;
+ }
+ return list_first_entry(&ctx->exec_state.list, struct cuda_exec_state, list);
+}
+
+
+/******************************************************************************/
+/* Function calls generated by clang */
+/******************************************************************************/
+
+cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset);
+cudaError_t cudaLaunch(const void* func);
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
+ char *deviceFun, const char *deviceName,
+ int thread_limit, void *tid, void *bid,
+ void *bDim, void *gDim, int *wSize);
+void** __cudaRegisterFatBinary(void *fatCubin);
+extern void __cudaUnregisterFatBinary(void **fatCubinHandle);
+
+
+cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset) {
+ CUDA_TRACE("arg = %p size = %ld offset = %ld)\n", arg, size, offset);
+ struct cuda_exec_state *exec_state = get_exec_state();
+ struct cuda_ctx *ctx = get_ctx();
+ if (!exec_state) {
+ return cudaErrorNoDevice;
+ }
+
+ if (offset + size > exec_state->arg_size) {
+ exec_state->arg_size *= 2;
+ if (!(exec_state->args = realloc(exec_state->args, exec_state->arg_size)))
+ return cudaErrorUnknown;
+ }
+
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ if (buf->res == *(void**)arg) {
+ CUDA_TRACE("Arg is a buffer: %p\n", *(void**)arg);
+ buf->offset = offset;
+ }
+ }
+ memcpy(exec_state->args + offset, arg, size);
+ return cudaSuccess;
+}
+
+#if 0
+// This function extracts what I think is the NV ISA for the given function
+static void extract_binary(unsigned char *elf_bin, unsigned elf_size, const char *func_name) {
+ Elf *elf = elf_memory(elf_bin, elf_size);
+ size_t section_str_index;
+ elf_getshdrstrndx(elf, &section_str_index);
+
+ for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
+ GElf_Shdr header;
+ if (gelf_getshdr(section, &header) != &header) {
+ //XXX: Error here
+ }
+
+ if (strcmp(elf_strptr(elf, section_str_index, header.sh_name), ".symtab"))
+ continue;
+
+ Elf_Data *const symtab_data = elf_getdata(section, NULL);
+ GElf_Sym symbol;
+ GElf_Sym *s;
+ unsigned i = 0;
+
+ while ((s = gelf_getsym(symtab_data, i++, &symbol))) {
+ Elf_Scn *symbol_scn;
+ GElf_Shdr symbol_shdr;
+ const char *name = elf_strptr(elf, header.sh_link, s->st_name);
+ size_t symbol_offset;
+ printf("func_name = %s symbol name = %s idx = %d\n", func_name, name, s->st_shndx);
+ // FIXME: Not safe!
+ if (strcmp(name, func_name))
+ continue;
+
+ symbol_scn = elf_getscn(elf, s->st_shndx);
+ gelf_getshdr(symbol_scn, &symbol_shdr);
+ symbol_offset = symbol_shdr.sh_offset + s->st_value;
+ printf("size = %d\n", s->st_size);
+ printf("offset = %d\n", symbol_offset);
+ for (int j = 0; j < s->st_size; j++) {
+ printf("%x", *(elf_bin + symbol_offset + j));
+ }
+ }
+ }
+
+}
+#endif
+
+
+static int read_arginfo(char *elf_bin, unsigned elf_size,
+ const char *func_name, struct cuda_arg_info *arg_info) {
+
+ Elf *elf = elf_memory(elf_bin, elf_size);
+ size_t section_str_index;
+ elf_getshdrstrndx(elf, &section_str_index);
+ int req_input_mem = 0;
+
+ for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) {
+ GElf_Shdr header;
+ Elf_Data *section_data = NULL;
+ unsigned int *data;
+ if (gelf_getshdr(section, &header) != &header) {
+ //XXX: Error here
+ }
+ const char *section_name =
+ elf_strptr(elf, section_str_index, header.sh_name);
+
+ if (strncmp(section_name, ".nv.info.", 9) ||
+ strcmp(section_name + 9, func_name))
+ continue;
+
+ section_data = elf_getdata(section, section_data);
+ data = section_data->d_buf;
+
+ // FIXME: I have no idea what the first 16 bytes are.
+ data +=4;
+
+ while (data != section_data->d_buf + header.sh_size) {
+ unsigned dword2, dword3;
+ struct cuda_arg_info *arg = malloc(sizeof(struct cuda_arg_info));
+ list_inithead(&arg->list);
+ list_add(&arg->list, &arg_info->list);
+
+ // FIXME: I have no idea what the first 8 bytes are.
+ data+=2;
+ dword2 = *data++;
+ dword3 = *data++;
+ arg->argno = dword2 & 0xffff;
+ arg->offset = dword2 >> 16;
+ // FIXME: Not sure about this one.
+ arg->size = 0x2 << ((dword3 >> 20) & 0xf);
+ req_input_mem += arg->size;
+ CUDA_TRACE("argno = %d offset = %d size = %d\n", arg->argno, arg->offset, arg->size);
+ }
+ }
+ return req_input_mem;
+}
+
+static int get_work_dim(dim3 block_dim, dim3 grid_dim) {
+ if (block_dim.z * grid_dim.z > 1)
+ return 3;
+ if (block_dim.y * grid_dim.y > 1)
+ return 2;
+ return 1;
+}
+
+cudaError_t cudaLaunch(const void* func) {
+ CUDA_TRACE("func = %p\n", func);
+ // FIXME: I think we need to use func to look up which function to execute. We
+ // currently just execute the most recently registered function.
+ struct cuda_exec_state *exec_state = get_exec_state();
+ struct cuda_ctx *ctx = get_ctx();
+ struct cuda_arg_info arg_info;
+ if (!exec_state) {
+ return cudaErrorNoDevice;
+ }
+
+ char *nv_fatbin = global_fatbin.ptr;
+ //uint32_t nv_fatbin_magic = *(uint32_t*)(nv_fatbin);
+ //uint32_t nv_fatbin_version = *(uint32_t*)(nv_fatbin + 4);
+ //uint64_t nv_fatbin_size = *(uint64_t *)(nv_fatbin + 8);
+ char *fatbin = (nv_fatbin + 16);
+ //uint32_t fatbin_flags = *(uint32_t*)(fatbin);
+ uint32_t elf_offset = *(uint32_t*)(fatbin + 4);
+ char *elf = fatbin + elf_offset;
+ uint32_t elf_size = *(uint32_t*)(fatbin + 8);
+ char *ptx_bin = elf + elf_size;
+ //uint32_t unknown = *(uint32_t*)(ptx_bin);
+ uint32_t ptx_text_offset = *(uint32_t*)(ptx_bin + 4);
+ char *ptx_text = ptx_bin + ptx_text_offset;
+ uint32_t ptx_text_size = *(uint32_t*)(ptx_bin + 8);
+ struct pipe_compute_state compute_state;
+ struct pipe_grid_info info;
+
+ memset(&arg_info, 0, sizeof(arg_info));
+ list_inithead(&arg_info.list);
+ // FIXME: Should we be compiling ptx in __cudaRegisterFunction() ?
+
+ // TODO: compile_ptx_to_isa();
+ for (unsigned i = 0; i < ptx_text_size; i++) {
+ printf("%c", ptx_text[i]);
+ }
+
+ // TODO: Create compute_state and bind
+ memset(&compute_state, 0, sizeof(compute_state));
+ compute_state.ir_type = PIPE_SHADER_IR_PTX;
+ compute_state.req_input_mem = read_arginfo(elf, elf_size, global_fatbin.dev_func_name, &arg_info);
+ compute_state.prog = malloc(ptx_text_size + 4);
+ *(uint32_t *)compute_state.prog = ptx_text_size;
+ memcpy(((uint32_t*)compute_state.prog) + 1, ptx_text, ptx_text_size);
+
+ memset(&info, 0, sizeof(info));
+ info.work_dim = get_work_dim(exec_state->block_dim, exec_state->grid_dim);
+ memcpy(info.block, &exec_state->block_dim, sizeof(info.block));
+ memcpy(info.grid, &exec_state->grid_dim, sizeof(info.grid));
+ info.pc = 0;
+ info.input = exec_state->args;
+
+#if !USE_FAKE_CONTEXT
+ exec_state->compute_state =
+ ctx->pipe_ctx->create_compute_state(ctx->pipe_ctx, &compute_state);
+ ctx->pipe_ctx->bind_compute_state(ctx->pipe_ctx, exec_state->compute_state);
+ // TODO: bind_sampler_states
+ // TODO: set_sampler_views
+ // TODO: set_compute_resources
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ char *arg_start = exec_state->args + buf->offset;
+ ctx->pipe_ctx->set_global_binding(ctx->pipe_ctx, 0, 1, &buf->res,
+ (uint32_t**)&arg_start);
+ }
+
+ CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>"
+ " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1],
+ info.block[2], info.grid[0], info.grid[1], info.grid[2],
+ info.pc, compute_state.req_input_mem);
+
+ ctx->pipe_ctx->launch_grid(ctx->pipe_ctx, &info);
+ // TODO: Unbind
+ ctx->pipe_ctx->memory_barrier(ctx->pipe_ctx, PIPE_BARRIER_GLOBAL_BUFFER);
+#else
+ list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) {
+ char *arg_start = exec_state->args + buf->offset;
+ *(struct pipe_resource**)arg_start = buf->res;
+ }
+
+ CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>"
+ " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1],
+ info.block[2], info.grid[0], info.grid[1], info.grid[2],
+ info.pc, compute_state.req_input_mem);
+
+ CUDA_TRACE_NOARG("Inputs: \n");
+ for (unsigned i = 0; i < compute_state.req_input_mem; i+=4) {
+ CUDA_TRACE("Dword %d: %x\n", i / 4, *(int*)(exec_state->args + i));
+ }
+#endif
+
+ return cudaSuccess;
+}
+
+void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun,
+ char *deviceFun, const char *deviceName,
+ int thread_limit, void *tid, void *bid,
+ void *bDim, void *gDim, int *wSize) {
+ CUDA_TRACE("fatCubinHandle = %p hostFun = %s "
+ "deviceFun = %s deviceName = %s thread_limit = %d "
+ "tid = %p bid = %p bDim = %p gDim = %p wSize = %p\n",
+ fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit,
+ tid, bid, bDim, gDim, wSize);
+ struct cuda_exec_state *exec_state = get_exec_state();
+ if (!exec_state) {
+ return;
+ }
+ global_fatbin.dev_func_name = deviceFun;
+}
+
+void** __cudaRegisterFatBinary(void *fatCubin) {
+ CUDA_TRACE("fatCubin = %p\n", fatCubin);
+ uint32_t magic = *(uint32_t*)fatCubin;
+ uint32_t version = *(uint32_t*)(fatCubin + 4);
+ char *nv_fatbin = *(char**)(fatCubin + 8);
+
+ CUDA_TRACE("magic = %x version = %x nv_fatbin = %p\n", magic, version, nv_fatbin);
+ global_fatbin.ptr = nv_fatbin;
+
+ // FIXME: What should we return here?
+ return (void**)&global_fatbin.ptr;
+}
+
+
+void __cudaUnregisterFatBinary(void **fatCubinHandle) {
+ CUDA_TRACE("fatCubinHandle = %p\n", fatCubinHandle);
+}
+
+/******************************************************************************/
+/* Functions called directly by users */
+/******************************************************************************/
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem /*__dv(0)*/, cudaStream_t stream /*__dv(0)*/);
+cudaError_t cudaMalloc(void **devPtr, size_t size);
+cudaError_t cudaDeviceSynchronize(void);
+cudaError_t cudaDeviceReset(void);
+
+cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) {
+ CUDA_TRACE("dst = %p src = %p count = %zd kind = %d\n", dst, src, count, kind);
+ struct cuda_ctx *ctx = get_ctx();
+ struct pipe_transfer *transfer_src = NULL, *transfer_dst = NULL;
+ struct pipe_box box;
+
+ if (!ctx || !ctx->dev) {
+ return cudaErrorNoDevice;
+ }
+
+ memset(&box, 0, sizeof(box));
+ box.width = count;
+ box.height = 1;
+ box.depth = 1;
+
+ if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToDevice) {
+ dst = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx,
+ (struct pipe_resource *)dst,
+ 0, PIPE_TRANSFER_WRITE, &box,
+ &transfer_dst);
+ if (!dst) {
+ return cudaErrorInvalidDevicePointer;
+ }
+ }
+
+ if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) {
+ src = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx,
+ (struct pipe_resource *)src,
+ 0, PIPE_TRANSFER_READ, &box,
+ &transfer_src);
+ if (!src) {
+ return cudaErrorInvalidDevicePointer;
+ }
+ }
+
+ memcpy(dst, src, count);
+
+ if (transfer_src) {
+ ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_src);
+ }
+
+ if (transfer_dst) {
+ ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_dst);
+ }
+
+ return cudaSuccess;
+}
+
+cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem , cudaStream_t stream) {
+ CUDA_TRACE("gridDim = <%d, %d, %d> blockDim = <%d, %d, %d> sharedMem = %zd stream = ?\n",
+ gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z,
+ sharedMem);
+ struct cuda_ctx *ctx = get_ctx();
+ struct cuda_exec_state *state;
+
+ if (!ctx) {
+ return cudaErrorNoDevice;
+ }
+
+ state = malloc(sizeof(struct cuda_exec_state));
+ state->grid_dim = gridDim;
+ state->block_dim = blockDim;
+ state->shared_mem = sharedMem;
+ state->stream = stream;
+ state->arg_size = 1024;
+ state->args = malloc(state->arg_size);
+ list_inithead(&state->list);
+
+ list_add(&state->list, &ctx->exec_state.list);
+ return 0;
+}
+
+cudaError_t cudaMalloc(void **devPtr, size_t size) {
+ CUDA_TRACE("devPtr = %p size = %ld\n", devPtr, size);
+ struct cuda_ctx *ctx = get_ctx();
+ struct pipe_resource info;
+ struct pipe_resource *res;
+ struct cuda_buffer *buf;
+
+ if (!ctx) {
+ return cudaErrorNoDevice;
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.width0 = size;
+ info.height0 = 1;
+ info.depth0 = 1;
+ info.array_size = 1;
+ info.target = PIPE_BUFFER;
+ info.bind = PIPE_BIND_SAMPLER_VIEW |
+ PIPE_BIND_COMPUTE_RESOURCE |
+ PIPE_BIND_GLOBAL;
+
+#if USE_FAKE_CONTEXT
+ (void)info;
+ res = malloc(sizeof(struct pipe_resource));
+#else
+ res = ctx->dev->screen->resource_create(ctx->dev->screen, &info);
+ if (!res) {
+ return cudaErrorMemoryAllocation;
+ }
+#endif
+
+ buf = malloc(sizeof(struct cuda_buffer));
+ buf->res = res;
+ buf->offset = UNUSED_BUFFER_OFFSET;
+ list_add(&buf->list, &ctx->buffers.list);
+
+ *devPtr = res;
+
+ return cudaSuccess;
+}
+
+cudaError_t cudaDeviceSynchronize(void) {
+ CUDA_TRACE_NOARG("\n");
+ return 0;
+}
+
+cudaError_t cudaDeviceReset(void) {
+ CUDA_TRACE_NOARG("\n");
+ return 0;
+}
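To illustrate the per-argument decoding done by read_arginfo() above (the
.nv.info layout is partly reverse engineered, so this only shows what the
code assumes), a hypothetical entry for a single 8-byte pointer argument
would decode like this:

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
     /* hypothetical .nv.info entry: argno/offset in dword2, size field in dword3 */
     uint32_t dword2 = 0x00000000;  /* argno = 0 (low 16 bits), offset = 0 (high 16 bits) */
     uint32_t dword3 = 0x00200000;  /* bits 20..23 encode the size exponent */

     unsigned argno  = dword2 & 0xffff;               /* 0 */
     unsigned offset = dword2 >> 16;                  /* 0 */
     unsigned size   = 0x2 << ((dword3 >> 20) & 0xf); /* 0x2 << 2 = 8 bytes */

     printf("argno = %u offset = %u size = %u\n", argno, offset, size);
     return 0;
  }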
diff --git a/src/gallium/targets/cuda/Makefile.am b/src/gallium/targets/cuda/Makefile.am
new file mode 100644
index 00000000000..e11eb9458f4
--- /dev/null
+++ b/src/gallium/targets/cuda/Makefile.am
@@ -0,0 +1,32 @@
+include $(top_srcdir)/src/gallium/Automake.inc
+
+lib_LTLIBRARIES = libcuda.la
+
+libcuda_la_LDFLAGS = \
+ -no-undefined \
+ $(GC_SECTIONS) \
+ $(LD_NO_UNDEFINED)
+
+# -version-number @CUDA_VERSION@:0
+
+#if HAVE_LD_VERSION_SCRIPT
+#lib@CUDA_LIBNAME@_la_LDFLAGS += \
+# -Wl,--version-script=$(top_srcdir)/src/gallium/targets/opencl/opencl.sym
+#endif
+
+libcuda_la_LIBADD = \
+ $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la \
+ $(top_builddir)/src/gallium/state_trackers/guda/libguda.la \
+ $(top_builddir)/src/gallium/auxiliary/libgallium.la \
+ $(top_builddir)/src/util/libmesautil.la \
+ $(LIBELF_LIBS) \
+ $(DLOPEN_LIBS) \
+ $(PTHREAD_LIBS)
+
+nodist_EXTRA_libcuda_la_SOURCES = dummy.cpp
+libcuda_la_SOURCES =
+
+#EXTRA_lib@CUDA_LIBNAME@_la_DEPENDENCIES = opencl.sym
+#EXTRA_DIST = opencl.sym
+
+include $(top_srcdir)/install-gallium-links.mk