diff options
author | Tom Stellard <tstellar@redhat.com> | 2017-04-05 20:40:41 +0000 |
---|---|---|
committer | Tom Stellard <tstellar@redhat.com> | 2017-04-06 03:27:02 +0000 |
commit | 158af35e1458d196f3a93d5ee651b74fafdb6e7f (patch) | |
tree | 1fcd6163a58cd178fb6024eaebec93c4b7ceab39 | |
parent | 7ceb1a4fa826910508ef6cb1d1b27529cd999340 (diff) |
Add guda state tracker
This is an experimental prototype; it has only been build-tested and
has not been tested with any GPU hardware. This implementation
passes ptx to the drivers, so drivers will need to implement
ptx->whatever in order for this to actually work.
The implementation so far contains the minimal
implementation required to run the most simple possible cuda program.
There are several shortcuts, and some things like the global
state handling aren't well thought out, but the goal with this
is just to get one program working.
Instructions for testing (these instructions are for fedora, but
are probably similar for other distros):
0. This is how I configured mesa when testing:
$ ./configure --with-llvm-prefix=/usr/local --enable-cuda --disable-glx --with-gallium-drivers=nouveau --disable-egl --disable-llvm --with-dri-drivers= --disable-dri3
1. Install nvidia cuda development tools/headers:
$ curl -O http://developer.download.nvidia.com/compute/cuda/repos/fedora23/x86_64/cuda-repo-fedora23-8.0.61-1.x86_64.rpm
$ rpm -i cuda-repo-fedora23-8.0.61-1.x86_64.rpm
$ dnf install cuda-driver-dev-8-0 cuda-command-line-tools-8-0 cuda-misc-headers-8-0 cuda-command-line-tools-8-0 cuda-curand-dev-8-0
2. Install clang (I've been testing with clang 4.0)
dnf install fedora-repos-rawhide
sudo dnf install clang --enablerepo=rawhide
3. Compile this test program:
$ cat simple.cu
#include <stdio.h>
// Kernel: write an approximation of pi into *a.
// Every launched thread writes the same value, so the intended launch
// configuration is a single thread (<<<1, 1>>>); larger launches are
// harmless but redundant.
__global__ void pi(float* a) {
  // Float literal (3.14159f, not 3.14159): a bare double literal would
  // force a double constant and a double->float conversion in device code.
  *a = 3.14159f;
}
// Minimal host driver: allocate one float on the device, launch the pi
// kernel, copy the result back, and print it.
// NOTE(review): kept deliberately minimal (no CUDA_CHECK-style error
// handling) to match the prototype's goal of "one program working";
// production code should check every cudaError_t return value.
int main(int argc, char* argv[]) {
  const int kDataLen = 1;
  float a = 0.0f;

  // Allocate device memory for the result.
  float* device_a;
  cudaMalloc(&device_a, sizeof(float));

  // Launch the kernel: 1 block of kDataLen threads.
  pi<<<1, kDataLen>>>(device_a);

  // Kernel launches are asynchronous; synchronize before reading back.
  cudaDeviceSynchronize();
  // BUG FIX: copy from the device pointer itself (device_a), not the
  // address of the host-side pointer variable (&device_a) — the original
  // read sizeof(float) bytes of the host pointer, never device memory.
  cudaMemcpy(&a, device_a, sizeof(float), cudaMemcpyDeviceToHost);

  // Print the results.
  for (int i = 0; i < kDataLen; ++i) {
    printf("pi = %f\n", a);
  }

  // Release the device allocation before tearing down the context.
  cudaFree(device_a);
  cudaDeviceReset();
  return 0;
}
$ clang++ simple.cu -O2 -o simple --cuda-gpu-arch=sm_30 -Xcuda-fatbinary -compress -Xcuda-fatbinary false -L/path/to/mesa/lib/gallium -L/path/to/mesa/lib/gallium/ -lcuda -ldl -lrt -pthread -nocudalib
4. Run the program:
$ ./simple
-rw-r--r-- | configure.ac | 9 | ||||
-rw-r--r-- | src/gallium/Makefile.am | 4 | ||||
-rw-r--r-- | src/gallium/include/pipe/p_defines.h | 1 | ||||
-rw-r--r-- | src/gallium/state_trackers/guda/Makefile.am | 17 | ||||
-rw-r--r-- | src/gallium/state_trackers/guda/Makefile.sources | 2 | ||||
-rw-r--r-- | src/gallium/state_trackers/guda/guda.c | 575 | ||||
-rw-r--r-- | src/gallium/targets/cuda/Makefile.am | 32 |
7 files changed, 640 insertions, 0 deletions
diff --git a/configure.ac b/configure.ac index ab9a91ed17b..b37f6e3a340 100644 --- a/configure.ac +++ b/configure.ac @@ -1167,6 +1167,11 @@ AC_ARG_ENABLE([opencl_icd], @<:@default=disabled@:>@])], [enable_opencl_icd="$enableval"], [enable_opencl_icd=no]) +AC_ARG_ENABLE([cuda], + [AS_HELP_STRING([--enable-cuda], + [enable Cuda library @<:@default=disabaled@:>@])], + [enable_cuda="$enableval"], + [enable_cuda=no]) AC_ARG_ENABLE([gallium-tests], [AS_HELP_STRING([--enable-gallium-tests], @@ -1203,6 +1208,7 @@ if test "x$enable_opengl" = xno -a \ "x$enable_vdpau" = xno -a \ "x$enable_omx" = xno -a \ "x$enable_va" = xno -a \ + "x$enable_cuda" = xno -a \ "x$enable_opencl" = xno; then AC_MSG_ERROR([at least one API should be enabled]) fi @@ -2074,6 +2080,7 @@ AM_CONDITIONAL(HAVE_CLOVER_ICD, test "x$enable_opencl_icd" = xyes) AC_SUBST([OPENCL_LIBNAME]) AC_SUBST([CLANG_RESOURCE_DIR]) +AM_CONDITIONAL(HAVE_GUDA, test "x$enable_cuda" = xyes) dnl dnl Gallium configuration dnl @@ -2637,6 +2644,7 @@ AC_CONFIG_FILES([Makefile src/gallium/state_trackers/clover/Makefile src/gallium/state_trackers/dri/Makefile src/gallium/state_trackers/glx/xlib/Makefile + src/gallium/state_trackers/guda/Makefile src/gallium/state_trackers/nine/Makefile src/gallium/state_trackers/omx/Makefile src/gallium/state_trackers/osmesa/Makefile @@ -2644,6 +2652,7 @@ AC_CONFIG_FILES([Makefile src/gallium/state_trackers/vdpau/Makefile src/gallium/state_trackers/xa/Makefile src/gallium/state_trackers/xvmc/Makefile + src/gallium/targets/cuda/Makefile src/gallium/targets/d3dadapter9/Makefile src/gallium/targets/d3dadapter9/d3d.pc src/gallium/targets/dri/Makefile diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am index 38da63b0690..d583337d97e 100644 --- a/src/gallium/Makefile.am +++ b/src/gallium/Makefile.am @@ -137,6 +137,10 @@ if HAVE_CLOVER SUBDIRS += state_trackers/clover targets/opencl endif +if HAVE_GUDA +SUBDIRS += state_trackers/guda targets/cuda +endif + if HAVE_DRICOMMON SUBDIRS += 
state_trackers/dri targets/dri endif diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 924711d4dd3..29c883aeb09 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -843,6 +843,7 @@ enum pipe_shader_ir PIPE_SHADER_IR_LLVM, PIPE_SHADER_IR_NATIVE, PIPE_SHADER_IR_NIR, + PIPE_SHADER_IR_PTX }; /** diff --git a/src/gallium/state_trackers/guda/Makefile.am b/src/gallium/state_trackers/guda/Makefile.am new file mode 100644 index 00000000000..53a207096fa --- /dev/null +++ b/src/gallium/state_trackers/guda/Makefile.am @@ -0,0 +1,17 @@ +include Makefile.sources + +AM_CPPFLAGS = \ + -I$(top_srcdir)/include \ + -I$(top_builddir)/src \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/gallium/include \ + -I$(top_srcdir)/src/gallium/drivers \ + -I$(top_srcdir)/src/gallium/auxiliary \ + -I$(top_srcdir)/src/gallium/winsys \ + -I/usr/local/cuda-8.0/targets/x86_64-linux/include + +noinst_LTLIBRARIES = libguda.la + +libguda_la_SOURCES = $(C_SOURCES) + +EXTRA_DIST = Doxyfile diff --git a/src/gallium/state_trackers/guda/Makefile.sources b/src/gallium/state_trackers/guda/Makefile.sources new file mode 100644 index 00000000000..5314c8f67d3 --- /dev/null +++ b/src/gallium/state_trackers/guda/Makefile.sources @@ -0,0 +1,2 @@ +C_SOURCES := \ + guda.c diff --git a/src/gallium/state_trackers/guda/guda.c b/src/gallium/state_trackers/guda/guda.c new file mode 100644 index 00000000000..92dc662bcb9 --- /dev/null +++ b/src/gallium/state_trackers/guda/guda.c @@ -0,0 +1,575 @@ + +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include <libelf.h> +#include <gelf.h> + +#include <crt/host_runtime.h> +#include "pipe/p_context.h" +#include "pipe/p_screen.h" +#include "pipe/p_state.h" +#include "pipe-loader/pipe_loader.h" +#include "util/list.h" + +/******************************************************************************/ +/* State tracker internal functions / global state */ 
+/******************************************************************************/ + +#define USE_FAKE_CONTEXT 0 + +#define UNUSED_BUFFER_OFFSET ~0 +struct cuda_ctx; + +struct cuda_dev { + struct pipe_loader_device *pipe_dev; + struct pipe_screen *screen; +}; + +struct cuda_buffer { + struct list_head list; + unsigned offset; + struct pipe_resource *res; +}; + +enum cuda_arg_type { + CUDA_ARG_SCALAR, + CUDA_ARG_PTR +}; + +struct cuda_arg_info { + struct list_head list; + enum cuda_arg_type type; + unsigned argno; + unsigned offset; + unsigned size; +}; + +struct cuda_exec_state { + struct list_head list; + dim3 grid_dim; + dim3 block_dim; + size_t shared_mem; + cudaStream_t stream; + char *args; + unsigned arg_size; + struct cuda_arg_info arg_info; + void *compute_state; +}; + +struct cuda_ctx { + struct pipe_context *pipe_ctx; + struct cuda_dev *dev; + struct cuda_buffer buffers; + struct cuda_exec_state exec_state; +}; + +// FIXME Support more than one device function; +struct cuda_fatbin { + char *ptr; + char *dev_func_name; +} global_fatbin; + +struct cuda_dev **cuda_dev_list = NULL; + +// FIXME primary_ctx should be shared across all CPU threads. +struct cuda_ctx **primary_ctx_list = NULL; +struct cuda_ctx *current_ctx = NULL; +size_t current_dev = 0; + +#define CUDA_TRACE(fmt, ...) 
printf("%s: "fmt, __PRETTY_FUNCTION__, __VA_ARGS__) +#define CUDA_TRACE_NOARG(fmt) printf("%s: "fmt, __PRETTY_FUNCTION__) + +static int init_cuda_dev_list() { + struct pipe_loader_device **pipe_dev_list; + int n = pipe_loader_probe(NULL, 0); + int i,j; + + if (n == 0) { + return 0; + } + + pipe_dev_list = malloc(n * sizeof(struct pipe_loader_device*)); + cuda_dev_list = malloc(n * sizeof(struct cuda_dev*)); + + n = pipe_loader_probe(pipe_dev_list, n); + + for (i = 0, j = 0; i < n; i++) { + struct pipe_loader_device *pipe_dev = pipe_dev_list[i]; + struct pipe_screen *screen = pipe_loader_create_screen(pipe_dev); + struct cuda_dev *dev; + if (!screen || !screen->get_param(screen, PIPE_CAP_COMPUTE)) { + if (screen) { + screen->destroy(screen); + } + pipe_loader_release(&pipe_dev, 1); + continue; + } + + dev = malloc(sizeof(struct cuda_dev)); + dev->pipe_dev = pipe_dev; + dev->screen = screen; + cuda_dev_list[j] = dev; + } + + if (j == 0) { + free(pipe_dev_list); + free(cuda_dev_list); + pipe_dev_list = NULL; + cuda_dev_list = NULL; + } + return j; +} + +static void init_primary_ctx_list(int n) { + primary_ctx_list = calloc(n, sizeof(struct cuda_ctx*)); +} + +static struct cuda_ctx *create_cuda_ctx(struct cuda_dev *dev) { + struct cuda_ctx *ctx = malloc(sizeof(struct cuda_ctx)); + if (dev) + ctx->pipe_ctx = dev->screen->context_create(dev->screen, NULL, + PIPE_CONTEXT_COMPUTE_ONLY); + ctx->dev = dev; + memset(&ctx->exec_state, 0, sizeof(ctx->exec_state)); + list_inithead(&ctx->exec_state.list); + memset(&ctx->buffers, 0, sizeof(ctx->buffers)); + list_inithead(&ctx->buffers.list); + return ctx; +} + +static struct cuda_ctx *get_ctx() { +#if USE_FAKE_CONTEXT + if (!current_ctx) { + current_ctx = create_cuda_ctx(NULL); + } +#endif + if (current_ctx) { + return current_ctx; + } + + // There is no current context, so we need to use the primary context. 
+ if (!cuda_dev_list) { + // Is this the best pla + int n = init_cuda_dev_list(); + if (n == 0) { + return NULL; + } + init_primary_ctx_list(n); + } + + if (!primary_ctx_list[current_dev]) { + primary_ctx_list[current_dev] = create_cuda_ctx(cuda_dev_list[current_dev]); + } + + return primary_ctx_list[current_dev]; +} + +static struct cuda_exec_state *get_exec_state() { + struct cuda_ctx *ctx = get_ctx(); + if (!ctx) { + return NULL; + } + return list_first_entry(&ctx->exec_state.list, struct cuda_exec_state, list); +} + + +/******************************************************************************/ +/* Functions calls generated by clang */ +/******************************************************************************/ + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset); +cudaError_t cudaLaunch(const void* func); +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize); +void** __cudaRegisterFatBinary(void *fatCubin); +extern void __cudaUnregisterFatBinary(void **fatCubinHandle); + + +cudaError_t cudaSetupArgument(const void* arg, size_t size, size_t offset) { + CUDA_TRACE("arg = %p size = %ld offset = %ld)\n", arg, size, offset); + struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + if (!exec_state) { + return cudaErrorNoDevice; + } + + if (offset + size > exec_state->arg_size) { + exec_state->arg_size *= 2; + if (!realloc(exec_state->args, exec_state->arg_size)) + return cudaErrorUnknown; + } + + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + if (buf->res == *(void**)arg) { + CUDA_TRACE("Arg is a buffer: %p\n", *(void**)arg); + buf->offset = offset; + } + } + memcpy(exec_state->args + offset, arg, size); + return cudaSuccess; +} + +#if 0 +// This function extrace what I think is NV ISA for the given function +static void 
extract_binary(unsigned char *elf_bin, unsigned elf_size, const char *func_name) { + Elf *elf = elf_memory(elf_bin, elf_size); + size_t section_str_index; + elf_getshdrstrndx(elf, §ion_str_index); + + for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) { + GElf_Shdr header; + if (gelf_getshdr(section, &header) != &header) { + //XXX: Error here + } + + if (strcmp(elf_strptr(elf, section_str_index, header.sh_name), ".symtab")) + continue; + + Elf_Data *const symtab_data = elf_getdata(section, NULL); + GElf_Sym symbol; + GElf_Sym *s; + unsigned i = 0; + + while ((s = gelf_getsym(symtab_data, i++, &symbol))) { + Elf_Scn *symbol_scn; + GElf_Shdr symbol_shdr; + const char *name = elf_strptr(elf, header.sh_link, s->st_name); + size_t symbol_offset; + printf("func_name = %s symbol name = %s idx = %d\n", func_name, name, s->st_shndx); + // FIXME: Not safe! + if (strcmp(name, func_name)) + continue; + + symbol_scn = elf_getscn(elf, s->st_shndx); + gelf_getshdr(symbol_scn, &symbol_shdr); + symbol_offset = symbol_shdr.sh_offset + s->st_value; + printf("size = %d\n", s->st_size); + printf("offset = %d\n", symbol_offset); + for (int j = 0; j < s->st_size; j++) { + printf("%x", *(elf_bin + symbol_offset + j)); + } + } + } + +} +#endif + + +static int read_arginfo(char *elf_bin, unsigned elf_size, + const char *func_name, struct cuda_arg_info *arg_info) { + + Elf *elf = elf_memory(elf_bin, elf_size); + size_t section_str_index; + elf_getshdrstrndx(elf, §ion_str_index); + int req_input_mem = 0; + + for (Elf_Scn *section = elf_nextscn(elf, NULL); section; section = elf_nextscn(elf, section)) { + GElf_Shdr header; + Elf_Data *section_data = NULL; + unsigned int *data; + if (gelf_getshdr(section, &header) != &header) { + //XXX: Error here + } + const char *section_name = + elf_strptr(elf, section_str_index, header.sh_name); + + if (strncmp(section_name, ".nv.info.", 9) || + strcmp(section_name + 9, func_name)) + continue; + + section_data = 
elf_getdata(section, section_data); + data = section_data->d_buf; + + // FIXME: I have no idea what the first 16 bytes are. + data +=4; + + while (data != section_data->d_buf + header.sh_size) { + unsigned dword2, dword3; + struct cuda_arg_info *arg = malloc(sizeof(struct cuda_arg_info)); + list_inithead(&arg->list); + list_add(&arg->list, &arg_info->list); + + // FIXME: I have no idea what the first 8 bytes are. + data+=2; + dword2 = *data++; + dword3 = *data++; + arg->argno = dword2 & 0xffff; + arg->offset = dword2 >> 16; + // FIXME: Not sure about this one. + arg->size = 0x2 << ((dword3 >> 20) & 0xf); + req_input_mem += arg->size; + CUDA_TRACE("argno = %d offset = %d size = %d\n", arg->argno, arg->offset, arg->size); + } + } + return req_input_mem; +} + +static int get_work_dim(dim3 block_dim, dim3 grid_dim) { + if (block_dim.z * grid_dim.z > 1) + return 3; + if (block_dim.y * grid_dim.y > 1) + return 2; + return 1; +} + +cudaError_t cudaLaunch(const void* func) { + CUDA_TRACE("func = %p\n", func); + // FIXME: I think need to use func to look up which function to execute. We + // currently just execute the most recently registered function. 
+ struct cuda_exec_state *exec_state = get_exec_state(); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_arg_info arg_info; + if (!exec_state) { + return cudaErrorNoDevice; + } + + char *nv_fatbin = global_fatbin.ptr; + //uint32_t nv_fatbin_magic = *(uint32_t*)(nv_fatbin); + //uint32_t nv_fatbin_version = *(uint32_t*)(nv_fatbin + 4); + //uint64_t nv_fatbin_size = *(uint64_t *)(nv_fatbin + 8); + char *fatbin = (nv_fatbin + 16); + //uint32_t fatbin_flags = *(uint32_t*)(fatbin); + uint32_t elf_offset = *(uint32_t*)(fatbin + 4); + char *elf = fatbin + elf_offset; + uint32_t elf_size = *(uint32_t*)(fatbin + 8); + char *ptx_bin = elf + elf_size; + //uint32_t unknown = *(uint32_t*)(ptx_bin); + uint32_t ptx_text_offset = *(uint32_t*)(ptx_bin + 4); + char *ptx_text = ptx_bin + ptx_text_offset; + uint32_t ptx_text_size = *(uint32_t*)(ptx_bin + 8); + struct pipe_compute_state compute_state; + struct pipe_grid_info info; + + memset(&arg_info, 0, sizeof(arg_info)); + list_inithead(&arg_info.list); + // FIXME: Should we be compiling ptx in __cudaRegisterFunction() ? 
+ + // TOOO: compile_ptx_to_isa(); + for (unsigned i = 0; i < ptx_text_size; i++) { + printf("%c", ptx_text[i]); + } + + // TODO: Create compute_state and bind + memset(&compute_state, 0, sizeof(compute_state)); + compute_state.ir_type = PIPE_SHADER_IR_PTX; + compute_state.req_input_mem = read_arginfo(elf, elf_size, global_fatbin.dev_func_name, &arg_info); + compute_state.prog = malloc(ptx_text_size + 4); + *(uint32_t *)compute_state.prog = ptx_text_size; + memcpy(((uint32_t*)compute_state.prog) + 1, ptx_text, ptx_text_size); + + info.work_dim = get_work_dim(exec_state->block_dim, exec_state->grid_dim); + memcpy(info.block, &exec_state->block_dim, sizeof(info.block)); + memcpy(info.grid, &exec_state->grid_dim, sizeof(info.grid)); + info.pc = 0; + info.input = exec_state->args; + +#if !USE_FAKE_CONTEXT + exec_state->compute_state = + ctx->pipe_ctx->create_compute_state(ctx->pipe_ctx, &compute_state); + ctx->pipe_ctx->bind_compute_state(ctx->pipe_ctx, exec_state->compute_state); + // TODO: bind_sampler_states + // TODO: set_sampler_views + // TODO: set_compute_resources + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + ctx->pipe_ctx->set_global_binding(ctx->pipe_ctx, 0, 1, &buf->res, + (uint32_t**)&arg_start); + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size = %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + ctx->pipe_ctx->launch_grid(ctx->pipe_ctx, &info); + // TODO: UBNIND + ctx->pipe_ctx->memory_barrier(ctx->pipe_ctx, PIPE_BARRIER_GLOBAL_BUFFER); +#else + list_for_each_entry(struct cuda_buffer, buf, &ctx->buffers.list, list) { + char *arg_start = exec_state->args + buf->offset; + *(struct pipe_resource**)arg_start = buf->res; + } + + CUDA_TRACE("dim = %d block_size = <%d, %d, %d> grid_size = <%d, %d, %d>" + " pc = %d input_size 
= %d\n", info.work_dim, info.block[0], info.block[1], + info.block[2], info.grid[0], info.grid[1], info.grid[2], + info.pc, compute_state.req_input_mem); + + CUDA_TRACE_NOARG("Inputs: \n"); + for (unsigned i = 0; i < compute_state.req_input_mem; i+=4) { + CUDA_TRACE("Dword %d: %x\n", i / 4, *(int*)(exec_state->args + i)); + } +#endif + + return cudaSuccess; +} + +void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, + char *deviceFun, const char *deviceName, + int thread_limit, void *tid, void *bid, + void *bDim, void *gDim, int *wSize) { + CUDA_TRACE("fatCubinHandle = %p hostFun = %s " + "deviceFun = %s deviceName = %s thread_limit = %d " + "tid = %p bid = %p bDim = %p gDim = %p wSize = %p\n", + fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, + tid, bid, bDim, gDim, wSize); + struct cuda_exec_state *exec_state = get_exec_state(); + if (!exec_state) { + return; + } + global_fatbin.dev_func_name = deviceFun; +} + +void** __cudaRegisterFatBinary(void *fatCubin) { + CUDA_TRACE("fatCubin = %p\n", fatCubin); + uint32_t magic = *(uint32_t*)fatCubin; + uint32_t version = *(uint32_t*)(fatCubin + 4); + char *nv_fatbin = *(char**)(fatCubin + 8); + + CUDA_TRACE("magic = %x version = %x nv_fatbin = %p\n", magic, version, nv_fatbin); + global_fatbin.ptr = nv_fatbin; + + // FIXME: What should we return here? 
+ return (void**)&global_fatbin.ptr; +} + + +void __cudaUnregisterFatBinary(void **fatCubinHandle) { + CUDA_TRACE("fatCubinHandle = %p\n", fatCubinHandle); +} + +/******************************************************************************/ +/* Functions calls called by users */ +/******************************************************************************/ +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem /*__dv(0)*/, cudaStream_t stream /*__dv(0)*/); +cudaError_t cudaMalloc(void **devPtr, size_t size); +cudaError_t cudaDeviceSynchronize(void); +cudaError_t cudaDeviceReset(void); + +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { + CUDA_TRACE("dst = %p src = %p count = %zd kind = %d\n", dst, src, count, kind); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_transfer *transfer_src = NULL, *transfer_dst = NULL; + struct pipe_box box; + + if (!ctx || !ctx->dev) { + return cudaErrorNoDevice; + } + + memset(&box, 0, sizeof(box)); + box.width = count; + box.height = 1; + box.depth = 1; + + if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDeviceToDevice) { + dst = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)dst, + 0, PIPE_TRANSFER_WRITE, &box, + &transfer_dst); + if (!dst) { + return cudaErrorInvalidDevicePointer; + } + } + + if (kind == cudaMemcpyDeviceToHost || kind == cudaMemcpyDeviceToDevice) { + src = ctx->pipe_ctx->transfer_map(ctx->pipe_ctx, + (struct pipe_resource *)src, + 0, PIPE_TRANSFER_READ, &box, + &transfer_src); + if (!src) { + return cudaErrorInvalidDevicePointer; + } + } + + memcpy(dst, src, count); + + if (transfer_src) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_src); + } + + if (transfer_dst) { + ctx->pipe_ctx->transfer_unmap(ctx->pipe_ctx, transfer_dst); + } + + return cudaSuccess; +} + +cudaError_t cudaConfigureCall(dim3 gridDim, dim3 
blockDim, size_t sharedMem , cudaStream_t stream) { + CUDA_TRACE("gridDim = <%d, %d, %d> blockDim = <%d, %d, %d> sharedMem = %zd stream = ?\n", + gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, + sharedMem); + struct cuda_ctx *ctx = get_ctx(); + struct cuda_exec_state *state; + + if (!ctx) { + return cudaErrorNoDevice; + } + + state = malloc(sizeof(struct cuda_exec_state)); + state->grid_dim = gridDim; + state->block_dim = blockDim; + state->shared_mem = sharedMem; + state->stream = stream; + state->arg_size = 1024; + state->args = malloc(state->arg_size); + list_inithead(&state->list); + + list_add(&state->list, &ctx->exec_state.list); + return 0; +} + +cudaError_t cudaMalloc(void **devPtr, size_t size) { + CUDA_TRACE("devPtr = %p size = %ld\n", devPtr, size); + struct cuda_ctx *ctx = get_ctx(); + struct pipe_resource info; + struct pipe_resource *res; + struct cuda_buffer *buf; + + if (!ctx) { + return cudaErrorNoDevice; + } + + info.width0 = size; + info.height0 = 1; + info.depth0 = 1; + info.array_size = 1; + info.target = PIPE_BUFFER; + info.bind = PIPE_BIND_SAMPLER_VIEW | + PIPE_BIND_COMPUTE_RESOURCE | + PIPE_BIND_GLOBAL; + +#if USE_FAKE_CONTEXT + (void)info; + res = malloc(sizeof(struct pipe_resource)); +#else + res = ctx->dev->screen->resource_create(ctx->dev->screen, &info); + if (!res) { + return cudaErrorMemoryAllocation; + } +#endif + + buf = malloc(sizeof(struct cuda_buffer)); + buf->res = res; + buf->offset = UNUSED_BUFFER_OFFSET; + list_add(&buf->list, &ctx->buffers.list); + + *devPtr = res; + + return cudaSuccess; +} + +cudaError_t cudaDeviceSynchronize(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} + +cudaError_t cudaDeviceReset(void) { + CUDA_TRACE_NOARG("\n"); + return 0; +} diff --git a/src/gallium/targets/cuda/Makefile.am b/src/gallium/targets/cuda/Makefile.am new file mode 100644 index 00000000000..e11eb9458f4 --- /dev/null +++ b/src/gallium/targets/cuda/Makefile.am @@ -0,0 +1,32 @@ +include 
$(top_srcdir)/src/gallium/Automake.inc + +lib_LTLIBRARIES = libcuda.la + +libcuda_la_LDFLAGS = \ + -no-undefined \ + $(GC_SECTIONS) \ + $(LD_NO_UNDEFINED) + +# -version-number @CUDA_VERSION@:0 + +#if HAVE_LD_VERSION_SCRIPT +#lib@CUDA_LIBNAME@_la_LDFLAGS += \ +# -Wl,--version-script=$(top_srcdir)/src/gallium/targets/opencl/opencl.sym +#endif + +libcuda_la_LIBADD = \ + $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_dynamic.la \ + $(top_builddir)/src/gallium/state_trackers/guda/libguda.la \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/util/libmesautil.la \ + $(LIBELF_LIBS) \ + $(DLOPEN_LIBS) \ + $(PTHREAD_LIBS) + +nodist_EXTRA_libcuda_la_SOURCES = dummy.cpp +libcuda_la_SOURCES = + +#EXTRA_lib@CUDA_LIBNAME@_la_DEPENDENCIES = opencl.sym +#EXTRA_DIST = opencl.sym + +include $(top_srcdir)/install-gallium-links.mk |