diff options
author | Guo Yejun <yejun.guo@intel.com> | 2014-11-07 16:18:54 +0800 |
---|---|---|
committer | Zhigang Gong <zhigang.gong@intel.com> | 2014-11-07 15:50:41 +0800 |
commit | 075390db926de7bfd2ac853404ab1bcfc8b9c650 (patch) | |
tree | 9f70873dae000f4374025cfd27b3714562242b4d /src | |
parent | 54594b626c31a68956af97f69dc29132dc545f7c (diff) |
support CL_MEM_USE_HOST_PTR with userptr for cl buffer
userptr is used to wrap a page-aligned memory pointer supplied
by user space into a buffer object accessed by the GPU, so no extra
copy is needed. It is supported starting from Linux kernel 3.16
and libdrm 2.4.58.
This patch was originally written by Zhenyu Wang <zhenyuw@linux.intel.com>;
I made a few small changes and did some code cleanup.
No regressions were found on IVB + Ubuntu 14.10 (with libdrm upgraded) using these tests:
beignet/utests, piglit, OpenCV/test&perf, conformance/basic&mem_host_flags&buffers
V2: add a page-alignment limit for the data size; add comments for kernels without MMU_NOTIFIER
V3: add a runtime check of host_unified_memory; return CL_MEM_OBJECT_ALLOCATION_FAILURE on failure
Signed-off-by: Guo Yejun <yejun.guo@intel.com>
Reviewed-by: Zhigang Gong <zhigang.gong@linux.intel.com>
Reviewed-by: Zhenyu Wang <zhenyuw@linux.intel.com>
Diffstat (limited to 'src')
-rw-r--r-- | src/CMakeLists.txt | 5 | ||||
-rw-r--r-- | src/cl_api.c | 10 | ||||
-rw-r--r-- | src/cl_driver.h | 3 | ||||
-rw-r--r-- | src/cl_driver_defs.c | 1 | ||||
-rw-r--r-- | src/cl_enqueue.c | 19 | ||||
-rw-r--r-- | src/cl_mem.c | 37 | ||||
-rw-r--r-- | src/cl_mem.h | 2 | ||||
-rw-r--r-- | src/cl_mem_gl.c | 2 | ||||
-rw-r--r-- | src/intel/intel_driver.c | 15 |
9 files changed, 78 insertions, 16 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fc5de89e..7182bada 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -109,6 +109,11 @@ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}") SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}") endif (OCLIcd_FOUND) +if (DRM_INTEL_USERPTR) +SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}") +SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}") +endif (DRM_INTEL_USERPTR) + set(GIT_SHA1 "git_sha1.h") add_custom_target(${GIT_SHA1} ALL COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh diff --git a/src/cl_api.c b/src/cl_api.c index 05d30933..1f246386 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -2665,9 +2665,13 @@ clEnqueueMapBuffer(cl_command_queue command_queue, ptr = data->ptr; if(event) cl_event_set_status(*event, CL_COMPLETE); } else { - if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) { - err = CL_MAP_FAILURE; - goto error; + if (buffer->is_userptr) + ptr = buffer->host_ptr; + else { + if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) { + err = CL_MAP_FAILURE; + goto error; + } } } err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL); diff --git a/src/cl_driver.h b/src/cl_driver.h index 638b791c..8697ff2c 100644 --- a/src/cl_driver.h +++ b/src/cl_driver.h @@ -285,6 +285,9 @@ extern cl_gpgpu_walker_cb *cl_gpgpu_walker; typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t); extern cl_buffer_alloc_cb *cl_buffer_alloc; +typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long); +extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr; + /* Set a buffer's tiling mode */ typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride); extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling; diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c index c31b6fc4..1335c20f 100644 --- a/src/cl_driver_defs.c +++ b/src/cl_driver_defs.c @@ -29,6 +29,7 @@ LOCAL cl_driver_get_device_id_cb 
*cl_driver_get_device_id = NULL; /* Buffer */ LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL; +LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL; LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL; LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL; LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL; diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c index db0bce74..5bdb7cd7 100644 --- a/src/cl_enqueue.c +++ b/src/cl_enqueue.c @@ -234,11 +234,15 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data) mem->type == CL_MEM_SUBBUFFER_TYPE); struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem; - if(data->unsync_map == 1) - //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here - ptr = cl_mem_map_gtt(mem); - else - ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0); + if (mem->is_userptr) + ptr = mem->host_ptr; + else { + if(data->unsync_map == 1) + //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here + ptr = cl_mem_map_gtt(mem); + else + ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0); + } if (ptr == NULL) { err = CL_MAP_FAILURE; @@ -246,7 +250,7 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data) } data->ptr = ptr; - if(mem->flags & CL_MEM_USE_HOST_PTR) { + if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) { assert(mem->host_ptr); ptr = (char*)ptr + data->offset + buffer->sub_offset; memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size); @@ -331,7 +335,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data) assert(mapped_ptr >= memobj->host_ptr && mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size); /* Sync the data. 
*/ - memcpy(v_ptr, mapped_ptr, mapped_size); + if (!memobj->is_userptr) + memcpy(v_ptr, mapped_ptr, mapped_size); } else { CHECK_IMAGE(memobj, image); diff --git a/src/cl_mem.c b/src/cl_mem.c index 16bd6135..d3199668 100644 --- a/src/cl_mem.c +++ b/src/cl_mem.c @@ -33,6 +33,7 @@ #include <assert.h> #include <stdio.h> #include <string.h> +#include <unistd.h> #define FIELD_SIZE(CASE,TYPE) \ case JOIN(CL_,CASE): \ @@ -223,6 +224,7 @@ cl_mem_allocate(enum cl_mem_type type, cl_mem_flags flags, size_t sz, cl_int is_tiled, + void *host_ptr, cl_int *errcode) { cl_buffer_mgr bufmgr = NULL; @@ -251,6 +253,7 @@ cl_mem_allocate(enum cl_mem_type type, mem->ref_n = 1; mem->magic = CL_MAGIC_MEM_HEADER; mem->flags = flags; + mem->is_userptr = 0; if (sz != 0) { /* Pinning will require stricter alignment rules */ @@ -260,7 +263,28 @@ cl_mem_allocate(enum cl_mem_type type, /* Allocate space in memory */ bufmgr = cl_context_get_bufmgr(ctx); assert(bufmgr); + +#ifdef HAS_USERPTR + if (ctx->device->host_unified_memory) { + /* currently only cl buf is supported, will add cl image support later */ + if ((flags & CL_MEM_USE_HOST_PTR) && host_ptr != NULL) { + /* userptr not support tiling */ + if (!is_tiled) { + int page_size = getpagesize(); + if ((((unsigned long)host_ptr | sz) & (page_size - 1)) == 0) { + mem->is_userptr = 1; + mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", host_ptr, sz, 0); + } + } + } + } + + if (!mem->is_userptr) + mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); +#else mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment); +#endif + if (UNLIKELY(mem->bo == NULL)) { err = CL_MEM_OBJECT_ALLOCATION_FAILURE; goto error; @@ -387,12 +411,15 @@ cl_mem_new_buffer(cl_context ctx, sz = ALIGN(sz, 4); /* Create the buffer in video memory */ - mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err); + mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, &err); if (mem == NULL || err 
!= CL_SUCCESS) goto error; /* Copy the data if required */ - if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR) + if (flags & CL_MEM_COPY_HOST_PTR) + cl_buffer_subdata(mem->bo, 0, sz, data); + + if ((flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) cl_buffer_subdata(mem->bo, 0, sz, data); if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR) @@ -762,7 +789,7 @@ _cl_mem_new_image(cl_context ctx, sz = aligned_pitch * aligned_h * depth; } - mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err); + mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err); if (mem == NULL || err != CL_SUCCESS) goto error; @@ -1834,7 +1861,7 @@ LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx, cl_int err = CL_SUCCESS; cl_mem mem = NULL; - mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err); + mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, &err); if (mem == NULL || err != CL_SUCCESS) goto error; @@ -1875,7 +1902,7 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx, goto error; } - mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err); + mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, &err); if (mem == NULL || err != CL_SUCCESS) { err = CL_OUT_OF_HOST_MEMORY; goto error; diff --git a/src/cl_mem.h b/src/cl_mem.h index 95c5f056..2e9dd5ad 100644 --- a/src/cl_mem.h +++ b/src/cl_mem.h @@ -92,6 +92,7 @@ typedef struct _cl_mem { int map_ref; /* The mapped count. */ uint8_t mapped_gtt; /* This object has mapped gtt, for unmap. */ cl_mem_dstr_cb *dstr_cb; /* The destroy callback. 
*/ + uint8_t is_userptr; /* CL_MEM_USE_HOST_PTR is enabled*/ } _cl_mem; struct _cl_mem_image { @@ -262,6 +263,7 @@ cl_mem_allocate(enum cl_mem_type type, cl_mem_flags flags, size_t sz, cl_int is_tiled, + void *host_ptr, cl_int *errcode); void diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c index 28d2ac65..36409089 100644 --- a/src/cl_mem_gl.c +++ b/src/cl_mem_gl.c @@ -63,7 +63,7 @@ cl_mem_new_gl_texture(cl_context ctx, goto error; } - mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err); + mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, NULL, &err); if (mem == NULL || err != CL_SUCCESS) goto error; diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c index bb97220c..fc037cc8 100644 --- a/src/intel/intel_driver.c +++ b/src/intel/intel_driver.c @@ -690,6 +690,20 @@ cl_buffer intel_share_image_from_libva(cl_context ctx, return (cl_buffer)intel_bo; } +static cl_buffer intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char* name, void *data,size_t size, unsigned long flags) +{ +#ifdef HAS_USERPTR + drm_intel_bo *bo; + bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags); + /* Fallback to unsynchronized userptr allocation if kernel has no MMU notifier enabled. 
*/ + if (bo == NULL) + bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED); + return (cl_buffer)bo; +#else + return NULL; +#endif +} + static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling) { switch (tiling) { @@ -734,6 +748,7 @@ intel_setup_callbacks(void) cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr; cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id; cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc; + cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr; cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling; #if defined(HAS_EGL) cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture; |