From c481bec356b2e40e66a000dbaaf261bf7aae930d Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Mon, 12 Dec 2011 14:10:57 +0000
Subject: sna: Experiment with creating the CPU pixmap using an LLC BO

A poor cousin to vmap is to instead allocate snooped bo and use a CPU
mapping for zero-copy uploads into GPU resident memory. For maximum
performance, we still need tiled GPU buffers so CPU bo are only useful
in situations where we are frequently migrating data.

Signed-off-by: Chris Wilson
---
 src/sna/kgem.c      | 162 +++++++++++++++++++++++++++++++++++++---------------
 src/sna/kgem.h      |   2 +
 src/sna/sna_accel.c |  89 +++++++++++++++++++++--------
 3 files changed, 183 insertions(+), 70 deletions(-)

diff --git a/src/sna/kgem.c b/src/sna/kgem.c
index fac9c0ef..6a17bfe5 100644
--- a/src/sna/kgem.c
+++ b/src/sna/kgem.c
@@ -86,7 +86,11 @@ static inline void list_replace(struct list *old,
 #endif
 
 #define PAGE_SIZE 4096
-#define MAX_VMA_CACHE 128
+#define MAX_VMA_CACHE 256
+
+#define IS_CPU_MAP(ptr) ((uintptr_t)(ptr) & 1)
+#define CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) & ~1))
+#define MAKE_CPU_MAP(ptr) ((void*)((uintptr_t)(ptr) | 1))
 
 struct kgem_partial_bo {
 	struct kgem_bo base;
@@ -618,9 +622,10 @@ static void kgem_bo_free(struct kgem *kgem, struct kgem_bo *bo)
 	}
 
 	if (bo->map) {
-		DBG(("%s: releasing vma for handle=%d, count=%d\n",
-		     __FUNCTION__, bo->handle, kgem->vma_count-1));
-		munmap(bo->map, bo->size);
+		DBG(("%s: releasing %s vma for handle=%d, count=%d\n",
+		     __FUNCTION__, IS_CPU_MAP(bo->map) ? "CPU" : "GTT",
+		     bo->handle, kgem->vma_count-1));
+		munmap(CPU_MAP(bo->map), bo->size);
 		list_del(&bo->vma);
 		kgem->vma_count--;
 	}
@@ -657,34 +662,39 @@ static void __kgem_bo_destroy(struct kgem *kgem, struct kgem_bo *bo)
 		}
 	}
 
-	if (!bo->reusable)
+	if (!bo->reusable) {
+		DBG(("%s: handle=%d, not reusable\n",
+		     __FUNCTION__, bo->handle));
 		goto destroy;
-
-	if (!bo->rq && !bo->needs_flush) {
-		assert(!bo->purged);
-
-		DBG(("%s: handle=%d, purged\n", __FUNCTION__, bo->handle));
-
-		if (!gem_madvise(kgem->fd, bo->handle, I915_MADV_DONTNEED)) {
-			kgem->need_purge |= bo->gpu;
-			goto destroy;
-		}
-
-		bo->purged = true;
 	}
 
 	kgem->need_expire = true;
 	if (bo->rq) {
 		DBG(("%s: handle=%d -> active\n", __FUNCTION__, bo->handle));
 		list_move(&bo->list, active(kgem, bo->size));
-	} else if (bo->purged) {
-		DBG(("%s: handle=%d -> inactive\n", __FUNCTION__, bo->handle));
-		list_move(&bo->list, inactive(kgem, bo->size));
-	} else {
+	} else if (bo->needs_flush) {
 		DBG(("%s: handle=%d -> flushing\n", __FUNCTION__, bo->handle));
 		assert(list_is_empty(&bo->request));
 		list_add(&bo->request, &kgem->flushing);
 		list_move(&bo->list, active(kgem, bo->size));
+	} else {
+		if (!IS_CPU_MAP(bo->map)) {
+			assert(!bo->purged);
+
+			DBG(("%s: handle=%d, purged\n",
+			     __FUNCTION__, bo->handle));
+
+			if (!gem_madvise(kgem->fd, bo->handle,
+					 I915_MADV_DONTNEED)) {
+				kgem->need_purge |= bo->gpu;
+				goto destroy;
+			}
+
+			bo->purged = true;
+		}
+
+		DBG(("%s: handle=%d -> inactive\n", __FUNCTION__, bo->handle));
+		list_move(&bo->list, inactive(kgem, bo->size));
 	}
 
 	return;
@@ -1188,7 +1198,6 @@ bool kgem_expire_cache(struct kgem *kgem)
 	for (i = 0; i < ARRAY_SIZE(kgem->inactive); i++) {
 		idle &= list_is_empty(&kgem->inactive[i]);
 		list_for_each_entry(bo, &kgem->inactive[i], list) {
-			assert(bo->purged);
 			if (bo->delta) {
 				expire = now - MAX_INACTIVE_TIME;
 				break;
@@ -1213,8 +1222,9 @@ bool kgem_expire_cache(struct kgem *kgem)
 			bo = list_last_entry(&kgem->inactive[i],
 					     struct kgem_bo, list);
 
-			if (gem_madvise(kgem->fd, bo->handle,
-					I915_MADV_DONTNEED) &&
+			if ((!bo->purged ||
+			     gem_madvise(kgem->fd, bo->handle,
+					 I915_MADV_DONTNEED)) &&
 			    bo->delta > expire) {
 				idle = false;
 				break;
@@ -1844,32 +1854,47 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 	return delta;
 }
 
+static void kgem_trim_vma_cache(struct kgem *kgem)
+{
+	/* vma are limited on a per-process basis to around 64k.
+	 * This includes all malloc arenas as well as other file
+	 * mappings. In order to be fair and not hog the cache,
+	 * and more importantly not to exhaust that limit and to
+	 * start failing mappings, we keep our own number of open
+	 * vma to within a conservative value.
+	 */
+	while (kgem->vma_count > MAX_VMA_CACHE) {
+		struct kgem_bo *old;
+
+		old = list_first_entry(&kgem->vma_cache,
+				       struct kgem_bo,
+				       vma);
+		DBG(("%s: discarding %s vma cache for %d\n",
+		     __FUNCTION__, IS_CPU_MAP(old->map) ? "CPU" : "GTT",
+		     old->handle));
+		munmap(CPU_MAP(old->map), old->size);
+		old->map = NULL;
+		list_del(&old->vma);
+		kgem->vma_count--;
+	}
+}
+
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 {
 	void *ptr;
 
+	if (IS_CPU_MAP(bo->map)) {
+		DBG(("%s: discarding CPU vma cache for %d\n",
+		     __FUNCTION__, bo->handle));
+		munmap(CPU_MAP(bo->map), bo->size);
+		bo->map = NULL;
+		list_del(&bo->vma);
+		kgem->vma_count--;
+	}
+
 	ptr = bo->map;
 	if (ptr == NULL) {
-		/* vma are limited on a per-process basis to around 64k.
-		 * This includes all malloc arenas as well as other file
-		 * mappings. In order to be fair and not hog the cache,
-		 * and more importantly not to exhaust that limit and to
-		 * start failing mappings, we keep our own number of open
-		 * vma to within a conservative value.
-		 */
-		while (kgem->vma_count > MAX_VMA_CACHE) {
-			struct kgem_bo *old;
-
-			old = list_first_entry(&kgem->vma_cache,
-					       struct kgem_bo,
-					       vma);
-			DBG(("%s: discarding vma cache for %d\n",
-			     __FUNCTION__, old->handle));
-			munmap(old->map, old->size);
-			old->map = NULL;
-			list_del(&old->vma);
-			kgem->vma_count--;
-		}
+		kgem_trim_vma_cache(kgem);
 
 		ptr = gem_mmap(kgem->fd, bo->handle, bo->size,
 			       PROT_READ | PROT_WRITE);
@@ -1907,6 +1932,53 @@ void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot)
 	return ptr;
 }
 
+void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo)
+{
+	struct drm_i915_gem_mmap mmap_arg;
+
+	DBG(("%s(handle=%d, size=%d)\n", __FUNCTION__, bo->handle, bo->size));
+
+	if (IS_CPU_MAP(bo->map)) {
+		void *ptr = CPU_MAP(bo->map);
+		list_del(&bo->vma);
+		kgem->vma_count--;
+		bo->map = NULL;
+		return ptr;
+	}
+
+	if (bo->map) {
+		DBG(("%s: discarding GTT vma cache for %d\n",
+		     __FUNCTION__, bo->handle));
+		munmap(CPU_MAP(bo->map), bo->size);
+		bo->map = NULL;
+		list_del(&bo->vma);
+		kgem->vma_count--;
+	}
+
+	kgem_trim_vma_cache(kgem);
+
+	VG_CLEAR(mmap_arg);
+	mmap_arg.handle = bo->handle;
+	mmap_arg.offset = 0;
+	mmap_arg.size = bo->size;
+	if (drmIoctl(kgem->fd, DRM_IOCTL_I915_GEM_MMAP, &mmap_arg)) {
+		assert(0);
+		return NULL;
+	}
+
+	VG(VALGRIND_MAKE_MEM_DEFINED(mmap_arg.addr_ptr, bo->size));
+	return (void *)(uintptr_t)mmap_arg.addr_ptr;
+}
+
+void kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr)
+{
+	assert(bo->map == NULL);
+
+	bo->map = MAKE_CPU_MAP(ptr);
+	list_move(&bo->vma, &kgem->vma_cache);
+	kgem->vma_count++;
+}
+
 void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo)
 {
 	if (bo->map == NULL)
@@ -1915,7 +1987,7 @@ void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo)
 
 	DBG(("%s: (debug) releasing vma for handle=%d, count=%d\n",
 	     __FUNCTION__, bo->handle, kgem->vma_count-1));
-	munmap(bo->map, bo->size);
+	munmap(CPU_MAP(bo->map), bo->size);
 	bo->map = NULL;
 
 	list_del(&bo->vma);
diff --git a/src/sna/kgem.h b/src/sna/kgem.h
index 0d85f643..2fd5a551 100644
--- a/src/sna/kgem.h
+++ b/src/sna/kgem.h
@@ -319,6 +319,8 @@ uint32_t kgem_add_reloc(struct kgem *kgem,
 
 void *kgem_bo_map(struct kgem *kgem, struct kgem_bo *bo, int prot);
 void kgem_bo_unmap(struct kgem *kgem, struct kgem_bo *bo);
+void *kgem_bo_map__cpu(struct kgem *kgem, struct kgem_bo *bo);
+void kgem_bo_unmap__cpu(struct kgem *kgem, struct kgem_bo *bo, void *ptr);
 
 uint32_t kgem_bo_flink(struct kgem *kgem, struct kgem_bo *bo);
 Bool kgem_bo_write(struct kgem *kgem, struct kgem_bo *bo,
diff --git a/src/sna/sna_accel.c b/src/sna/sna_accel.c
index c39b45ea..dc0fad50 100644
--- a/src/sna/sna_accel.c
+++ b/src/sna/sna_accel.c
@@ -61,6 +61,7 @@
 #define USE_SPANS 0
 #define USE_ZERO_SPANS 1
 #define USE_BO_FOR_SCRATCH_PIXMAP 1
+#define USE_LLC_CPU_BO 1
 
 static int sna_font_key;
 
@@ -177,6 +178,54 @@ static void sna_pixmap_destroy_gpu_bo(struct sna *sna, struct sna_pixmap *priv)
 	priv->source_count = SOURCE_BIAS;
 }
 
+static void sna_pixmap_alloc_cpu(struct sna *sna,
+				 PixmapPtr pixmap,
+				 struct sna_pixmap *priv)
+{
+	if (USE_LLC_CPU_BO && sna->kgem.gen >= 60) {
+		DBG(("%s: allocating CPU buffer (%dx%d)\n", __FUNCTION__,
+		     pixmap->drawable.width, pixmap->drawable.height));
+
+		priv->cpu_bo = kgem_create_2d(&sna->kgem,
+					      pixmap->drawable.width,
+					      pixmap->drawable.height,
+					      pixmap->drawable.bitsPerPixel,
+					      I915_TILING_NONE,
+					      CREATE_INACTIVE);
+		DBG(("%s: allocated CPU handle=%d\n", __FUNCTION__,
+		     priv->cpu_bo->handle));
+
+		if (priv->cpu_bo) {
+			priv->ptr = kgem_bo_map__cpu(&sna->kgem, priv->cpu_bo);
+			if (priv->ptr == NULL) {
+				kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
+				priv->cpu_bo = NULL;
+			}
+		}
+	}
+
+	if (priv->ptr == NULL)
+		priv->ptr = malloc(pixmap->devKind * pixmap->drawable.height);
+
+	assert(priv->ptr);
+	pixmap->devPrivate.ptr = priv->ptr;
+}
+
+static void sna_pixmap_free_cpu(struct sna *sna, struct sna_pixmap *priv)
+{
+	DBG(("%s: discarding CPU buffer, handle=%d, size=%d\n",
+	     __FUNCTION__, priv->cpu_bo->handle, priv->cpu_bo->size));
+
+	if (priv->cpu_bo) {
+		kgem_bo_unmap__cpu(&sna->kgem, priv->cpu_bo, priv->ptr);
+		kgem_bo_destroy(&sna->kgem, priv->cpu_bo);
+
+		priv->cpu_bo = NULL;
+	} else
+		free(priv->ptr);
+	priv->pixmap->devPrivate.ptr = priv->ptr = NULL;
+}
+
 static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
 {
 	struct sna *sna = to_sna_from_pixmap(pixmap);
@@ -191,6 +240,9 @@ static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
 	if (priv->gpu_bo)
 		kgem_bo_destroy(&sna->kgem, priv->gpu_bo);
 
+	if (priv->ptr)
+		sna_pixmap_free_cpu(sna, priv);
+
 	if (priv->cpu_bo) {
 		if (kgem_bo_is_busy(priv->cpu_bo)) {
 			list_add_tail(&priv->list, &sna->deferred_free);
@@ -208,7 +260,6 @@ static Bool sna_destroy_private(PixmapPtr pixmap, struct sna_pixmap *priv)
 		return false;
 	}
 
-	free(priv->ptr);
 	free(priv);
 	return true;
 }
@@ -531,12 +582,10 @@ sna_pixmap_move_to_cpu(PixmapPtr pixmap, bool write)
 	     __FUNCTION__, priv->gpu_bo, priv->gpu_damage, priv->gpu_only));
 
 	if (pixmap->devPrivate.ptr == NULL) {
-		DBG(("%s: allocating CPU buffer\n", __FUNCTION__));
 		assert(priv->ptr == NULL);
 		assert(pixmap->devKind);
 		assert(priv->cpu_damage == NULL);
-		priv->ptr = malloc(pixmap->devKind * pixmap->drawable.height);
-		pixmap->devPrivate.ptr = priv->ptr;
+		sna_pixmap_alloc_cpu(sna, pixmap, priv);
 	}
 
 	if (priv->gpu_bo == NULL) {
@@ -644,12 +693,10 @@ sna_drawable_move_region_to_cpu(DrawablePtr drawable,
 	}
 
 	if (pixmap->devPrivate.ptr == NULL) {
-		DBG(("%s: allocating CPU buffer\n", __FUNCTION__));
 		assert(priv->ptr == NULL);
 		assert(pixmap->devKind);
 		assert(priv->cpu_damage == NULL);
-		priv->ptr = malloc(pixmap->devKind * pixmap->drawable.height);
-		pixmap->devPrivate.ptr = priv->ptr;
+		sna_pixmap_alloc_cpu(sna, pixmap, priv);
 	}
 
 	if (priv->gpu_bo == NULL)
@@ -1397,13 +1444,6 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (!priv)
 		return false;
 
-	if (pixmap->devPrivate.ptr == NULL) {
-		if (priv->gpu_bo == NULL)
-			return false;
-		return sna_put_image_upload_blt(drawable, gc, region,
-						x, y, w, h, bits, stride);
-	}
-
 	if (gc->alu != GXcopy)
 		return false;
 
@@ -1432,6 +1472,9 @@ sna_put_zpixmap_blt(DrawablePtr drawable, GCPtr gc, RegionPtr region,
 	if (priv->cpu_bo)
 		kgem_bo_sync(&sna->kgem, priv->cpu_bo, true);
 
+	if (pixmap->devPrivate.ptr == NULL)
+		sna_pixmap_alloc_cpu(sna, pixmap, priv);
+
 	if (region_subsumes_drawable(region, &pixmap->drawable)) {
 		DBG(("%s: replacing entire pixmap\n", __FUNCTION__));
 		sna_damage_all(&priv->cpu_damage,
@@ -2216,11 +2259,8 @@ fallback:
 				      &sna->dirty_pixmaps);
 		}
 
-		if (dst_pixmap->devPrivate.ptr == NULL) {
-			DBG(("%s: allocating CPU buffer\n", __FUNCTION__));
-			dst_priv->ptr = malloc(dst_pixmap->devKind * dst_pixmap->drawable.height);
-			dst_pixmap->devPrivate.ptr = dst_priv->ptr;
-		}
+		if (dst_pixmap->devPrivate.ptr == NULL)
+			sna_pixmap_alloc_cpu(sna, dst_pixmap, dst_priv);
 	} else
 		sna_drawable_move_region_to_cpu(&dst_pixmap->drawable,
 						&region, true);
@@ -8610,12 +8650,11 @@ static void sna_accel_inactive(struct sna *sna)
 
 	list_init(&preserve);
 	list_for_each_entry_safe(priv, next, &sna->active_pixmaps, inactive) {
-		if (priv->ptr && sna_damage_is_all(&priv->gpu_damage,
-						   priv->pixmap->drawable.width,
-						   priv->pixmap->drawable.height)) {
-			DBG(("%s: discarding CPU buffer\n", __FUNCTION__));
-			free(priv->ptr);
-			priv->pixmap->devPrivate.ptr = priv->ptr = NULL;
+		if (priv->ptr &&
+		    sna_damage_is_all(&priv->gpu_damage,
+				      priv->pixmap->drawable.width,
+				      priv->pixmap->drawable.height)) {
+			sna_pixmap_free_cpu(sna, priv);
 			list_move(&priv->inactive, &preserve);
 		}
 	}
-- 
cgit v1.2.3
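
For readers unfamiliar with the kernel interface the new kgem_bo_map__cpu() wraps, the
following is a minimal, self-contained sketch (not part of the commit above) of obtaining a
CPU-domain mapping of a GEM buffer via DRM_IOCTL_I915_GEM_MMAP; the fd, handle and size
arguments are assumed to come from the caller's own bo bookkeeping and are illustrative only.

/*
 * Illustrative sketch: map a GEM buffer object through the i915 mmap
 * ioctl (a CPU, non-GTT mapping), roughly what kgem_bo_map__cpu() does
 * in the patch.  fd, handle and size are assumed to exist already.
 */
#include <stdint.h>
#include <string.h>
#include <xf86drm.h>	/* drmIoctl(), from libdrm */
#include <i915_drm.h>	/* struct drm_i915_gem_mmap, DRM_IOCTL_I915_GEM_MMAP */

static void *bo_map_cpu(int fd, uint32_t handle, uint64_t size)
{
	struct drm_i915_gem_mmap arg;

	memset(&arg, 0, sizeof(arg));
	arg.handle = handle;
	arg.offset = 0;
	arg.size = size;
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_MMAP, &arg))
		return NULL;	/* mapping failed */

	/* On LLC parts (gen6+, matching the USE_LLC_CPU_BO check in the
	 * patch) this mapping is cached and coherent with the GPU, so
	 * pixel data written here can be consumed without an extra copy
	 * through the GTT. */
	return (void *)(uintptr_t)arg.addr_ptr;
}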